Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"height_in_feet": [6.5, 7, 6.1, 5.1, 4],
"sex": ["M", "M", "F", "F", "F"]
})
)
# Initialise the different ways of calling a test:
schema_pass_ttest_on_alpha_val_1 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis.two_sample_ttest(
sample1="M",
sample2="F",
groupby="sex",
relationship="greater_than",
alpha=0.5),
]),
"sex": Column(String)
})
schema_pass_ttest_on_alpha_val_2 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis(test=stats.ttest_ind,
samples=["M", "F"],
groupby="sex",
relationship="greater_than",
relationship_kwargs={"alpha": 0.5}
),
]),
"sex": Column(String)
})
schema_pass_ttest_on_alpha_val_3 = DataFrameSchema({
"height_in_feet": Column(Float, [
relationship="greater_than",
relationship_kwargs={"alpha": 0.05}),
]),
"sex": Column(String)
})
schema_fail_ttest_on_alpha_val_3 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis.two_sample_ttest(
sample1="M",
sample2="F",
groupby="sex",
relationship="greater_than",
alpha=0.05),
]),
"sex": Column(String)
})
with pytest.raises(errors.SchemaError):
schema_fail_ttest_on_alpha_val_1.validate(df)
with pytest.raises(errors.SchemaError):
schema_fail_ttest_on_alpha_val_2.validate(df)
with pytest.raises(errors.SchemaError):
schema_fail_ttest_on_alpha_val_3.validate(df)
def test_check_groupby():
    """Check that groupby-based column checks accept a conforming frame.

    The ``groupby`` argument is exercised in three spellings: a column
    name, a list of column names, and a callable that groups the
    dataframe itself. Every check function indexes its argument by a
    group key (``"foo"`` or ``"bar"``).
    """
    # Rows tagged "foo" carry col1 values above 10 and rows tagged "bar"
    # carry values below 10, which is what the four checks assert.
    df_pass = pd.DataFrame(
        data={
            "col1": [7, 8, 9, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )

    col1_checks = [
        Check(lambda grouped: grouped["foo"] > 10, groupby="col2"),
        Check(lambda grouped: grouped["bar"] < 10, groupby=["col2"]),
        Check(
            lambda grouped: grouped["foo"] > 10,
            groupby=lambda frame: frame.groupby("col2"),
        ),
        Check(
            lambda grouped: grouped["bar"] < 10,
            groupby=lambda frame: frame.groupby("col2"),
        ),
    ]
    col2_check = Check(lambda values: values.isin(["foo", "bar"]))

    schema = DataFrameSchema(
        columns={
            "col1": Column(Int, col1_checks),
            "col2": Column(String, col2_check),
        },
        index=Index(Int, name="data_id"),
    )

    validated = schema.validate(df_pass)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 2
    assert set(validated.columns) == {"col1", "col2"}
def test_dataframe_schema():
schema = DataFrameSchema(
{
"a": Column(Int,
Check(lambda x: x > 0, element_wise=True)),
"b": Column(Float,
Check(lambda x: 0 <= x <= 10, element_wise=True)),
"c": Column(String,
Check(lambda x: set(x) == {"x", "y", "z"})),
"d": Column(Bool,
Check(lambda x: x.mean() > 0.5)),
"e": Column(Category,
Check(lambda x: set(x) == {"c1", "c2", "c3"})),
"f": Column(Object,
Check(lambda x: x.isin([(1,), (2,), (3,)]))),
"g": Column(DateTime,
Check(lambda x: x >= pd.Timestamp("2015-01-01"),
element_wise=True)),
"i": Column(Timedelta,
Check(lambda x: x < pd.Timedelta(10, unit="D"),
element_wise=True))
})
df = pd.DataFrame({
"a": [1, 2, 3],
def test_coerce_dtype_nullable_str():
    """Test null handling for a coerced string column.

    Two frames are used whose last two values are null, spelled once as
    ``np.nan`` and once as ``None``. A schema with ``nullable=False`` must
    raise ``errors.SchemaError`` for *each* frame, and a schema with
    ``nullable=True`` must return a ``pd.DataFrame`` for each frame.
    """
    df_nans = pd.DataFrame({
        "col": ["foobar", "foo", "bar", "baz", np.nan, np.nan],
    })
    df_nones = pd.DataFrame({
        "col": ["foobar", "foo", "bar", "baz", None, None],
    })
    null_frames = [df_nans, df_nones]

    non_nullable_schema = DataFrameSchema({
        "col": Column(String, coerce=True, nullable=False)
    })
    for df in null_frames:
        # Enter pytest.raises once per frame. A single context wrapped
        # around the whole loop exits on the first SchemaError, so the
        # second frame would never be validated.
        with pytest.raises(errors.SchemaError):
            non_nullable_schema.validate(df)

    nullable_schema = DataFrameSchema({
        "col": Column(String, coerce=True, nullable=True)
    })
    for df in null_frames:
        assert isinstance(nullable_schema.validate(df), pd.DataFrame)
def test_dataframe_schema():
schema = DataFrameSchema(
{
"a": Column(Int,
Check(lambda x: x > 0, element_wise=True)),
"b": Column(Float,
Check(lambda x: 0 <= x <= 10, element_wise=True)),
"c": Column(String,
Check(lambda x: set(x) == {"x", "y", "z"})),
"d": Column(Bool,
Check(lambda x: x.mean() > 0.5)),
"e": Column(Category,
Check(lambda x: set(x) == {"c1", "c2", "c3"})),
"f": Column(Object,
Check(lambda x: x.isin([(1,), (2,), (3,)]))),
"g": Column(DateTime,
Check(lambda x: x >= pd.Timestamp("2015-01-01"),
element_wise=True)),
"i": Column(Timedelta,
Check(lambda x: x < pd.Timedelta(10, unit="D"),
element_wise=True))
})
df = pd.DataFrame(
{
def test_check_function_decorators():
in_schema = DataFrameSchema(
{
"a": Column(Int, [
Check(lambda x: x >= 1, element_wise=True),
Check(lambda s: s.mean() > 0)]),
"b": Column(String,
Check(lambda x: x in ["x", "y", "z"],
element_wise=True)),
"c": Column(DateTime,
Check(lambda x: pd.Timestamp("2018-01-01") <= x,
element_wise=True)),
"d": Column(Float,
Check(lambda x: np.isnan(x) or x < 3,
element_wise=True),
nullable=True)
},
transformer=lambda df: df.assign(e="foo")
)
out_schema = DataFrameSchema(
{
"e": Column(String,
Check(lambda s: s == "foo")),
# The check function looks up the "baz" key while only the "foo" group is
# requested; validation is expected to raise a KeyError naming 'baz'.
schema_fail_nonexistent_key_in_fn = DataFrameSchema({
    "col1": Column(
        Int,
        [Check(lambda grp: grp["baz"] > 10, groupby="col2", groups=["foo"])],
    ),
    "col2": Column(String, Check(lambda col: col.isin(["foo", "bar"]))),
})
with pytest.raises(KeyError, match="^'baz'"):
    schema_fail_nonexistent_key_in_fn.validate(df)

# Here the ``groups`` argument itself names "baz", while the check
# function looks up "foo"; validation is expected to raise a KeyError.
schema_fail_nonexistent_key_in_groups = DataFrameSchema({
    "col1": Column(
        Int,
        [Check(lambda grp: grp["foo"] > 10, groupby="col2", groups=["baz"])],
    ),
    "col2": Column(String, Check(lambda col: col.isin(["foo", "bar"]))),
})
with pytest.raises(KeyError):
    schema_fail_nonexistent_key_in_groups.validate(df)
# Frame with three "bar" rows followed by three "foo" rows.
df = pd.DataFrame(
    data={
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    }
)

# ``schema`` is defined outside this excerpt; validating the frame against
# it must return a two-column dataframe.
validated_df = schema.validate(df)
assert isinstance(validated_df, pd.DataFrame)
assert len(validated_df.columns) == 2
assert set(validated_df.columns) == {"col1", "col2"}

# ``groups`` is restricted to "foo" but the check function looks up
# "bar"; validation is expected to raise a KeyError naming 'bar'.
schema_fail_key_error = DataFrameSchema({
    "col1": Column(
        Int,
        [Check(lambda grp: grp["bar"] > 10, groupby="col2", groups="foo")],
    ),
    "col2": Column(String, Check(lambda col: col.isin(["foo", "bar"]))),
})
with pytest.raises(KeyError, match="^'bar'"):
    schema_fail_key_error.validate(df)

# The check function looks up "baz" while only the "foo" group is
# requested; validation is expected to raise a KeyError naming 'baz'.
schema_fail_nonexistent_key_in_fn = DataFrameSchema({
    "col1": Column(
        Int,
        [Check(lambda grp: grp["baz"] > 10, groupby="col2", groups=["foo"])],
    ),
    "col2": Column(String, Check(lambda col: col.isin(["foo", "bar"]))),
})
with pytest.raises(KeyError, match="^'baz'"):
    schema_fail_nonexistent_key_in_fn.validate(df)
# raise KeyError when the group does not exist in the groups argument.
def test_check_input_method_decorators():
in_schema = DataFrameSchema({"column1": Column(String)})
out_schema = DataFrameSchema({"column2": Column(Int)})
dataframe = pd.DataFrame({"column1": ["a", "b", "c"]})
def _transform_helper(df):
return df.assign(column2=[1, 2, 3])
class TransformerClass(object):
"""A repeatable set of decorator input styles for testing"""
@check_input(in_schema)
@check_output(out_schema)
def transform_first_arg(self, df):
return _transform_helper(df)
@check_input(in_schema, 0)
@check_output(out_schema)