Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
relationship=lambda stat, pvalue, alpha=0.01: (
stat > 0 and pvalue / 2 < alpha
),
relationship_kwargs={"alpha": 0.5}
)
]),
"sex": Column(String),
})
# Check that all 4 happy paths are successful:
schema_pass_ttest_on_alpha_val_1.validate(df)
schema_pass_ttest_on_alpha_val_2.validate(df)
schema_pass_ttest_on_alpha_val_3.validate(df)
schema_pass_ttest_on_custom_relationship.validate(df)
# Schema that, per its name, is expected to fail validation: a two-sample
# t-test on "height_in_feet", grouped by "sex", comparing sample "M" against
# sample "F" with the "greater_than" relationship at alpha=0.05.
# NOTE(review): the dataframe it is validated against is outside this view.
schema_fail_ttest_on_alpha_val_1 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis.two_sample_ttest(
sample1="M",
sample2="F",
groupby="sex",
relationship="greater_than",
alpha=0.05),
]),
"sex": Column(String)
})
schema_fail_ttest_on_alpha_val_2 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis(test=stats.ttest_ind,
samples=["M", "F"],
groupby="sex",
def test_index_schema():
    """Validate a dataframe's index against an ``Index`` schema component.

    A frame indexed 1..10 must validate and come back as a DataFrame; a
    frame indexed 1..19 must raise ``errors.SchemaError``.
    """
    index_checks = [
        # Element-wise bound on every index label.
        Check(lambda x: 1 <= x <= 11, element_wise=True),
        # Aggregate check applied to the index as a whole.
        Check(lambda index: index.mean() > 1),
    ]
    schema = DataFrameSchema(columns={}, index=Index(Int, index_checks))

    valid_frame = pd.DataFrame(index=range(1, 11), dtype="int64")
    validated = schema.validate(valid_frame)
    assert isinstance(validated, pd.DataFrame)

    out_of_bounds_frame = pd.DataFrame(index=range(1, 20))
    with pytest.raises(errors.SchemaError):
        schema.validate(out_of_bounds_frame)
def test_head_dataframe_schema():
    """
    Check that validating only the head of a dataframe succeeds and returns
    the entire dataframe, while validating the full frame raises an error.
    """
    leading_values = list(range(100))
    trailing_values = list(range(-1, -1001, -1))
    df = pd.DataFrame({"col1": leading_values + trailing_values})

    col1 = Column(Int, Check(lambda s: s >= 0))
    schema = DataFrameSchema(columns={"col1": col1})

    # Validating with a head of 100 should pass and hand back a frame equal
    # to the full input.
    head_validated = schema.validate(df, head=100)
    assert head_validated.equals(df)

    # Validating without `head` is expected to raise.
    with pytest.raises(errors.SchemaError):
        schema.validate(df)
# Expect a KeyError whose message starts with 'bar' (per the `match` pattern).
# NOTE(review): `schema_fail_key_error` and `df` are defined outside this view.
with pytest.raises(KeyError, match="^'bar'"):
schema_fail_key_error.validate(df)
# Raise KeyError when the Check function indexes a group key ("baz") that
# does not exist in the groupby column. Only "foo" is requested via `groups`,
# and the lookup s["baz"] is expected to fail with a message starting 'baz'.
schema_fail_nonexistent_key_in_fn = DataFrameSchema({
"col1": Column(Int, [
Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
]),
"col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
})
with pytest.raises(KeyError, match="^'baz'"):
schema_fail_nonexistent_key_in_fn.validate(df)
# Raise KeyError when a group named in the `groups` argument ("baz") does not
# exist in the groupby column -- presumably col2 of `df` holds only
# "foo"/"bar"; confirm against the dataframe defined outside this view.
schema_fail_nonexistent_key_in_groups = DataFrameSchema({
"col1": Column(Int, [
Check(lambda s: s["foo"] > 10, groupby="col2", groups=["baz"]),
]),
"col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
})
with pytest.raises(KeyError):
schema_fail_nonexistent_key_in_groups.validate(df)
def test_coerce_dtype():
df = pd.DataFrame({
"column1": [10.0, 20.0, 30.0],
"column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
"column3": [1, 2, 3],
"column4": [1., 1., np.nan],
})
# specify `coerce` at the Column level
schema1 = DataFrameSchema({
"column1": Column(Int, Check(lambda x: x > 0), coerce=True),
"column2": Column(DateTime, coerce=True),
"column3": Column(String, coerce=True),
})
# specify `coerce` at the DataFrameSchema level
schema2 = DataFrameSchema({
"column1": Column(Int, Check(lambda x: x > 0)),
"column2": Column(DateTime),
"column3": Column(String),
}, coerce=True)
for schema in [schema1, schema2]:
result = schema.validate(df)
assert result.column1.dtype == Int.value
assert result.column2.dtype == DateTime.value
for _, x in result.column3.iteritems():
def test_check_function_decorators():
in_schema = DataFrameSchema(
{
"a": Column(Int, [
Check(lambda x: x >= 1, element_wise=True),
Check(lambda s: s.mean() > 0)]),
"b": Column(String,
Check(lambda x: x in ["x", "y", "z"],
element_wise=True)),
"c": Column(DateTime,
Check(lambda x: pd.Timestamp("2018-01-01") <= x,
element_wise=True)),
"d": Column(Float,
Check(lambda x: np.isnan(x) or x < 3,
element_wise=True),
nullable=True)
},
transformer=lambda df: df.assign(e="foo")
def test_datetime():
    """Validate a datetime column whose minimum must be later than 2015."""
    min_after_2015 = Check(lambda s: s.min() > pd.Timestamp("2015"))
    schema = DataFrameSchema(
        columns={"col": Column(dtypes.DateTime, checks=min_after_2015)}
    )

    dates = pd.to_datetime(["2019/01/01", "2018/05/21", "2016/03/10"])
    validated_df = schema.validate(pd.DataFrame({"col": dates}))

    # Successful validation hands back a DataFrame.
    assert isinstance(validated_df, pd.DataFrame)
def test_dataframe_hypothesis_checks():
df = pd.DataFrame({
"col1": range(100, 201),
"col2": range(0, 101),
})
hypothesis_check_schema = DataFrameSchema(
columns={
"col1": Column(Int),
"col2": Column(Int),
},
checks=[
# two-sample test
Hypothesis(
test=stats.ttest_ind,
samples=["col1", "col2"],
relationship=lambda stat, pvalue, alpha=0.01: (
stat > 0 and pvalue / 2 < alpha
),
relationship_kwargs={"alpha": 0.5},
),
# one-sample test
Hypothesis(
def test_check_groupby():
schema = DataFrameSchema({
"col1": Column(Int, [
Check(lambda s: s["foo"] > 10, groupby="col2"),
Check(lambda s: s["bar"] < 10, groupby=["col2"]),
Check(lambda s: s["foo"] > 10,
groupby=lambda df: df.groupby("col2")),
Check(lambda s: s["bar"] < 10,
groupby=lambda df: df.groupby("col2"))
]),
"col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
})
df_pass = pd.DataFrame({
"col1": [7, 8, 9, 11, 12, 13],
"col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
})
# Column spec shared by both schemas below: Category dtype, non-nullable,
# with a check that the set of values in the column equals {"A", "B", "C"}.
columns = {
"col": Column(
dtypes.Category,
checks=Check(lambda s: set(s) == {"A", "B", "C"}),
nullable=False
),
}
# With coerce=False, validating an object-dtype series is expected to raise
# SchemaError.
with pytest.raises(SchemaError):
DataFrameSchema(columns=columns, coerce=False).validate(
pd.DataFrame(
{"col": pd.Series(["A", "B", "A", "B", "C"], dtype="object")}
)
)
# With coerce=True, the same object-dtype input is expected to validate and
# come back as a DataFrame.
validated_df = DataFrameSchema(columns=columns, coerce=True).validate(
pd.DataFrame(
{"col": pd.Series(["A", "B", "A", "B", "C"], dtype="object")}
)
)
assert isinstance(validated_df, pd.DataFrame)