Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_dataframe_checks():
    """Validate a frame against dataframe-level (cross-column) checks.

    The schema declares four typed columns plus two checks whose
    functions receive the whole dataframe: ``col1`` must be less than
    ``col2`` and ``col3`` must equal ``col4``. The sample data satisfies
    both comparisons in every row.
    """
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
            "col3": Column(String),
            "col4": Column(String),
        },
        checks=[
            Check(lambda df: df["col1"] < df["col2"]),
            Check(lambda df: df["col3"] == df["col4"]),
        ],
    )
    df = pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": [2.0, 3.0, 4.0],
        "col3": ["foo", "bar", "baz"],
        "col4": ["foo", "bar", "baz"],
    })
    # Actually run the schema against the data; without this call the test
    # builds its fixtures but can never fail.
    assert isinstance(schema.validate(df), pd.DataFrame)
def test_check_function_decorator_transform():
"""Test that transformer argument is in effect in check_input decorator."""
in_schema = DataFrameSchema(
{"column1": Column(Int)},
transformer=lambda df: df.assign(column2="foo"))
out_schema = DataFrameSchema(
{"column1": Column(Int),
"column2": Column(String)})
@check_input(in_schema)
@check_output(out_schema)
def func_input_transform1(df):
return df
result1 = func_input_transform1(pd.DataFrame({"column1": [1, 2, 3]}))
assert "column2" in result1
@check_input(in_schema, 1)
@check_output(out_schema, 1)
def func_input_transform2(_, df):
def test_tail_dataframe_schema():
    """Check validation of ``col1`` with and without ``tail=1000``.

    The column holds 100 non-negative values (0..99) followed by 1000
    negative values (-1..-1000), and the column check requires values
    to be negative.
    """
    leading_non_negative = list(range(100))
    trailing_negative = list(range(-1, -1001, -1))
    df = pd.DataFrame({"col1": leading_non_negative + trailing_negative})

    negative_only = Check(lambda s: s < 0)
    schema = DataFrameSchema(columns={"col1": Column(Int, negative_only)})

    # Validating with tail of 1000 should pass
    validated = schema.validate(df, tail=1000)
    assert validated.equals(df)

    # The whole frame still contains the non-negative head, so validating
    # it without ``tail`` is expected to raise.
    with pytest.raises(errors.SchemaError):
        schema.validate(df)
def test_column():
    """Validate one integer column with an element-wise ``> 0`` check."""
    positive = Check(lambda x: x > 0, element_wise=True)
    schema = DataFrameSchema({"a": Column(Int, positive)})

    data = pd.DataFrame({"a": [1, 2, 3]})
    validated = schema.validate(data)

    assert isinstance(validated, pd.DataFrame)
def init_schema_element_wise():
    """Construct a schema whose ``col1`` check uses both
    ``element_wise=True`` and ``groupby``.

    The schema object is built and discarded; nothing is returned.
    NOTE(review): presumably callers run this to observe an error raised
    while the schema is being constructed -- confirm against the caller.
    """
    grouped_element_wise = Check(
        lambda s: s["foo"] > 10,
        element_wise=True,
        groupby=["col2"],
    )
    col1 = Column(Int, [grouped_element_wise])
    col2 = Column(String, Check(lambda s: s.isin(["foo", "bar"])))
    DataFrameSchema({"col1": col1, "col2": col2})
def test_dataframe_hypothesis_checks():
df = pd.DataFrame({
"col1": range(100, 201),
"col2": range(0, 101),
})
hypothesis_check_schema = DataFrameSchema(
columns={
"col1": Column(Int),
"col2": Column(Int),
},
checks=[
# two-sample test
Hypothesis(
test=stats.ttest_ind,
samples=["col1", "col2"],
relationship=lambda stat, pvalue, alpha=0.01: (
stat > 0 and pvalue / 2 < alpha
),
relationship_kwargs={"alpha": 0.5},
),
# one-sample test
Hypothesis(
test=stats.ttest_1samp,
samples=["col1"],
def test_dataframe_schema():
schema = DataFrameSchema(
{
"a": Column(Int,
Check(lambda x: x > 0, element_wise=True)),
"b": Column(Float,
Check(lambda x: 0 <= x <= 10, element_wise=True)),
"c": Column(String,
Check(lambda x: set(x) == {"x", "y", "z"})),
"d": Column(Bool,
Check(lambda x: x.mean() > 0.5)),
"e": Column(Category,
Check(lambda x: set(x) == {"c1", "c2", "c3"})),
"f": Column(Object,
Check(lambda x: x.isin([(1,), (2,), (3,)]))),
"g": Column(DateTime,
Check(lambda x: x >= pd.Timestamp("2015-01-01"),
element_wise=True)),
"i": Column(Timedelta,
Check(lambda x: x < pd.Timedelta(10, unit="D"),
def test_vectorized_checks():
    """Exercise a non-element-wise check that inspects the whole series.

    The check function receives the series and requires every distinct
    value to occur exactly twice.
    """
    each_value_twice = Check(
        lambda s: s.value_counts() == 2,
        element_wise=False,
    )
    schema = SeriesSchema(Int, each_value_twice)

    paired = pd.Series([1, 1, 2, 2, 3, 3])
    validated_series = schema.validate(paired)
    assert isinstance(validated_series, pd.Series)

    # error case
    unpaired = pd.Series([1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(unpaired)
"col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
})
df = pd.DataFrame({
"col1": [7, 8, 9, 11, 12, 13],
"col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
})
validated_df = schema.validate(df)
assert isinstance(validated_df, pd.DataFrame)
assert len(validated_df.columns) == 2
assert set(validated_df.columns) == {"col1", "col2"}
# raise KeyError when groups does not include a particular group name
schema_fail_key_error = DataFrameSchema({
"col1": Column(Int, [
Check(lambda s: s["bar"] > 10, groupby="col2", groups="foo"),
]),
"col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
})
with pytest.raises(KeyError, match="^'bar'"):
schema_fail_key_error.validate(df)
# raise KeyError when the group does not exist in the groupby column when
# referenced in the Check function
schema_fail_nonexistent_key_in_fn = DataFrameSchema({
"col1": Column(Int, [
Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
]),
"col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
})
with pytest.raises(KeyError, match="^'baz'"):
samples=["col1"],
relationship=lambda stat, pvalue, alpha=0.01: (
stat > 0 and pvalue / 2 < alpha
),
test_kwargs={"popmean": 50},
relationship_kwargs={"alpha": 0.01},
),
]
)
hypothesis_check_schema.validate(df)
# raise error when using groupby
hypothesis_check_schema_groupby = DataFrameSchema(
columns={
"col1": Column(Int),
"col2": Column(Int),
},
checks=[
# two-sample test
Hypothesis(
test=stats.ttest_ind,
samples=["col1", "col2"],
groupby="col3",
relationship=lambda stat, pvalue, alpha=0.01: (
stat > 0 and pvalue / 2 < alpha
),
relationship_kwargs={"alpha": 0.5},
),
]
)
with pytest.raises(errors.SchemaDefinitionError):