Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_hypothesis():
# Example df for tests:
df = (
pd.DataFrame({
"height_in_feet": [6.5, 7, 6.1, 5.1, 4],
"sex": ["M", "M", "F", "F", "F"]
})
)
# Initialise the different ways of calling a test:
schema_pass_ttest_on_alpha_val_1 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis.two_sample_ttest(
sample1="M",
sample2="F",
groupby="sex",
relationship="greater_than",
alpha=0.5),
]),
"sex": Column(String)
})
schema_pass_ttest_on_alpha_val_2 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis(test=stats.ttest_ind,
samples=["M", "F"],
groupby="sex",
relationship="greater_than",
# Initialise the different ways of calling a test:
schema_pass_ttest_on_alpha_val_1 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis.two_sample_ttest(
sample1="M",
sample2="F",
groupby="sex",
relationship="greater_than",
alpha=0.5),
]),
"sex": Column(String)
})
schema_pass_ttest_on_alpha_val_2 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis(test=stats.ttest_ind,
samples=["M", "F"],
groupby="sex",
relationship="greater_than",
relationship_kwargs={"alpha": 0.5}
),
]),
"sex": Column(String)
})
schema_pass_ttest_on_alpha_val_3 = DataFrameSchema({
"height_in_feet": Column(Float, [
Hypothesis.two_sample_ttest(
sample1="M",
sample2="F",
groupby="sex",
errors.SchemaError,
match=r"^Expected .+ to have name"):
named_schema.validate(pd.Series(range(5), name="your_series"))
# when series floats are declared to be integer
with pytest.raises(
errors.SchemaError,
match=r"^after dropping null values, expected values in series"):
SeriesSchema(Int, nullable=True).validate(
pd.Series([1.1, 2.3, 5.5, np.nan]))
# when the series contains null values but the schema is not nullable
with pytest.raises(
errors.SchemaError,
match=r"^non-nullable series .+ contains null values"):
SeriesSchema(Float, nullable=False).validate(
pd.Series([1.1, 2.3, 5.5, np.nan]))
# when the series contains null values although the schema is not nullable,
# and the series also has the wrong data type
with pytest.raises(
errors.SchemaError,
match=(
r"^expected series '.+' to have type .+, got .+ and "
"non-nullable series contains null values")):
SeriesSchema(Int, nullable=False).validate(
pd.Series([1.1, 2.3, 5.5, np.nan]))
def test_dataframe_schema():
    """Construct a ``DataFrameSchema`` with exactly one ``Check`` per column.

    The columns are declared as Int, Float, String, Bool, Category, Object,
    DateTime and Timedelta.  Checks created with ``element_wise=True``
    compare a single value; the remaining checks call ``set(x)``,
    ``x.mean()`` or ``x.isin(...)`` on their argument as a whole.
    """
    columns = {
        "a": Column(Int, Check(lambda x: x > 0, element_wise=True)),
        "b": Column(Float, Check(lambda x: 0 <= x <= 10, element_wise=True)),
        "c": Column(String, Check(lambda x: set(x) == {"x", "y", "z"})),
        "d": Column(Bool, Check(lambda x: x.mean() > 0.5)),
        "e": Column(Category, Check(lambda x: set(x) == {"c1", "c2", "c3"})),
        "f": Column(Object, Check(lambda x: x.isin([(1,), (2,), (3,)]))),
        "g": Column(
            DateTime,
            Check(lambda x: x >= pd.Timestamp("2015-01-01"),
                  element_wise=True),
        ),
        "i": Column(
            Timedelta,
            Check(lambda x: x < pd.Timedelta(10, unit="D"),
                  element_wise=True),
        ),
    }
    schema = DataFrameSchema(columns)
@check_output(DataFrameSchema(columns={"column2": Column(Float)}))
def test_func(df):
    """Return ``df`` unchanged.

    The function is wrapped by ``check_output`` with a schema that declares
    a Float column named ``column2``.
    """
    return df
},
checks=[
Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3"),
Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby="col3"),
Check(lambda g: g["foo"]["col3"].iat[0] == "foo", groupby="col3"),
Check(lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
groupby=["col3", "col4"]),
]
)
assert isinstance(groupby_check_schema.validate(df), pd.DataFrame)
# test element-wise checks
element_wise_check_schema = DataFrameSchema(
columns={
"col1": Column(Int),
"col2": Column(Float),
},
checks=Check(lambda row: row["col1"] < row["col2"], element_wise=True)
)
assert isinstance(element_wise_check_schema.validate(df), pd.DataFrame)
errors.SchemaError,
match=r"^Expected .+ to have name"):
named_schema.validate(pd.Series(range(5), name="your_series"))
# when series floats are declared to be integer
with pytest.raises(
errors.SchemaError,
match=r"^after dropping null values, expected values in series"):
SeriesSchema(Int, nullable=True).validate(
pd.Series([1.1, 2.3, 5.5, np.nan]))
# when the series contains null values but the schema is not nullable
with pytest.raises(
errors.SchemaError,
match=r"^non-nullable series .+ contains null values"):
SeriesSchema(Float, nullable=False).validate(
pd.Series([1.1, 2.3, 5.5, np.nan]))
# when the series contains null values although the schema is not nullable,
# and the series also has the wrong data type
with pytest.raises(
errors.SchemaError,
match=(
r"^expected series '.+' to have type .+, got .+ and "
"non-nullable series contains null values")):
SeriesSchema(Int, nullable=False).validate(
pd.Series([1.1, 2.3, 5.5, np.nan]))
def test_dataframe_checks():
    """Validate a frame against checks attached to the schema itself.

    Besides the per-column type declarations, the schema carries two checks
    that each receive the dataframe: ``col1 < col2`` and ``col3 == col4``.
    The test passes when ``validate`` returns a ``pd.DataFrame``.
    """
    # Fixture satisfying both dataframe-level checks.
    frame = pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": [2.0, 3.0, 4.0],
        "col3": ["foo", "bar", "baz"],
        "col4": ["foo", "bar", "baz"],
    })

    column_specs = {
        "col1": Column(Int),
        "col2": Column(Float),
        "col3": Column(String),
        "col4": Column(String),
    }
    frame_checks = [
        Check(lambda df: df["col1"] < df["col2"]),
        Check(lambda df: df["col3"] == df["col4"]),
    ]
    schema = DataFrameSchema(columns=column_specs, checks=frame_checks)

    validated = schema.validate(frame)
    assert isinstance(validated, pd.DataFrame)
def test_multi_index_columns():
    """Validate a frame whose column labels are 2-tuples.

    Both the schema and the dataframe are keyed by tuples such as
    ``("zero", "foo")``.  Each column carries one check: a numeric range, a
    membership test, or equality with a fixed date.  The test passes when
    ``validate`` returns a ``pd.DataFrame``.
    """
    schema = DataFrameSchema({
        ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
        ("zero", "bar"): Column(
            String, Check(lambda s: s.isin(["a", "b", "c", "d"]))),
        ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
        # pd.Timestamp replaces the ``pd.datetime`` alias, which was
        # deprecated in pandas 1.0 and removed in pandas 2.0; it also matches
        # the other datetime checks in this file.
        ("one", "bar"): Column(
            DateTime, Check(lambda s: s == pd.Timestamp("2019-01-01"))),
    })
    validated_df = schema.validate(
        pd.DataFrame({
            ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
            ("zero", "bar"): ["a", "b", "c", "d"],
            ("one", "foo"): [1, 6, 4, 7],
            ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4),
        })
    )
    assert isinstance(validated_df, pd.DataFrame)
def test_check_function_decorators():
in_schema = DataFrameSchema(
{
"a": Column(Int, [
Check(lambda x: x >= 1, element_wise=True),
Check(lambda s: s.mean() > 0)]),
"b": Column(String,
Check(lambda x: x in ["x", "y", "z"],
element_wise=True)),
"c": Column(DateTime,
Check(lambda x: pd.Timestamp("2018-01-01") <= x,
element_wise=True)),
"d": Column(Float,
Check(lambda x: np.isnan(x) or x < 3,
element_wise=True),
nullable=True)
},
transformer=lambda df: df.assign(e="foo")
)
out_schema = DataFrameSchema(
{
"e": Column(String,
Check(lambda s: s == "foo")),
"f": Column(String,
Check(lambda x: x in ["a", "b"], element_wise=True))
})
# case 1: simplest path test - df is first argument and function returns
# single dataframe as output.