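# Imports assumed by the snippets below (inferred from usage; these tests rely on
# pandera's legacy top-level dtype aliases such as Int, Float, String):
import numpy as np
import pandas as pd
from scipy import stats

from pandera import (
    Bool, Category, Check, Column, DataFrameSchema, DateTime, Float,
    Hypothesis, Int, Object, String, Timedelta, check_output, dtypes,
)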
def test_dataframe_schema():
    schema = DataFrameSchema(
        {
            "a": Column(Int,
                        Check(lambda x: x > 0, element_wise=True)),
            "b": Column(Float,
                        Check(lambda x: 0 <= x <= 10, element_wise=True)),
            "c": Column(String,
                        Check(lambda x: set(x) == {"x", "y", "z"})),
            "d": Column(Bool,
                        Check(lambda x: x.mean() > 0.5)),
            "e": Column(Category,
                        Check(lambda x: set(x) == {"c1", "c2", "c3"})),
            "f": Column(Object,
                        Check(lambda x: x.isin([(1,), (2,), (3,)]))),
            "g": Column(DateTime,
                        Check(lambda x: x >= pd.Timestamp("2015-01-01"),
                              element_wise=True)),
            "i": Column(Timedelta,
                        Check(lambda x: x < pd.Timedelta(10, unit="D"),
                              element_wise=True))
        })
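    # Minimal usage sketch (not taken from the snippet above): a DataFrame that
    # should satisfy every column check, validated with the standard validate() call.
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.0, 5.0, 9.9],
        "c": ["x", "y", "z"],
        "d": [True, True, False],
        "e": pd.Series(["c1", "c2", "c3"], dtype="category"),
        "f": [(1,), (2,), (3,)],
        "g": pd.to_datetime(["2015-02-01", "2016-01-01", "2017-01-01"]),
        "i": pd.to_timedelta([1, 2, 3], unit="D"),
    })
    assert isinstance(schema.validate(df), pd.DataFrame)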
def test_coerce_dtype():
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, 3],
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String),
    }, coerce=True)
    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        for _, x in result.column3.items():
            assert isinstance(x, str)
def test_category_dtype():
    schema = DataFrameSchema(
        columns={
            "col": Column(
                dtypes.Category,
                checks=[
                    Check(lambda s: set(s) == {"A", "B", "C"}),
                    Check(lambda s:
                          s.cat.categories.tolist() == ["A", "B", "C"]),
                    Check(lambda s: s.isin(["A", "B", "C"]))
                ],
                nullable=False
            ),
        },
        coerce=False
    )
    validated_df = schema.validate(
        pd.DataFrame(
            {"col": pd.Series(["A", "B", "A", "B", "C"], dtype="category")}
        )
    )
def test_check_groupby_multiple_columns():
    schema = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s[("bar", True)].sum() == 16,  # 7 + 9
                  groupby=["col2", "col3"]),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        "col3": Column(Bool),
    })
    df_pass = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })
    df = schema.validate(df_pass)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 3
    assert set(df.columns) == {"col1", "col2", "col3"}
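    # Sketch of the failing case, assuming pandera raises errors.SchemaError
    # when a grouped check fails (names below are not part of the snippet above).
    import pytest
    from pandera import errors

    df_fail = df_pass.copy()
    df_fail["col1"] = [7, 8, 10, 11, 12, 13]  # ("bar", True) group now sums to 17, not 16
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)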
def test_multi_index_columns():
    schema = DataFrameSchema({
        ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
        ("zero", "bar"): Column(
            String, Check(lambda s: s.isin(["a", "b", "c", "d"]))),
        ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
        ("one", "bar"): Column(
            DateTime, Check(lambda s: s == pd.Timestamp(2019, 1, 1)))
    })
    validated_df = schema.validate(
        pd.DataFrame({
            ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
            ("zero", "bar"): ["a", "b", "c", "d"],
            ("one", "foo"): [1, 6, 4, 7],
            ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4)
        })
    )
    assert isinstance(validated_df, pd.DataFrame)
def test_dataframe_hypothesis_checks():
    df = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })
    hypothesis_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=[
            # two-sample test
            Hypothesis(
                test=stats.ttest_ind,
                samples=["col1", "col2"],
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                relationship_kwargs={"alpha": 0.5},
            ),
            # one-sample test (relationship mirrors the two-sample check above;
            # popmean is an assumed value, required by stats.ttest_1samp)
            Hypothesis(
                test=stats.ttest_1samp,
                samples=["col1"],
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                test_kwargs={"popmean": 50},
            ),
        ],
    )
    hypothesis_check_schema.validate(df)
@check_output(DataFrameSchema({"column2": Column(Float)}))
def test_func(df):
    return df
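# Minimal usage sketch for the decorated function above: check_output validates
# the returned DataFrame against the output schema (column2 must be Float).
out = test_func(pd.DataFrame({"column2": [1.0, 2.0, 3.0]}))
assert isinstance(out, pd.DataFrame)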
def test_dtypes():
    for dtype in [
            dtypes.Float,
            dtypes.Float16,
            dtypes.Float32,
            dtypes.Float64]:
        schema = DataFrameSchema({"col": Column(dtype, nullable=False)})
        validated_df = schema.validate(
            pd.DataFrame(
                {"col": [-123.1, -7654.321, 1.0, 1.1, 1199.51, 5.1, 4.6]},
                dtype=dtype.value))
        assert isinstance(validated_df, pd.DataFrame)

    for dtype in [
            dtypes.Int,
            dtypes.Int8,
            dtypes.Int16,
            dtypes.Int32,
            dtypes.Int64]:
        schema = DataFrameSchema({"col": Column(dtype, nullable=False)})
        validated_df = schema.validate(
            pd.DataFrame(
                {"col": [-712, -4, -321, 0, 1, 777, 5, 123, 9000]},
                dtype=dtype.value))
        assert isinstance(validated_df, pd.DataFrame)