Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_dataframe_schema():
schema = DataFrameSchema(
{
"a": Column(Int,
Check(lambda x: x > 0, element_wise=True)),
"b": Column(Float,
Check(lambda x: 0 <= x <= 10, element_wise=True)),
"c": Column(String,
Check(lambda x: set(x) == {"x", "y", "z"})),
"d": Column(Bool,
Check(lambda x: x.mean() > 0.5)),
"e": Column(Category,
Check(lambda x: set(x) == {"c1", "c2", "c3"})),
"f": Column(Object,
Check(lambda x: x.isin([(1,), (2,), (3,)]))),
"g": Column(DateTime,
Check(lambda x: x >= pd.Timestamp("2015-01-01"),
element_wise=True)),
"i": Column(Timedelta,
Check(lambda x: x < pd.Timedelta(10, unit="D"),
element_wise=True))
})
df = pd.DataFrame(
{
"a": [1, 2, 3],
"b": [1.1, 2.5, 9.9],
"c": ["z", "y", "x"],
"d": [True, True, False],
"e": pd.Series(["c2", "c1", "c3"], dtype="category"),
"f": [(3,), (2,), (1,)],
"g": [pd.Timestamp("2015-02-01"),
pd.Timestamp("2015-02-02"),
# Sample frame: floats, date strings, a column containing None, and a float
# column containing NaN.
df = pd.DataFrame({
    "column1": [10.0, 20.0, 30.0],
    "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
    "column3": [1, 2, None],
    "column4": [1., 1., np.nan],
})

# specify `coerce` at the Column level
schema1 = DataFrameSchema({
    "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
    "column2": Column(DateTime, coerce=True),
    "column3": Column(String, coerce=True, nullable=True),
})

# specify `coerce` at the DataFrameSchema level
schema2 = DataFrameSchema({
    "column1": Column(Int, Check(lambda x: x > 0)),
    "column2": Column(DateTime),
    "column3": Column(String, nullable=True),
}, coerce=True)

# Each schema must produce the expected dtypes after validation.
for schema in [schema1, schema2]:
    result = schema.validate(df)
    assert result.column1.dtype == Int.value
    assert result.column2.dtype == DateTime.value
    # `Series.items()` is used instead of `Series.iteritems()`, which was
    # removed in pandas 2.0; both yield (index, value) pairs.
    for _, x in result.column3.items():
        assert pd.isna(x) or isinstance(x, str)

# make sure that correct error is raised when null values are present
# in a float column that's coerced to an int
schema = DataFrameSchema({"column4": Column(Int, coerce=True)})
with pytest.raises(ValueError):
    schema.validate(df)
def test_multi_index_columns():
    """Validate a DataFrame whose column labels are two-element tuples.

    The schema is keyed by the same tuples used as column labels in the
    DataFrame; the test passes when ``validate`` returns a DataFrame.
    """
    schema = DataFrameSchema({
        ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
        ("zero", "bar"): Column(
            String, Check(lambda s: s.isin(["a", "b", "c", "d"]))),
        ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
        # `pd.Timestamp` replaces the `pd.datetime` alias, which was
        # deprecated in pandas 1.0 and removed in pandas 2.0.
        ("one", "bar"): Column(
            DateTime, Check(lambda s: s == pd.Timestamp("2019-01-01"))),
    })
    validated_df = schema.validate(
        pd.DataFrame({
            ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
            ("zero", "bar"): ["a", "b", "c", "d"],
            ("one", "foo"): [1, 6, 4, 7],
            ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4),
        })
    )
    assert isinstance(validated_df, pd.DataFrame)
def test_dataframe_schema():
schema = DataFrameSchema(
{
"a": Column(Int,
Check(lambda x: x > 0, element_wise=True)),
"b": Column(Float,
Check(lambda x: 0 <= x <= 10, element_wise=True)),
"c": Column(String,
Check(lambda x: set(x) == {"x", "y", "z"})),
"d": Column(Bool,
Check(lambda x: x.mean() > 0.5)),
"e": Column(Category,
Check(lambda x: set(x) == {"c1", "c2", "c3"})),
"f": Column(Object,
Check(lambda x: x.isin([(1,), (2,), (3,)]))),
"g": Column(DateTime,
Check(lambda x: x >= pd.Timestamp("2015-01-01"),
element_wise=True)),
"i": Column(Timedelta,
Check(lambda x: x < pd.Timedelta(10, unit="D"),
element_wise=True))
})
df = pd.DataFrame({
"a": [1, 2, 3],
"b": [1.1, 2.5, 9.9],
"c": ["z", "y", "x"],
"d": [True, True, False],
"e": pd.Series(["c2", "c1", "c3"], dtype="category"),
"f": [(3,), (2,), (1,)],
"g": [pd.Timestamp("2015-02-01"),
pd.Timestamp("2015-02-02"),
pd.Timestamp("2015-02-03")],
def test_check_function_decorators():
in_schema = DataFrameSchema(
{
"a": Column(Int, [
Check(lambda x: x >= 1, element_wise=True),
Check(lambda s: s.mean() > 0)]),
"b": Column(String,
Check(lambda x: x in ["x", "y", "z"],
element_wise=True)),
"c": Column(DateTime,
Check(lambda x: pd.Timestamp("2018-01-01") <= x,
element_wise=True)),
"d": Column(Float,
Check(lambda x: np.isnan(x) or x < 3,
element_wise=True),
nullable=True)
},
transformer=lambda df: df.assign(e="foo")
)
out_schema = DataFrameSchema(
{
"e": Column(String,
Check(lambda s: s == "foo")),
"f": Column(String,
Check(lambda x: x in ["a", "b"], element_wise=True))
})