Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Validate a small frame using ``check_function`` as a vectorized check.

    The callable is attached (with ``element_wise=False``) to both the
    integer column ``a`` and the float column ``b``.

    :param check_function: callable handed to ``Check`` as ``fn``.
    :param should_fail: when truthy, validation is expected to raise
        ``errors.SchemaError``; otherwise it must complete without raising.
    """
    def vectorized_check():
        # every column receives its own Check instance
        return Check(fn=check_function, element_wise=False)

    column_a = Column(Int, vectorized_check())
    column_b = Column(Float, vectorized_check())
    schema = DataFrameSchema({"a": column_a, "b": column_b})
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})

    if not should_fail:
        schema.validate(frame)
        return

    with pytest.raises(errors.SchemaError):
        schema.validate(frame)
def test_sample_dataframe_schema():
    """Test that validating a random sample returns the entire dataframe.

    For every seed, the rows picked by ``sample(100, random_state=seed)``
    are overwritten with -1, and ``validate`` is then called with the same
    ``sample`` size and ``random_state``.
    """
    frame = pd.DataFrame({"col1": range(1, 1001)})

    # the check only passes for values equal to -1
    all_minus_one = Check(lambda s: s == -1)
    schema = DataFrameSchema(columns={"col1": Column(Int, all_minus_one)})

    for seed in (11, 123456, 9000, 654):
        sampled_rows = frame.sample(100, random_state=seed).index
        frame.loc[sampled_rows] = -1
        validated = schema.validate(frame, sample=100, random_state=seed)
        assert validated.equals(frame)
def test_multi_index_columns():
    """Test that a schema keyed by tuple column labels validates a dataframe.

    Both the schema and the dataframe use ``(outer, inner)`` tuples as
    column keys; validation must succeed and return a ``pd.DataFrame``.
    """
    schema = DataFrameSchema({
        ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
        ("zero", "bar"): Column(
            String, Check(lambda s: s.isin(["a", "b", "c", "d"]))),
        ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
        # ``pd.datetime`` was deprecated in pandas 1.0 and later removed, so
        # the comparison value is built with ``pd.Timestamp`` instead.
        ("one", "bar"): Column(
            DateTime, Check(lambda s: s == pd.Timestamp(2019, 1, 1)))
    })
    validated_df = schema.validate(
        pd.DataFrame({
            ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
            ("zero", "bar"): ["a", "b", "c", "d"],
            ("one", "foo"): [1, 6, 4, 7],
            ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4)
        })
    )
    assert isinstance(validated_df, pd.DataFrame)
def init_schema_element_wise():
    """Build (and discard) a schema mixing ``element_wise`` with ``groupby``.

    The check on ``col1`` sets both ``element_wise=True`` and
    ``groupby=["col2"]``. Nothing is returned.
    """
    grouped_element_check = Check(
        lambda s: s["foo"] > 10,
        element_wise=True,
        groupby=["col2"],
    )
    col1 = Column(Int, [grouped_element_check])
    col2 = Column(String, Check(lambda s: s.isin(["foo", "bar"])))
    DataFrameSchema({"col1": col1, "col2": col2})
def test_index_schema():
    """Test index validation with an element-wise and a whole-index check."""
    index_checks = [
        Check(lambda x: 1 <= x <= 11, element_wise=True),
        Check(lambda index: index.mean() > 1),
    ]
    schema = DataFrameSchema(columns={}, index=Index(Int, index_checks))

    # index values 1..10 satisfy both checks
    in_range = pd.DataFrame(index=range(1, 11), dtype="int64")
    assert isinstance(schema.validate(in_range), pd.DataFrame)

    # index values 12..19 fall outside the 1 <= x <= 11 bound
    out_of_range = pd.DataFrame(index=range(1, 20))
    with pytest.raises(errors.SchemaError):
        schema.validate(out_of_range)
def test_series_schema_multiple_validators():
    """Test that a SeriesSchema enforces every check in its list."""
    within_bounds = Check(lambda x: 0 <= x <= 50, element_wise=True)
    contains_21 = Check(lambda s: (s == 21).any())
    schema = SeriesSchema(Int, [within_bounds, contains_21])

    passing = pd.Series([1, 5, 21, 50])
    assert isinstance(schema.validate(passing), pd.Series)

    # every value is within bounds, but none equals 21, so one of the two
    # validators fails and an error must be raised
    failing = pd.Series([1, 5, 20, 50])
    with pytest.raises(errors.SchemaError):
        schema.validate(failing)
def test_coerce_dtype_in_dataframe():
    """Test dtype coercion configured per column and per schema.

    Two equivalent schemas are validated against the same dataframe: one
    sets ``coerce=True`` on each ``Column``, the other sets it once on the
    ``DataFrameSchema``. A final schema checks that coercing a float column
    containing NaN to an integer type raises ``ValueError``.
    """
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })

    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })

    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        # `Series.iteritems` was removed in pandas 2.0; `Series.items` is the
        # supported spelling and yields the same (index, value) pairs.
        for _, x in result.column3.items():
            assert pd.isna(x) or isinstance(x, str)

    # make sure that correct error is raised when null values are present
    # in a float column that's coerced to an int
    null_int_schema = DataFrameSchema({"column4": Column(Int, coerce=True)})
    with pytest.raises(ValueError):
        null_int_schema.validate(df)
def test_format_failure_case_exceptions():
    """Test that ``_format_failure_cases`` raises TypeError for these inputs."""
    check = Check(lambda x: x.isna().sum() == 0)
    # an int, a str, a float, a dict and a list
    rejected_inputs = (1, "foobar", 1.0, {"key": "value"}, list(range(10)))
    for data in rejected_inputs:
        with pytest.raises(TypeError):
            check._format_failure_cases(data)
def test_head_dataframe_schema():
    """
    Test that schema can validate head of dataframe, returns entire dataframe.
    """
    # the first 100 rows are non-negative, the remaining 1000 are negative
    leading_values = list(range(100))
    trailing_values = list(range(-1, -1001, -1))
    frame = pd.DataFrame({"col1": leading_values + trailing_values})

    non_negative = Check(lambda s: s >= 0)
    schema = DataFrameSchema(columns={"col1": Column(Int, non_negative)})

    # Validating with head of 100 should pass
    validated = schema.validate(frame, head=100)
    assert validated.equals(frame)

    # Validating every row reaches the negative values and must fail
    with pytest.raises(errors.SchemaError):
        schema.validate(frame)
def init_schema_no_groupby_column():
    """Build (and discard) a schema whose check groups by an undeclared column.

    ``col1`` carries a check with ``groupby=["col2"]``, but the schema
    declares no ``col2`` column. Nothing is returned.
    """
    grouped_check = Check(lambda s: s["foo"] > 10, groupby=["col2"])
    DataFrameSchema({"col1": Column(Int, [grouped_check])})
# building a schema that groups by a column it does not declare must fail
with pytest.raises(errors.SchemaInitError):
    init_schema_no_groupby_column()

# can't use groupby argument in SeriesSchema or Index objects
expected_message = "^Cannot use groupby checks with"
for schema_cls in (SeriesSchema, Index):
    with pytest.raises(errors.SchemaInitError, match=expected_message):
        schema_cls(Int, Check(lambda s: s["bar"] == 1, groupby="foo"))