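# Imports assumed by the snippets below (a reconstruction; the original
# excerpts omit their import headers, so names are inferred from usage):
import numpy as np
import pandas as pd
import pytest

from pandera import (
    Check, Column, DataFrameSchema, SeriesSchema, Int, String, dtypes, errors,
)
from pandera.errors import SchemaError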
df_fail_on_foo = pd.DataFrame(
    data={
"col1": [7, 8, 9, 11, 1, 13],
"col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
},
index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
)
# raise errors.SchemaError when groupby column doesn't exist
df_fail_no_column = pd.DataFrame(
data={
"col1": [7, 8, 20, 11, 12, 13],
},
index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
)
for df in [df_fail_on_bar, df_fail_on_foo, df_fail_no_column]:
with pytest.raises(errors.SchemaError):
schema.validate(df)
int_schema = SeriesSchema(
Int, Check(lambda x: 0 <= x <= 100, element_wise=True))
assert isinstance(int_schema.validate(
pd.Series([0, 30, 50, 100])), pd.Series)
str_schema = SeriesSchema(
String, Check(lambda s: s.isin(["foo", "bar", "baz"])),
nullable=True, coerce=True)
assert isinstance(str_schema.validate(
pd.Series(["foo", "bar", "baz", None])), pd.Series)
assert isinstance(str_schema.validate(
pd.Series(["foo", "bar", "baz", np.nan])), pd.Series)
# error cases
for data in [-1, 101, 50.1, "foo"]:
with pytest.raises(errors.SchemaError):
int_schema.validate(pd.Series([data]))
# these inputs are not pd.Series objects, so validate raises TypeError
for data in [-1, {"a": 1}, -1.0]:
    with pytest.raises(TypeError):
        int_schema.validate(data)
non_duplicate_schema = SeriesSchema(
Int, allow_duplicates=False)
with pytest.raises(errors.SchemaError):
non_duplicate_schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
# when series name doesn't match schema
named_schema = SeriesSchema(Int, name="my_series")
with pytest.raises(
        errors.SchemaError,
        match=r"^Expected .+ to have name"):
    # the mismatched name "your_series" is illustrative; any name other
    # than "my_series" should trigger the error
    named_schema.validate(pd.Series(range(5), name="your_series"))
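# conversely (a small addition, assuming the SeriesSchema API shown
# above), a series whose name matches the schema validates cleanly:
assert isinstance(
    named_schema.validate(pd.Series(range(5), name="my_series")),
    pd.Series)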
def test_category_dtype_coerce():
columns = {
"col": Column(
dtypes.Category,
checks=Check(lambda s: set(s) == {"A", "B", "C"}),
nullable=False
),
}
with pytest.raises(SchemaError):
DataFrameSchema(columns=columns, coerce=False).validate(
pd.DataFrame(
{"col": pd.Series(["A", "B", "A", "B", "C"], dtype="object")}
)
)
validated_df = DataFrameSchema(columns=columns, coerce=True).validate(
pd.DataFrame(
{"col": pd.Series(["A", "B", "A", "B", "C"], dtype="object")}
)
)
assert isinstance(validated_df, pd.DataFrame)
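# For reference, a minimal sketch of what coerce=True amounts to for a
# Category column (an assumption based on pandas' astype, not a claim
# about pandera internals):
s = pd.Series(["A", "B", "A", "B", "C"], dtype="object")
assert s.astype("category").dtype == "category"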
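# run every Check against the series; _prepare_series_input lets
# groupby-based checks see the parent dataframe context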
val_results = []
for check_index, check in enumerate(self.checks):
val_results.append(
check(
self,
check_index,
check._prepare_series_input(series, dataframe_context)))
return all(val_results)
"""Check pandas DataFrame or Series before calling the function.
:param fn: the decorated function whose DataFrame or Series input is
    checked
:param instance: the object to which the wrapped function was bound
when it was called. Only applies to methods.
:param args: the list of positional arguments supplied when the
decorated function was called.
:param kwargs: the dictionary of keyword arguments supplied when the
decorated function was called.
"""
args = list(args)
if isinstance(obj_getter, int):
try:
args[obj_getter] = schema.validate(args[obj_getter])
except IndexError as e:
raise errors.SchemaError(
"error in check_input decorator of function '%s': the "
"index '%s' was supplied to the check but this "
"function accepts '%s' arguments, so the maximum "
"index is '%s'. The full error is: '%s'" %
(fn.__name__,
obj_getter,
len(_get_fn_argnames(fn)),
max(0, len(_get_fn_argnames(fn))-1),
e
)
)
elif isinstance(obj_getter, str):
if obj_getter in kwargs:
kwargs[obj_getter] = schema.validate(kwargs[obj_getter])
else:
arg_spec_args = _get_fn_argnames(fn)
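# A usage sketch of the decorator built on this wrapper (the schema and
# function names are illustrative, assuming pandera's public check_input
# API with an optional obj_getter argument):
from pandera import Column, DataFrameSchema, Int, check_input

in_schema = DataFrameSchema({"col1": Column(Int)})

@check_input(in_schema)        # obj_getter defaults to the first positional arg
def transform(df):
    return df.assign(col2=df["col1"] * 2)

@check_input(in_schema, "df")  # or select the argument to validate by name
def transform_named(df=None):
    return df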
>>> df = (
...     pd.DataFrame({
... "height_in_feet": [8.1, 7, 5.2, 5.1, 4],
... "group": ["A", "A", "B", "B", "B"]
... })
... )
>>> schema.validate(df)[["height_in_feet", "group"]]
height_in_feet group
0 8.1 A
1 7.0 A
2 5.2 B
3 5.1 B
4 4.0 B
"""
if relationship not in cls._RELATIONSHIPS:
raise errors.SchemaError(
"relationship must be one of %s" % set(cls._RELATIONSHIPS))
return cls(
test=stats.ttest_ind,
samples=[sample1, sample2],
groupby=groupby,
relationship=relationship,
test_kwargs={"equal_var": equal_var, "nan_policy": nan_policy},
relationship_kwargs={"alpha": alpha},
error="failed two sample ttest between '%s' and '%s'" % (
sample1, sample2),
)
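# A hedged construction sketch of the classmethod above (the sample and
# group names are illustrative and mirror the doctest; "greater_than" is
# assumed to be one of cls._RELATIONSHIPS):
from pandera import Hypothesis

hypothesis = Hypothesis.two_sample_ttest(
    sample1="A",
    sample2="B",
    groupby="group",
    relationship="greater_than",
    alpha=0.05,
    equal_var=True,
)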
"non-nullable series contains null values: %s" %
(series.name, self._pandas_dtype.value, series.dtype,
series[nulls].head(
constants.N_FAILURE_CASES).to_dict()))
else:
raise errors.SchemaError(
"non-nullable series '%s' contains null values: %s" %
(series.name,
series[nulls].head(
constants.N_FAILURE_CASES).to_dict()))
# Check if the series contains duplicate values
if not self._allow_duplicates:
duplicates = series.duplicated()
if any(duplicates):
raise errors.SchemaError(
"series '%s' contains duplicate values: %s" %
(series.name,
series[duplicates].head(
constants.N_FAILURE_CASES).to_dict()))
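# Note (an inference, not from the original comments): the equality test
# below can itself raise TypeError for some dtypes, e.g. comparing a
# categorical dtype against a dtype string, so that case is treated as a
# type mismatch rather than propagated.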
try:
series.dtype == _dtype
except TypeError:
types_not_matching = True
else:
types_not_matching = series.dtype != _dtype
if _dtype is not None and types_not_matching:
raise errors.SchemaError(
"expected series '%s' to have type %s, got %s" %
(series.name, _dtype, series.dtype))