How to use the pandera.String data type in pandera

To help you get started, we’ve selected a few pandera examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pandera-dev / pandera / tests / test_hypotheses.py View on Github external
"height_in_feet": [6.5, 7, 6.1, 5.1, 4],
            "sex": ["M", "M", "F", "F", "F"]
        })
    )

    # Initialise the different ways of calling a test:
    schema_pass_ttest_on_alpha_val_1 = DataFrameSchema({
        "height_in_feet": Column(Float, [
            Hypothesis.two_sample_ttest(
                sample1="M",
                sample2="F",
                groupby="sex",
                relationship="greater_than",
                alpha=0.5),
        ]),
        "sex": Column(String)
    })

    schema_pass_ttest_on_alpha_val_2 = DataFrameSchema({
        "height_in_feet": Column(Float, [
            Hypothesis(test=stats.ttest_ind,
                       samples=["M", "F"],
                       groupby="sex",
                       relationship="greater_than",
                       relationship_kwargs={"alpha": 0.5}
                       ),
        ]),
        "sex": Column(String)
    })

    schema_pass_ttest_on_alpha_val_3 = DataFrameSchema({
        "height_in_feet": Column(Float, [
github pandera-dev / pandera / tests / test_hypotheses.py View on Github external
relationship="greater_than",
                       relationship_kwargs={"alpha": 0.05}),
        ]),
        "sex": Column(String)
    })

    schema_fail_ttest_on_alpha_val_3 = DataFrameSchema({
        "height_in_feet": Column(Float, [
            Hypothesis.two_sample_ttest(
                sample1="M",
                sample2="F",
                groupby="sex",
                relationship="greater_than",
                alpha=0.05),
        ]),
        "sex": Column(String)
    })

    with pytest.raises(errors.SchemaError):
        schema_fail_ttest_on_alpha_val_1.validate(df)
    with pytest.raises(errors.SchemaError):
        schema_fail_ttest_on_alpha_val_2.validate(df)
    with pytest.raises(errors.SchemaError):
        schema_fail_ttest_on_alpha_val_3.validate(df)
github pandera-dev / pandera / tests / test_checks.py View on Github external
def test_check_groupby():
    """Validate a frame against ``col1`` checks that use the ``groupby`` option.

    ``col1`` carries four checks, two per group key ("foo" and "bar"). The
    grouping is given as a column name, as a list of column names, and as a
    callable that groups the dataframe itself.
    """
    def group_by_col2(frame):
        # Callable form of the groupby argument: group the frame on "col2".
        return frame.groupby("col2")

    col1_checks = [
        Check(lambda s: s["foo"] > 10, groupby="col2"),
        Check(lambda s: s["bar"] < 10, groupby=["col2"]),
        Check(lambda s: s["foo"] > 10, groupby=group_by_col2),
        Check(lambda s: s["bar"] < 10, groupby=group_by_col2),
    ]
    col2_check = Check(lambda s: s.isin(["foo", "bar"]))

    schema = DataFrameSchema(
        columns={
            "col1": Column(Int, col1_checks),
            "col2": Column(String, col2_check),
        },
        index=Index(Int, name="data_id"),
    )

    # In this fixture the "bar" rows hold 7, 8, 9 and the "foo" rows hold
    # 11, 12, 13, so every "< 10" / "> 10" check above is expected to pass.
    valid_frame = pd.DataFrame(
        data={
            "col1": [7, 8, 9, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )

    validated = schema.validate(valid_frame)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 2
    assert set(validated.columns) == {"col1", "col2"}
github pandera-dev / pandera / tests / test_pandera.py View on Github external
def test_dataframe_schema():
    schema = DataFrameSchema(
        {
            "a": Column(Int,
                        Check(lambda x: x > 0, element_wise=True)),
            "b": Column(Float,
                        Check(lambda x: 0 <= x <= 10, element_wise=True)),
            "c": Column(String,
                        Check(lambda x: set(x) == {"x", "y", "z"})),
            "d": Column(Bool,
                        Check(lambda x: x.mean() > 0.5)),
            "e": Column(Category,
                        Check(lambda x: set(x) == {"c1", "c2", "c3"})),
            "f": Column(Object,
                        Check(lambda x: x.isin([(1,), (2,), (3,)]))),
            "g": Column(DateTime,
                        Check(lambda x: x >= pd.Timestamp("2015-01-01"),
                              element_wise=True)),
            "i": Column(Timedelta,
                        Check(lambda x: x < pd.Timedelta(10, unit="D"),
                              element_wise=True))
        })
    df = pd.DataFrame({
        "a": [1, 2, 3],
github pandera-dev / pandera / tests / test_schemas.py View on Github external
def test_coerce_dtype_nullable_str():
    """Check how null values are handled when coercing a column to String.

    Two fixtures are used, each ending in two null entries: one expresses
    them as ``np.nan`` and the other as ``None``. Each fixture must be
    rejected by a non-nullable String column and accepted by a nullable one.
    """
    df_nans = pd.DataFrame({
        "col": ["foobar", "foo", "bar", "baz", np.nan, np.nan],
    })
    df_nones = pd.DataFrame({
        "col": ["foobar", "foo", "bar", "baz", None, None],
    })

    non_nullable_schema = DataFrameSchema({
        "col": Column(String, coerce=True, nullable=False)
    })
    for df in [df_nans, df_nones]:
        # One ``pytest.raises`` per dataframe: a ``with`` block exits on the
        # first exception, so wrapping the whole loop would only ever
        # exercise the first fixture and silently skip the second.
        with pytest.raises(errors.SchemaError):
            non_nullable_schema.validate(df)

    nullable_schema = DataFrameSchema({
        "col": Column(String, coerce=True, nullable=True)
    })

    for df in [df_nans, df_nones]:
        assert isinstance(nullable_schema.validate(df), pd.DataFrame)
github pandera-dev / pandera / tests / test_schemas.py View on Github external
def test_dataframe_schema():
    schema = DataFrameSchema(
        {
            "a": Column(Int,
                        Check(lambda x: x > 0, element_wise=True)),
            "b": Column(Float,
                        Check(lambda x: 0 <= x <= 10, element_wise=True)),
            "c": Column(String,
                        Check(lambda x: set(x) == {"x", "y", "z"})),
            "d": Column(Bool,
                        Check(lambda x: x.mean() > 0.5)),
            "e": Column(Category,
                        Check(lambda x: set(x) == {"c1", "c2", "c3"})),
            "f": Column(Object,
                        Check(lambda x: x.isin([(1,), (2,), (3,)]))),
            "g": Column(DateTime,
                        Check(lambda x: x >= pd.Timestamp("2015-01-01"),
                              element_wise=True)),
            "i": Column(Timedelta,
                        Check(lambda x: x < pd.Timedelta(10, unit="D"),
                              element_wise=True))
        })
    df = pd.DataFrame(
        {
github pandera-dev / pandera / tests / test_decorators.py View on Github external
def test_check_function_decorators():
    in_schema = DataFrameSchema(
        {
            "a": Column(Int, [
                Check(lambda x: x >= 1, element_wise=True),
                Check(lambda s: s.mean() > 0)]),
            "b": Column(String,
                        Check(lambda x: x in ["x", "y", "z"],
                              element_wise=True)),
            "c": Column(DateTime,
                        Check(lambda x: pd.Timestamp("2018-01-01") <= x,
                              element_wise=True)),
            "d": Column(Float,
                        Check(lambda x: np.isnan(x) or x < 3,
                              element_wise=True),
                        nullable=True)
        },
        transformer=lambda df: df.assign(e="foo")
    )
    out_schema = DataFrameSchema(
        {
            "e": Column(String,
                        Check(lambda s: s == "foo")),
github pandera-dev / pandera / tests / test_checks.py View on Github external
# referenced in the Check function
    schema_fail_nonexistent_key_in_fn = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })
    with pytest.raises(KeyError, match="^'baz'"):
        schema_fail_nonexistent_key_in_fn.validate(df)

    # raise KeyError when the group does not exist in the groups argument.
    schema_fail_nonexistent_key_in_groups = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["foo"] > 10, groupby="col2", groups=["baz"]),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })
    with pytest.raises(KeyError):
        schema_fail_nonexistent_key_in_groups.validate(df)
github pandera-dev / pandera / tests / test_pandera.py View on Github external
df = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)
    assert len(validated_df.columns) == 2
    assert set(validated_df.columns) == {"col1", "col2"}

    # raise KeyError when groups does not include a particular group name
    schema_fail_key_error = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["bar"] > 10, groupby="col2", groups="foo"),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })
    with pytest.raises(KeyError, match="^'bar'"):
        schema_fail_key_error.validate(df)

    # raise KeyError when the group does not exist in the groupby column when
    # referenced in the Check function
    schema_fail_nonexistent_key_in_fn = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })
    with pytest.raises(KeyError, match="^'baz'"):
        schema_fail_nonexistent_key_in_fn.validate(df)

    # raise KeyError when the group does not exist in the groups argument.
github pandera-dev / pandera / tests / test_decorators.py View on Github external
def test_check_input_method_decorators():
    in_schema = DataFrameSchema({"column1": Column(String)})
    out_schema = DataFrameSchema({"column2": Column(Int)})
    dataframe = pd.DataFrame({"column1": ["a", "b", "c"]})

    def _transform_helper(df):
        return df.assign(column2=[1, 2, 3])

    class TransformerClass(object):
        """A repeatable set of decorator input styles for testing"""

        @check_input(in_schema)
        @check_output(out_schema)
        def transform_first_arg(self, df):
            return _transform_helper(df)

        @check_input(in_schema, 0)
        @check_output(out_schema)