def test_node_returning_none(self):
    pipeline = Pipeline([node(identity, "A", "B"), node(return_none, "B", "C")])
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    pattern = "Saving `None` to a `DataSet` is not allowed"
    with pytest.raises(DataSetError, match=pattern):
        ParallelRunner().run(pipeline, catalog)
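# --- Hedged sketch, not from the original source (assumes kedro's pre-0.18
# `kedro.io` API): the same guard the test above checks also fires when
# saving None directly, since `save` rejects None before storing anything.
from kedro.io import DataSetError, MemoryDataSet

data_set = MemoryDataSet()
try:
    data_set.save(None)
except DataSetError as err:
    print(err)  # message says saving `None` to a data set is not allowed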
@pytest.fixture
def memory_catalog():
    ds1 = MemoryDataSet({"data": 42})
    ds2 = MemoryDataSet([1, 2, 3, 4, 5])
    return DataCatalog({"ds1": ds1, "ds2": ds2})
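# --- Hedged sketch (assumes kedro's pre-0.18 `kedro.io` API): a DataCatalog
# like the fixture above hands stored objects back by name via `load`.
from kedro.io import DataCatalog, MemoryDataSet

catalog = DataCatalog({"ds1": MemoryDataSet({"data": 42})})
assert catalog.load("ds1") == {"data": 42}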
def test_save_modify_original_data(spark_data_frame):
    """Check that the data set object is not updated when the original
    SparkDataFrame is changed."""
    memory_data_set = MemoryDataSet()
    memory_data_set.save(spark_data_frame)
    spark_data_frame = _update_spark_df(spark_data_frame, 1, 1, "new value")
    assert not _check_equals(memory_data_set.load(), spark_data_frame)
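# --- Hedged sketch illustrating the copy semantics the test above relies on
# (pre-0.18 kedro): MemoryDataSet copies plain Python objects on save
# (deepcopy by default), so mutating the original afterwards has no effect.
from kedro.io import MemoryDataSet

original = [1, 2, 3]
memory_data_set = MemoryDataSet()
memory_data_set.save(original)
original.append(4)
assert memory_data_set.load() == [1, 2, 3]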
@pytest.fixture
def conflicting_feed_dict(pandas_df_feed_dict):
    ds1 = MemoryDataSet({"data": 0})
    ds3 = pandas_df_feed_dict["ds3"]
    return {"ds1": ds1, "ds3": ds3}
def test_exists(self, new_data):
    """Test `exists` method invocation."""
    data_set = MemoryDataSet()
    assert not data_set.exists()
    data_set.save(new_data)
    assert data_set.exists()
def test_memory_data_set_input(self, fan_out_fan_in):
    pipeline = Pipeline([fan_out_fan_in])
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    result = ParallelRunner().run(pipeline, catalog)
    assert "Z" in result
    assert len(result["Z"]) == 3
    assert result["Z"] == ("42", "42", "42")
@pytest.fixture
def memory_data_set(spark_data_frame):
    return MemoryDataSet(data=spark_data_frame)
def create_default_data_set(self, ds_name: str) -> AbstractDataSet:
    """Factory method for creating the default data set for the runner.

    Args:
        ds_name: Name of the missing data set.

    Returns:
        An instance of an implementation of AbstractDataSet to be used
        for all unregistered data sets.
    """
    return MemoryDataSet()
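# --- Hedged sketch (assumes kedro's pre-0.18 API; `double` is a hypothetical
# node function, not from the source): intermediate data set "B" is never
# registered, so the runner backs it with a default MemoryDataSet via
# create_default_data_set, and the free output "C" comes back in the result.
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def double(x):
    return x * 2

pipeline = Pipeline([node(double, "A", "B"), node(double, "B", "C")])
catalog = DataCatalog({"A": MemoryDataSet(21)})
result = SequentialRunner().run(pipeline, catalog)
assert result["C"] == 84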
if self.runner is None:
    # Eager mode: call the node functions directly, without a Kedro runner.
    self.df = bundle_train_and_test_data(args_raw, train_df, test_df)
    self.args = impute_cols_features(args_raw, self.df)
    self.args = schedule_propensity_scoring(self.args, self.df)
    self.treatment_fractions = treatment_fractions_(self.args, self.df)
    if self.args.need_propensity_scoring:
        self.propensity_model = fit_propensity(self.args, self.df)
        self.df = estimate_propensity(self.args, self.df, self.propensity_model)

if self.runner:
    # Kedro mode: feed the raw inputs into the catalog, then run tagged
    # pipeline slices and read the results back out of the catalog.
    self.kedro_context.catalog.add_feed_dict(
        {
            "train_df": MemoryDataSet(train_df),
            "test_df": MemoryDataSet(test_df),
            "args_raw": MemoryDataSet(args_raw),
        },
        replace=True,
    )
    self.kedro_context.catalog.add_feed_dict(dataset_catalog, replace=True)
    self.kedro_context.run(tags=["011_bundle_train_and_test_data"])
    self.df = self.kedro_context.catalog.load("df_00")
    self.kedro_context.run(
        tags=[
            "121_prepare_args",
            "131_treatment_fractions_",
            "141_initialize_model",
        ]
    )
    self.args = self.kedro_context.catalog.load("args")
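# --- Hedged sketch (pre-0.18 kedro API): add_feed_dict, as used above,
# registers in-memory entries on an existing catalog; raw values are wrapped
# in MemoryDataSet automatically, while explicit instances are used as given.
from kedro.io import DataCatalog, MemoryDataSet

catalog = DataCatalog({})
catalog.add_feed_dict({"train_df": MemoryDataSet([1, 2, 3])}, replace=True)
catalog.add_feed_dict({"args_raw": {"lr": 0.1}}, replace=True)
assert catalog.load("train_df") == [1, 2, 3]
assert catalog.load("args_raw") == {"lr": 0.1}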
def set_catalog(context, key1, key2, key3, key4):
    ds1 = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
    ds2 = pd.DataFrame({"col1": [9, 8], "col2": [7, 6], "col3": [5, 4]})
    context.catalog = DataCatalog(
        {
            key1: MemoryDataSet(ds1),
            key2: MemoryDataSet(),
            key3: MemoryDataSet(ds2),
            key4: MemoryDataSet(),
        }
    )