# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
test_set: DataFrame,
training_set: DataFrame,
alpha: float = 0.5,
l1_ratio: float = 0.5,
) -> ElasticNet:
""" Train wine prediction model """
lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
lr.fit(training_set.drop(["quality"], 1), training_set[["quality"]])
prediction = lr.predict(test_set.drop(["quality"], 1))
(rmse, mae, r2) = calculate_metrics(test_set[["quality"]], prediction)
log_metric("alpha", alpha)
log_metric("rmse", rmse)
log_metric("mae", rmse)
log_metric("r2", r2)
logging.info(
"Elasticnet model (alpha=%f, l1_ratio=%f): rmse = %f, mae = %f, r2 = %f",
alpha,
l1_ratio,
rmse,
mae,
r2,
)
return lr
def train_model(
    test_set: pd.DataFrame,
    training_set: pd.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
) -> ElasticNet:
    """Train an ElasticNet regressor on ``training_set`` and log test metrics.

    Args:
        test_set: Held-out rows containing a "target" column; used only for metrics.
        training_set: Training rows containing a "target" column.
        alpha: ElasticNet regularization strength.
        l1_ratio: Mix between L1 and L2 penalties (0 = ridge, 1 = lasso).

    Returns:
        The fitted ElasticNet model.
    """
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    # drop(columns=...) instead of the positional axis argument, which was
    # deprecated and removed in pandas 2.0.
    lr.fit(training_set.drop(columns=["target"]), training_set[["target"]])
    prediction = lr.predict(test_set.drop(columns=["target"]))
    (rmse, mae, r2) = calculate_metrics(test_set[["target"]], prediction)
    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)
    logging.info(
        "Elasticnet model (alpha=%f, l1_ratio=%f): rmse = %f, mae = %f, r2 = %f",
        alpha,
        l1_ratio,
        rmse,
        mae,
        r2,
    )
    return lr
def split_data_spark(
    raw_data: spark.DataFrame,
) -> Tuple[spark.DataFrame, spark.DataFrame, spark.DataFrame]:
    """Split a Spark DataFrame into train/test/validation sets and log target stats.

    Drops helper columns ("id", "0_norm", "10_norm") when all are present,
    then splits roughly 80/10/10.

    Args:
        raw_data: Input Spark DataFrame with a "target" column.

    Returns:
        (train, test, validation) Spark DataFrames.
    """
    columns_to_remove = {"id", "0_norm", "10_norm"}
    if columns_to_remove.issubset(raw_data.schema.names):
        # PySpark's drop() takes column names as varargs, not a collection;
        # passing the set directly would fail.
        raw_data = raw_data.drop(*columns_to_remove)
    (train, test) = raw_data.randomSplit([0.8, 0.2])
    # BUG FIX: split the *test* slice, not raw_data again — re-splitting the
    # full dataset made test/validation overlap with the training set.
    (test, validation) = test.randomSplit([0.5, 0.5])
    target_stats = raw_data.describe(["target"])
    log_metric(
        "target.mean",
        target_stats.filter(target_stats["summary"] == "mean")
        .collect()[0]
        .asDict()["target"],
    )
    log_metric(
        "target.std",
        target_stats.filter(target_stats["summary"] == "stddev")
        .collect()[0]
        .asDict()["target"],
    )
    return train, test, validation
def split_data(
    raw_data: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split a pandas DataFrame into train/test/validation sets and log target stats.

    Drops helper columns ("id", "0_norm", "10_norm") when all are present,
    then splits ~75/12.5/12.5 (train_test_split defaults, then a 50/50 split
    of the held-out part).

    Args:
        raw_data: Input DataFrame with a "target" column.

    Returns:
        (train_df, test_df, validation_df) DataFrames.
    """
    columns_to_remove = {"id", "0_norm", "10_norm"}
    if columns_to_remove.issubset(raw_data.columns):
        # Rebind to a new frame instead of inplace=True so the caller's
        # DataFrame is not mutated; drop() expects list-like labels, not a set.
        raw_data = raw_data.drop(columns=list(columns_to_remove))
    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)
    log_dataframe("raw", raw_data)
    log_metric("target.mean", raw_data["target"].mean())
    log_metric("target.std", raw_data["target"].std())
    return train_df, test_df, validation_df
def validate_model(model, validation_dataset):
    # type: (ElasticNet, pd.DataFrame) -> str
    """Calculate metrics of the wine prediction model (py27-compatible).

    Logs rmse/mae/r2 metrics and a prediction scatter plot artifact, then
    returns the metrics as a "rmse,mae,r2" CSV string.
    """
    validation_x = validation_dataset.drop(["quality"], 1)
    validation_y = validation_dataset[["quality"]]
    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y, prediction)
    log_artifact(
        "prediction_scatter_plot", _create_scatter_plot(validation_y, prediction)
    )
    log_metric("rmse", rmse)
    # BUG FIX: was logging rmse under the "mae" key.
    log_metric("mae", mae)
    log_metric("r2", r2)
    return "%s,%s,%s" % (rmse, mae, r2)
def validate_model_for_customer(
    model: ElasticNet, validation_dataset: pd.DataFrame, threshold=0.2
) -> Tuple[str, figure.Figure]:
    """Validate ``model`` on ``validation_dataset`` and fail below an R2 threshold.

    Args:
        model: Fitted ElasticNet model.
        validation_dataset: Data containing a "target" column.
        threshold: Minimum acceptable R2 score.

    Returns:
        A "rmse,mae,r2" CSV string and a scatter plot of predictions.

    Raises:
        Exception: If the model's R2 score is below ``threshold``.
    """
    log_dataframe("validation", validation_dataset)
    # Support for py3 parquet: normalize column labels to str.
    validation_dataset = validation_dataset.rename(str, axis="columns")
    # drop(columns=...) instead of the positional axis argument, which was
    # deprecated and removed in pandas 2.0.
    validation_x = validation_dataset.drop(columns=["target"])
    validation_y = validation_dataset[["target"]]
    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y, prediction)
    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)
    fig = _create_scatter_plot(validation_y, prediction)
    if r2 < threshold:
        raise Exception(
            "Model quality is below threshold. Got R2 equal to %s, expect at least %s"
            % (r2, threshold)
        )
    return "%s,%s,%s" % (rmse, mae, r2), fig
# NOTE(review): this redefines validate_model and shadows the earlier
# definition in this module — confirm which one callers intend and remove
# the duplicate.
def validate_model(model, validation_dataset):
    # type: (ElasticNet, pd.DataFrame) -> str
    """Calculate metrics of the wine prediction model (py27-compatible).

    Logs rmse/mae/r2 metrics and a prediction scatter plot artifact, then
    returns the metrics as a "rmse,mae,r2" CSV string.
    """
    validation_x = validation_dataset.drop(["quality"], 1)
    validation_y = validation_dataset[["quality"]]
    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y, prediction)
    log_artifact(
        "prediction_scatter_plot", _create_scatter_plot(validation_y, prediction)
    )
    log_metric("rmse", rmse)
    # BUG FIX: was logging rmse under the "mae" key.
    log_metric("mae", mae)
    log_metric("r2", r2)
    return "%s,%s,%s" % (rmse, mae, r2)
def a():
    """Advance the module-level stepper, log the step as a metric, and return "a"."""
    current_step = stepper.step()
    log_metric("step", current_step)
    return "a"