prediction_column : str
    The name of the column with the predictions from the model.
weight_column : str, optional
    The name of the column with scores to weight the data.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
def_params = {"fit_intercept": True}
params = def_params if not params else merge(def_params, params)
weights = df[weight_column].values if weight_column else None
features = features if not encode_extra_cols else expand_features_encoded(df, features)
regr = LinearRegression(**params)
regr.fit(df[features].values, df[target].values, sample_weight=weights)
def p(new_df: pd.DataFrame) -> pd.DataFrame:
    return new_df.assign(**{prediction_column: regr.predict(new_df[features].values)})
p.__doc__ = learner_pred_fn_docstring("linear_regression_learner")
log = {'linear_regression_learner': {
    'features': features,
    'target': target,
    'parameters': params,
    'prediction_column': prediction_column,
    'package': "sklearn",
    'package_version': sk_version,
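# --- Usage sketch (not part of the library source) ---------------------------
# A minimal, hypothetical example of calling this learner, assuming fklearn's
# convention that a learner returns a (predict_fn, scored_df, log) tuple.
# The toy DataFrame is invented for illustration.
import pandas as pd
from fklearn.training.regression import linear_regression_learner

toy_df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0],
                       "x2": [0.0, 1.0, 0.0, 1.0],
                       "y": [2.0, 4.1, 5.9, 8.2]})
predict_fn, scored_df, train_log = linear_regression_learner(toy_df, features=["x1", "x2"], target="y")
new_scores = predict_fn(toy_df)  # appends the "prediction" column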
    The name of the column with the predictions from the model.
    If a multiclass problem, additional prediction_column_i columns will be added for i in range(0, n_classes).
weight_column : str, optional
    The name of the column with scores to weight the data.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
def_params = {"C": 0.1, "multi_class": "ovr"}
merged_params = def_params if not params else merge(def_params, params)
weights = df[weight_column].values if weight_column else None
features = features if not encode_extra_cols else expand_features_encoded(df, features)
clf = LogisticRegression(**merged_params)
clf.fit(df[features].values, df[target].values, sample_weight=weights)
def p(new_df: pd.DataFrame) -> pd.DataFrame:
    pred = clf.predict_proba(new_df[features].values)
    if merged_params["multi_class"] == "multinomial":
        col_dict = {prediction_column + "_" + str(key): value for (key, value) in enumerate(pred.T)}
        col_dict.update({prediction_column: pred.argmax(axis=1)})
    else:
        col_dict = {prediction_column: pred[:, 1]}
    return new_df.assign(**col_dict)
p.__doc__ = learner_pred_fn_docstring("logistic_classification_learner")
    http://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html
    If not passed, the default will be used.
prediction_column : str
    The name of the column with the predictions from the model.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
params = extra_params if extra_params else {}
params['alpha'] = alpha
params['kernel'] = kernel
features = features if not encode_extra_cols else expand_features_encoded(df, features)
gp = GaussianProcessRegressor(**params)
gp.fit(df[features], df[target])
if extra_variance == "fit":
    extra_variance = df[target].std()
elif not extra_variance:
    extra_variance = 1
def p(new_df: pd.DataFrame) -> pd.DataFrame:
    if return_std:
        # predict on the incoming frame, not the training df
        pred_mean, pred_std = gp.predict(new_df[features], return_std=True)
        pred_std *= extra_variance
        return new_df.assign(**{prediction_column: pred_mean, prediction_column + "_std": pred_std})
    else:
        return new_df.assign(**{prediction_column: gp.predict(new_df[features])})
p.__doc__ = learner_pred_fn_docstring("gp_regression_learner")
params : dict
    The IsolationForest parameters in the format {"par_name": param}. See:
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
prediction_column : str
    The name of the column with the predictions from the model.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
default_params = {"n_jobs": -1, "random_state": 1729}
params = default_params if not params else merge(default_params, params)
features = features if not encode_extra_cols else expand_features_encoded(df, features)
model = IsolationForest()
model.set_params(**params)
model.fit(df[features].values)
def p(new_df: pd.DataFrame) -> pd.DataFrame:
    output_col = {prediction_column: model.decision_function(new_df[features])}
    return new_df.assign(**output_col)
p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")
log = {'isolation_forest_learner': {
    'features': features,
    'parameters': params,
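# --- Usage sketch (not part of the library source) ---------------------------
# Invented example: the prediction column holds sklearn's decision_function
# scores, where lower values indicate more anomalous rows.
import pandas as pd
from fklearn.training.unsupervised import isolation_forest_learner

toy_df = pd.DataFrame({"x1": [0.1, 0.2, 0.1, 9.9], "x2": [1.0, 1.1, 0.9, -8.0]})
predict_fn, scored_df, train_log = isolation_forest_learner(toy_df, features=["x1", "x2"])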
weight_column : str, optional
    The name of the column with scores to weight the data.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
from catboost import Pool, CatBoostClassifier
import catboost
weights = df[weight_column].values if weight_column else None
params = extra_params if extra_params else {}
params = assoc(params, "eta", learning_rate)
params = params if "objective" in params else assoc(params, "objective", 'Logloss')
features = features if not encode_extra_cols else expand_features_encoded(df, features)
cat_features = params["cat_features"] if "cat_features" in params else None
dtrain = Pool(df[features].values, df[target].values, weight=weights,
              feature_names=list(map(str, features)), cat_features=cat_features)
cat_boost_classifier = CatBoostClassifier(iterations=num_estimators, **params)
cbr = cat_boost_classifier.fit(dtrain, verbose=0)
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
    dtest = Pool(new_df[features].values, feature_names=list(map(str, features)),
                 cat_features=cat_features)
    pred = cbr.predict_proba(dtest)[:, 1]
    if params["objective"] == "MultiClass":
weight_column : str, optional
    The name of the column with scores to weight the data.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
import xgboost as xgb
weights = df[weight_column].values if weight_column else None
params = extra_params if extra_params else {}
params = assoc(params, "eta", learning_rate)
params = params if "objective" in params else assoc(params, "objective", 'reg:linear')
features = features if not encode_extra_cols else expand_features_encoded(df, features)
dtrain = xgb.DMatrix(df[features].values, label=df[target].values, weight=weights,
                     feature_names=list(map(str, features)))
bst = xgb.train(params, dtrain, num_estimators)
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
    dtest = xgb.DMatrix(new_df[features].values, feature_names=list(map(str, features)))
    col_dict = {prediction_column: bst.predict(dtest)}
    if apply_shap:
        import shap
        explainer = shap.TreeExplainer(bst)
        shap_values = list(explainer.shap_values(new_df[features]))
        shap_expected_value = explainer.expected_value
        shap_output = {"shap_values": shap_values,
weight_column : str, optional
    The name of the column with scores to weight the data.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
import lightgbm as lgbm
params = extra_params if extra_params else {}
params = assoc(params, "eta", learning_rate)
params = params if "objective" in params else assoc(params, "objective", 'regression')
weights = df[weight_column].values if weight_column else None
features = features if not encode_extra_cols else expand_features_encoded(df, features)
dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)),
                      weight=weights, silent=True)
bst = lgbm.train(params, dtrain, num_estimators)
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
    col_dict = {prediction_column: bst.predict(new_df[features].values)}
    if apply_shap:
        import shap
        explainer = shap.TreeExplainer(bst)
        shap_values = list(explainer.shap_values(new_df[features]))
        shap_expected_value = explainer.expected_value
        shap_output = {"shap_values": shap_values,
weight_column : str, optional
    The name of the column with scores to weight the data.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
import lightgbm as lgbm
params = extra_params if extra_params else {}
params = assoc(params, "eta", learning_rate)
params = params if "objective" in params else assoc(params, "objective", 'binary')
weights = df[weight_column].values if weight_column else None
features = features if not encode_extra_cols else expand_features_encoded(df, features)
dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)),
                      weight=weights, silent=True)
bst = lgbm.train(params, dtrain, num_estimators)
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
    if params["objective"] == "multiclass":
        col_dict = {prediction_column + "_" + str(key): value
                    for (key, value) in enumerate(bst.predict(new_df[features].values).T)}
    else:
        col_dict = {prediction_column: bst.predict(new_df[features].values)}
    if apply_shap:
        import shap
        explainer = shap.TreeExplainer(bst)
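# --- Usage sketch (not part of the library source) ---------------------------
# Invented example; setting objective "multiclass" in extra_params would make
# the prediction function emit one probability column per class instead.
import pandas as pd
from fklearn.training.classification import lgbm_classification_learner

toy_df = pd.DataFrame({"x1": [0.1, 0.9, 0.2, 0.8], "y": [0, 1, 0, 1]})
predict_fn, scored_df, train_log = lgbm_classification_learner(
    toy_df, features=["x1"], target="y", learning_rate=0.1, num_estimators=10)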
weight_column : str, optional
    The name of the column with scores to weight the data.
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
import xgboost as xgb
params = extra_params if extra_params else {}
params = assoc(params, "eta", learning_rate)
params = params if "objective" in params else assoc(params, "objective", 'binary:logistic')
weights = df[weight_column].values if weight_column else None
features = features if not encode_extra_cols else expand_features_encoded(df, features)
dtrain = xgb.DMatrix(df[features].values, label=df[target].values, weight=weights,
                     feature_names=list(map(str, features)))
bst = xgb.train(params, dtrain, num_estimators)
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
    dtest = xgb.DMatrix(new_df[features].values, feature_names=list(map(str, features)))
    pred = bst.predict(dtest)
    if params["objective"] == "multi:softprob":
        col_dict = {prediction_column + "_" + str(key): value
                    for (key, value) in enumerate(pred.T)}
        col_dict.update({prediction_column: pred.argmax(axis=1)})
    else:
        col_dict = {prediction_column: pred}
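# --- Usage sketch (not part of the library source) ---------------------------
# Invented example; with objective "multi:softprob" (plus a num_class entry
# in extra_params) the prediction function emits per-class probability
# columns and an argmax prediction column, as in the branch above.
import pandas as pd
from fklearn.training.classification import xgb_classification_learner

toy_df = pd.DataFrame({"x1": [0.1, 0.9, 0.2, 0.8], "y": [0, 1, 0, 1]})
predict_fn, scored_df, train_log = xgb_classification_learner(
    toy_df, features=["x1"], target="y", learning_rate=0.1, num_estimators=10)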