    It must contain all columns listed in `columns_to_scale`.

columns_to_scale : list of str
    A list of names of the columns for standard scaling.
"""
scaler = StandardScaler()
scaler.fit(df[columns_to_scale].values)

def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
    # Reuse the scaler fitted on the training data, so new data is scaled
    # with the training mean and standard deviation.
    new_data = scaler.transform(new_data_set[columns_to_scale].values)
    new_cols = pd.DataFrame(data=new_data, columns=columns_to_scale).to_dict('list')
    return new_data_set.assign(**new_cols)

p.__doc__ = learner_pred_fn_docstring("standard_scaler")

log = {'standard_scaler': {
    'standard_scaler': scaler.get_params(),
    'transformed_column': columns_to_scale}}

return p, p(df), log
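
# Standalone sketch of the fit-once / transform-later pattern above, using
# only pandas and scikit-learn; the toy frame and column names ("x", "y")
# are made up for illustration.
import pandas as pd
from sklearn.preprocessing import StandardScaler

_train = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [10.0, 20.0, 30.0]})
_scaler = StandardScaler().fit(_train[["x", "y"]].values)

_new = pd.DataFrame({"x": [4.0, 5.0], "y": [40.0, 50.0]})
_scaled = pd.DataFrame(_scaler.transform(_new[["x", "y"]].values), columns=["x", "y"])
print(_new.assign(**_scaled.to_dict("list")))  # x, y replaced by train-scaled values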
default_params = {"n_jobs": -1, "random_state": 1729}
params = default_params if not params else merge(default_params, params)

features = features if not encode_extra_cols else expand_features_encoded(df, features)

model = IsolationForest()
model.set_params(**params)
model.fit(df[features].values)

def p(new_df: pd.DataFrame) -> pd.DataFrame:
    # decision_function: higher scores for inliers, lower for outliers.
    output_col = {prediction_column: model.decision_function(new_df[features])}
    return new_df.assign(**output_col)

p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")

log = {'isolation_forest_learner': {
    'features': features,
    'parameters': params,
    'prediction_column': prediction_column,
    'package': "sklearn",
    'package_version': sklearn.__version__,
    'training_samples': len(df)}}

return p, p(df), log
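
# Standalone sketch of the anomaly-scoring step above (scikit-learn only,
# toy data): decision_function returns higher scores for inliers and lower,
# often negative, scores for outliers, which is what fills prediction_column.
import pandas as pd
from sklearn.ensemble import IsolationForest

_df = pd.DataFrame({"f1": [0.0, 0.1, 0.2, 0.1, 10.0]})
_model = IsolationForest(n_jobs=-1, random_state=1729).fit(_df[["f1"]].values)
print(_df.assign(anomaly_score=_model.decision_function(_df[["f1"]].values)))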
def categ_target_dict(column: str) -> Dict:
    column_agg = df.groupby(column)[target_column].agg(['count', 'mean'])
    column_target_mean = column_agg['mean']
    column_target_count = column_agg['count']

    # Shrink each category's target mean toward the global target mean,
    # weighted by how many rows the category has.
    smoothed_target_mean = (column_target_count * column_target_mean + smoothing * target_mean) / \
                           (column_target_count + smoothing)

    return smoothed_target_mean.to_dict()

vec = {column: categ_target_dict(column) for column in columns_to_categorize}

def p(new_df: pd.DataFrame) -> pd.DataFrame:
    return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

p.__doc__ = learner_pred_fn_docstring("target_categorizer")

log = {'target_categorizer': {
    'transformed_columns': columns_to_categorize,
    'target_column': target_column,
    'smoothing': smoothing,
    'ignore_unseen': ignore_unseen}
}

if store_mapping:
    log['target_categorizer']['mapping'] = vec

return p, p(df), log
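
# Standalone sketch of the smoothing used in categ_target_dict above (pandas
# only, toy data): each category's target mean is shrunk toward the global
# mean, weighted by the category's row count versus the smoothing constant.
import pandas as pd

_df = pd.DataFrame({"city": ["a", "a", "a", "b"], "target": [1.0, 1.0, 0.0, 1.0]})
_smoothing = 10.0
_target_mean = _df["target"].mean()  # global mean: 0.75
_agg = _df.groupby("city")["target"].agg(["count", "mean"])
_smoothed = ((_agg["count"] * _agg["mean"] + _smoothing * _target_mean)
             / (_agg["count"] + _smoothing))
print(_smoothed.to_dict())  # the rare category "b" is pulled toward 0.75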
    col_dict = {prediction_column: bst.predict(dtest)}

    if apply_shap:
        import shap  # optional dependency, imported lazily
        explainer = shap.TreeExplainer(bst)
        shap_values = list(explainer.shap_values(new_df[features]))
        shap_expected_value = explainer.expected_value
        shap_output = {"shap_values": shap_values,
                       "shap_expected_value": np.repeat(shap_expected_value, len(shap_values))}
        col_dict = merge(col_dict, shap_output)

    return new_df.assign(**col_dict)

p.__doc__ = learner_pred_fn_docstring("xgb_regression_learner", shap=True)

log = {'xgb_regression_learner': {
    'features': features,
    'target': target,
    'prediction_column': prediction_column,
    'package': "xgboost",
    'package_version': xgb.__version__,
    'parameters': assoc(params, "num_estimators", num_estimators),
    'feature_importance': bst.get_score(),
    'training_samples': len(df)},
    'object': bst}

return p, p(df), log
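
# Standalone sketch of the predict step above, using xgboost's native Booster
# API on toy data (num_boost_round and the objective are arbitrary choices):
# Booster.predict takes a DMatrix and returns a NumPy array aligned with the
# input rows, which is then assigned as the prediction column.
import pandas as pd
import xgboost as xgb

_train = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0], "y": [2.0, 4.0, 6.0, 8.0]})
_dtrain = xgb.DMatrix(_train[["f1"]], label=_train["y"])
_bst = xgb.train({"objective": "reg:squarederror"}, _dtrain, num_boost_round=10)

_new = pd.DataFrame({"f1": [2.5]})
print(_new.assign(prediction=_bst.predict(xgb.DMatrix(_new[["f1"]]))))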
if ascending:
    base = 0
    sign = 1
else:
    base = max_range
    sign = -1

values = df[prediction_column]
ecdf = ed.ECDF(values)

def p(new_df: pd.DataFrame) -> pd.DataFrame:
    return new_df.assign(**{ecdf_column: (base + sign * max_range * ecdf(new_df[prediction_column]))})

p.__doc__ = learner_pred_fn_docstring("ecdfer")

log = {'ecdfer': {
    'nobs': len(values),
    'prediction_column': prediction_column,
    'ascending': ascending,
    'transformed_column': [ecdf_column]}}

return p, p(df), log
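
# Standalone sketch of the ECDF rescaling above (statsmodels only, toy
# values): with ascending=True the output is max_range * ECDF(value); with
# ascending=False the base/sign trick flips it to max_range * (1 - ECDF(value)).
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

_values = np.array([1.0, 2.0, 3.0, 4.0])
_ecdf = ECDF(_values)
_max_range = 1000

print(0 + 1 * _max_range * _ecdf(np.array([3.5])))            # ascending: [750.]
print(_max_range + -1 * _max_range * _ecdf(np.array([3.5])))  # descending: [250.]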
    The value to impute unseen categories.

store_mapping : bool (default: False)
    Whether to store the feature value -> integer dictionary in the log
"""
def categ_dict(series: pd.Series) -> Dict:
    categs = series.dropna().unique()
    return dict(map(reversed, enumerate(categs)))  # type: ignore

vec = {column: categ_dict(df[column]) for column in columns_to_categorize}

def p(new_df: pd.DataFrame) -> pd.DataFrame:
    return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

p.__doc__ = learner_pred_fn_docstring("label_categorizer")

log: LearnerLogType = {'label_categorizer': {
    'transformed_column': columns_to_categorize,
    'replace_unseen': replace_unseen}
}

if store_mapping:
    log['label_categorizer']['mapping'] = vec

return p, p(df), log
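
# Standalone sketch of the value -> integer mapping built by categ_dict above
# (pandas only): enumerate yields (index, category) pairs, and reversing each
# pair produces {category: index}, in order of first appearance.
import pandas as pd

_s = pd.Series(["b", "a", None, "b", "c"])
_categs = _s.dropna().unique()
print(dict(map(reversed, enumerate(_categs))))  # {'b': 0, 'a': 1, 'c': 2}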
n_rows = df.shape[0]

groups = [[f] for f in columns_to_inject] if columns_to_inject is not None else groups

null_cols = {}  # type: ignore
for seed_i, group in enumerate(groups):  # type: ignore
    # A fresh seed per group, so each group gets an independent null mask.
    np.random.seed(seed + seed_i)
    replace_mask = np.random.binomial(1, 1 - proportion, n_rows).astype(bool)
    null_cols = merge(null_cols, {feature: df[feature].where(replace_mask) for feature in group})

null_data = df.assign(**null_cols)

def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
    # Null injection is a training-time augmentation only; prediction is a no-op.
    return new_data_set

p.__doc__ = learner_pred_fn_docstring("null_injector")

log = {'null_injector': {
    "columns_to_inject": columns_to_inject,
    "proportion": proportion,
    "groups": groups
}}

return p, null_data, log
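
# Standalone sketch of the masking step above (NumPy + pandas, toy column):
# a Bernoulli draw with success probability 1 - proportion marks the rows to
# keep, and Series.where turns the rest into NaN, so on average `proportion`
# of the rows become null.
import numpy as np
import pandas as pd

_df = pd.DataFrame({"f1": np.arange(10.0)})
np.random.seed(42)
_keep = np.random.binomial(1, 1 - 0.3, len(_df)).astype(bool)
print(_df["f1"].where(_keep))  # rows where _keep is False are NaN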
            shap_expected_value_multiclass = {
                f"shap_expected_value_{class_index}": np.repeat(expected_value, len(class_shap_values))
                for (class_index, (expected_value, class_shap_values))
                in enumerate(zip(shap_expected_value, shap_values))}
            shap_output = merge(shap_values_multiclass, shap_expected_value_multiclass)
        else:
            # Binary case: a single set of SHAP values and one expected
            # value, repeated so it aligns with the prediction rows.
            shap_values = list(shap_values)
            shap_output = {"shap_values": shap_values,
                           "shap_expected_value": np.repeat(shap_expected_value, len(shap_values))}

        col_dict = merge(col_dict, shap_output)

    return new_df.assign(**col_dict)

p.__doc__ = learner_pred_fn_docstring("xgb_classification_learner", shap=True)

log = {'xgb_classification_learner': {
    'features': features,
    'target': target,
    'prediction_column': prediction_column,
    'package': "xgboost",
    'package_version': xgb.__version__,
    'parameters': assoc(params, "num_estimators", num_estimators),
    'feature_importance': bst.get_score(),
    'training_samples': len(df)},
    'object': bst}

return p, p(df), log
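
# Standalone sketch of the multiclass SHAP bookkeeping above, with NumPy
# arrays standing in for real explainer output (shapes and base values are
# made up): one shap_values_{i} / shap_expected_value_{i} pair per class.
import numpy as np

_shap_values = [np.zeros((3, 2)), np.ones((3, 2))]  # per-class (rows x features)
_shap_expected_value = [0.1, 0.9]                   # per-class base values

_cols = {f"shap_values_{i}": list(v) for i, v in enumerate(_shap_values)}
_cols.update({f"shap_expected_value_{i}": np.repeat(ev, len(v))
              for i, (ev, v) in enumerate(zip(_shap_expected_value, _shap_values))})
print(sorted(_cols))  # shap_expected_value_0/1 and shap_values_0/1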
    A list of categorical column names.

replace_unseen : int
    The value to impute unseen categories.

store_mapping : bool (default: False)
    Whether to store the feature value -> integer dictionary in the log
"""
categ_getter = lambda col: df[col].value_counts().to_dict()
vec = {column: categ_getter(column) for column in columns_to_categorize}

def p(new_df: pd.DataFrame) -> pd.DataFrame:
    return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

p.__doc__ = learner_pred_fn_docstring("count_categorizer")

log: LearnerLogType = {'count_categorizer': {
    'transformed_column': columns_to_categorize,
    'replace_unseen': replace_unseen}
}

if store_mapping:
    log['count_categorizer']['mapping'] = vec

return p, p(df), log
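
# Standalone sketch of the count encoding above (pandas only, toy column):
# each category is mapped to its frequency in the training data.
import pandas as pd

_s = pd.Series(["a", "b", "a", "a", "c"])
print(_s.value_counts().to_dict())  # {'a': 3, 'b': 1, 'c': 1}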
prediction_min : float
    The floor for the prediction.

prediction_max : float
    The cap for the prediction.

prediction_column : str
    The name of the column in `df` to cap and floor.
"""
def p(new_df: pd.DataFrame) -> pd.DataFrame:
    return new_df.assign(
        **{prediction_column: new_df[prediction_column].clip(lower=prediction_min, upper=prediction_max)}
    )

p.__doc__ = learner_pred_fn_docstring("prediction_ranger")

log = {'prediction_ranger': {
    'prediction_min': prediction_min,
    'prediction_max': prediction_max,
    'transformed_column': [prediction_column]}}

return p, p(df), log
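
# Standalone sketch of the capping step above (pandas only, toy predictions):
# clip bounds every value between the floor and the cap.
import pandas as pd

_df = pd.DataFrame({"prediction": [-5.0, 0.5, 5.0]})
print(_df.assign(prediction=_df["prediction"].clip(lower=0.0, upper=1.0)))
# -5.0 -> 0.0 and 5.0 -> 1.0, while 0.5 passes through unchanged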