Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_catboost():
    """Smoke-test SHAP additivity for a CatBoost regressor on the Boston dataset.

    Trains a small ``CatBoostRegressor`` with one categorical feature, then
    asserts the SHAP additivity property: per-row SHAP values plus the
    explainer's expected value must reproduce the model's predictions.

    Skips (with a message) when ``catboost`` or ``shap`` is not installed.
    """
    try:
        # Guard both optional deps; the original bare `except:` swallowed every
        # error, and the unguarded `import shap` below defeated the skip intent.
        import catboost
        import shap
    except ImportError:
        print("Skipping test_catboost!")
        return
    import numpy as np  # `np` was used below but never imported in this scope

    # Train a CatBoost model with "RAD" marked as a categorical feature.
    X, y = shap.datasets.boston()
    # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # `int` is the documented replacement.
    X["RAD"] = X["RAD"].astype(int)
    model = catboost.CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # Explain the model's predictions using SHAP values.
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)
    predicted = model.predict(X)
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"
def test_catboost():
    """Smoke-test SHAP additivity for a CatBoost regressor on the Boston dataset.

    Trains a small ``CatBoostRegressor`` with one categorical feature, then
    asserts the SHAP additivity property: per-row SHAP values plus the
    explainer's expected value must reproduce the model's predictions.

    Skips (with a message) when ``catboost`` or ``shap`` is not installed.
    """
    try:
        # Guard both optional deps; the original bare `except:` swallowed every
        # error, and the unguarded `import shap` below defeated the skip intent.
        import catboost
        import shap
    except ImportError:
        print("Skipping test_catboost!")
        return
    import numpy as np  # `np` was used below but never imported in this scope

    # Train a CatBoost model with "RAD" marked as a categorical feature.
    X, y = shap.datasets.boston()
    # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # `int` is the documented replacement.
    X["RAD"] = X["RAD"].astype(int)
    model = catboost.CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # Explain the model's predictions using SHAP values.
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)
    predicted = model.predict(X)
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"
def fit(self, X_train, y_train):
    """Pick an iteration count via cross-validation, then fit the final model.

    Runs CatBoost ``cv`` on the training data, takes the round at which the
    mean test metric peaks (scaled by 1.5x as a heuristic for training on the
    full dataset), stores it in ``self.params`` and fits a fresh
    ``CatBoostClassifier`` as ``self.model``.
    """
    cv_results = cv(
        Pool(X_train, y_train),
        self.params
    )
    # Index of the best mean test score, inflated 1.5x (+1 to stay >= 1).
    metric_col = 'test-{}-mean'.format(self.metric)
    best_rounds = int(cv_results[metric_col].idxmax() * 1.5) + 1
    print('Best Iteration: {}'.format(best_rounds))

    self.params['iterations'] = best_rounds
    self.model = CatBoostClassifier(**self.params)
    self.model.fit(
        X_train, y_train
    )
encode_extra_cols : bool (default: True)
    If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""
# NOTE(review): this is the tail of a docstring plus the body of an
# fklearn-style CatBoost classification learner; the enclosing `def` line
# (and names like df, target, features, weight_column, learning_rate,
# num_estimators, extra_params, prediction_column) is outside this view.
from catboost import Pool, CatBoostClassifier
import catboost
# Optional per-row sample weights, taken from `weight_column` when given.
weights = df[weight_column].values if weight_column else None
params = extra_params if extra_params else {}
# `assoc` presumably returns a copy with the key set (toolz-style) — TODO confirm.
params = assoc(params, "eta", learning_rate)
# Default the objective to binary 'Logloss' unless the caller set one.
params = params if "objective" in params else assoc(params, "objective", 'Logloss')
features = features if not encode_extra_cols else expand_features_encoded(df, features)
# Categorical feature spec is passed through `params` rather than a dedicated arg.
cat_features = params["cat_features"] if "cat_features" in params else None
dtrain = Pool(df[features].values, df[target].values, weight=weights,
feature_names=list(map(str, features)), cat_features=cat_features)
cat_boost_classifier = CatBoostClassifier(iterations=num_estimators, **params)
cbr = cat_boost_classifier.fit(dtrain, verbose=0)
# Closure returned by the learner: scores `new_df` with the trained model.
# NOTE(review): the body below is truncated in this view (no `return`).
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
dtest = Pool(new_df[features].values, feature_names=list(map(str, features)),
cat_features=cat_features)
# Binary case: probability of the positive class only.
pred = cbr.predict_proba(dtest)[:, 1]
if params["objective"] == "MultiClass":
# MultiClass: keep the full probability matrix, one column per class.
pred = cbr.predict_proba(dtest)
col_dict = {prediction_column + "_" + str(key): value
for (key, value) in enumerate(pred.T)}
col_dict.update({prediction_column: pred.argmax(axis=1)})
def fit_early_stopping(self, X_train, y_train, X_eval, y_eval):
    """Fit with CatBoost's iteration-based overfitting detector as early stopping.

    Temporarily configures the 'iter' overfitting detector, fits keeping the
    best model, records the resulting tree count in ``self.num_rounds`` and
    ``self.params['iterations']``, then strips the detector settings again.
    """
    pool_train = Pool(X_train, label=y_train.astype(float))
    pool_eval = Pool(X_eval, label=y_eval.astype(float))

    # Enable the iteration-based overfitting detector for this fit only.
    self.params.update({'iterations': MAX_ROUNDS, 'od_type': 'iter', 'od_wait': PATIENCE})

    self.model.fit(pool_train, eval_set=pool_eval, use_best_model=True)

    # Freeze the discovered round count and drop the detector settings.
    self.num_rounds = self.model.tree_count_
    self.params['iterations'] = self.num_rounds
    for key in ('od_type', 'od_wait'):
        self.params.pop(key)
def fit_early_stopping(self, X_train, y_train, X_eval, y_eval):
    """Early-stopped fit using CatBoost's 'iter' overfitting detector.

    Sets detector params, fits with ``use_best_model=True`` against the eval
    pool, stores the final tree count as the new iteration budget, and removes
    the detector params afterwards so later plain fits are unaffected.
    """
    training_data = Pool(X_train, label=y_train.astype(float))
    validation_data = Pool(X_eval, label=y_eval.astype(float))

    # Switch on the iteration-based overfitting detector.
    self.params['iterations'] = MAX_ROUNDS
    self.params['od_type'] = 'iter'
    self.params['od_wait'] = PATIENCE

    self.model.fit(training_data, eval_set=validation_data, use_best_model=True)

    # Persist how many rounds the best model actually used.
    self.num_rounds = self.model.tree_count_
    self.params['iterations'] = self.num_rounds

    # Restore the params dict to its non-early-stopping shape.
    self.params.pop('od_type')
    self.params.pop('od_wait')
def test_adult():
    """Train a tiny CatBoost classifier on the adult dataset and predict.

    Loads the small train/test splits plus the column description via
    ``data_file``, fits 5 Logloss iterations with the test pool as eval set,
    and runs prediction on the test pool.
    """
    train_path = data_file('adult', 'train_small')
    test_path = data_file('adult', 'test_small')
    cd_path = data_file('adult', 'train.cd')

    pool_train = Pool(data=train_path, column_description=cd_path)
    pool_test = Pool(data=test_path, column_description=cd_path)

    clf = CatBoostClassifier(iterations=5, loss_function='Logloss')
    clf.fit(pool_train, eval_set=pool_test)
    predictions = clf.predict(pool_test)
def _create_pool(fold_file, thread_count=-1):
    """Build a catboost ``Pool`` from a fold file.

    Reads the data path, column description and field separator from
    ``fold_file``; ``thread_count=-1`` delegates thread selection to CatBoost.
    """
    from catboost import Pool
    return Pool(
        fold_file.path(),
        column_description=fold_file.column_description(),
        delimiter=fold_file.get_separator(),
        thread_count=thread_count,
    )
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
dtest = Pool(new_df[features].values, feature_names=list(map(str, features)),
cat_features=cat_features)
pred = cbr.predict_proba(dtest)[:, 1]
if params["objective"] == "MultiClass":
pred = cbr.predict_proba(dtest)
col_dict = {prediction_column + "_" + str(key): value
for (key, value) in enumerate(pred.T)}
col_dict.update({prediction_column: pred.argmax(axis=1)})
else:
col_dict = {prediction_column: pred}
if apply_shap:
import shap
explainer = shap.TreeExplainer(cbr)
shap_values = explainer.shap_values(dtest)
shap_expected_value = explainer.expected_value