from catboost import CatBoostRegressor

def catboost_regressor(pandas_data, catboost_params):
    # Fit a regressor on a two-row frame; the hard-coded [1, 0] targets make this a minimal smoke test.
    return CatBoostRegressor(**catboost_params).fit(pandas_data, [1, 0])
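# A minimal usage sketch (hypothetical data; the two-row DataFrame matches the
# hard-coded [1, 0] targets above -- an assumption, not from the original source):
import pandas as pd

df = pd.DataFrame({"f0": [0.1, 0.9], "f1": [3.0, 1.0]})
reg = catboost_regressor(df, {"iterations": 10, "verbose": False})
print(reg.predict(df))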
import numpy as np

def test_catboost():
    try:
        import catboost
    except ImportError:
        print("Skipping test_catboost!")
        return
    import shap

    # train a catboost model (note: shap.datasets.boston() is no longer available
    # in recent shap releases; substitute another dataset there if needed)
    X, y = shap.datasets.boston()
    X["RAD"] = X["RAD"].astype(int)  # np.int was removed in NumPy 1.24
    model = catboost.CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # explain the model's predictions using SHAP values
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)
    predicted = model.predict(X)

    # SHAP values plus the expected value should reconstruct the raw model output
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"
def CatBoost_First(self, data, catsign, depth=8, iterations=80000):
    # `cb` is catboost, imported elsewhere in the class as `import catboost as cb`
    model = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=0.8, loss_function='RMSE')
    model.fit(data['train'][:, :-1], data['train'][:, -1], cat_features=catsign)
    # Note: validation-set results and prediction-set results are stored differently.
    # Predictions on the training set
    xul = model.predict(data['train'][:, :-1])
    # Predictions on the validation set
    yanre = model.predict(data['test'][:, :-1])
    # Predictions on the prediction set
    prer = model.predict(data['predict'][:, :-1])
    # Store the results
    self.yanzhneg_pr.append(yanre)
    self.predi.append(prer)
    # Compute the training, validation, and prediction errors separately;
    # after each fold, the error on all three sets must be computed
    xx = self.RMSE(xul, data['train'][:, -1])
    yy = self.RMSE(yanre, data['test'][:, -1])
    pp = self.RMSE(prer, data['predict'][:, -1])
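# `self.RMSE` above is not shown in the snippet; a plausible helper would look
# like this (a hypothetical implementation, assuming array-like inputs):
import numpy as np

def RMSE(self, predictions, targets):
    # Root-mean-square error between predicted and true values
    return float(np.sqrt(np.mean((np.asarray(predictions) - np.asarray(targets)) ** 2)))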
if keras_imported:
    if isinstance(model, KerasRegressor):
        return 'DeepLearningRegressor'
    if isinstance(model, KerasClassifier):
        return 'DeepLearningClassifier'
if lgb_installed:
    if isinstance(model, LGBMClassifier):
        return 'LGBMClassifier'
    if isinstance(model, LGBMRegressor):
        return 'LGBMRegressor'
if catboost_installed:
    if isinstance(model, CatBoostClassifier):
        return 'CatBoostClassifier'
    if isinstance(model, CatBoostRegressor):
        return 'CatBoostRegressor'
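# The keras_imported / lgb_installed / catboost_installed flags are set elsewhere
# in the project; a typical guarded-import pattern that would produce them
# (a sketch, not the original project's code):
try:
    from catboost import CatBoostClassifier, CatBoostRegressor
    catboost_installed = True
except ImportError:
    catboost_installed = False

try:
    from lightgbm import LGBMClassifier, LGBMRegressor
    lgb_installed = True
except ImportError:
    lgb_installed = False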
else:
    valid_X_shape = None

X, eval_set = self.process_cats(X, eval_set, orig_cols)

# modify self.params_base['gpu_id'] based upon the actually-available GPUs, given the training and valid shapes
self.acquire_gpus_function(train_shape=X.shape, valid_shape=valid_X_shape)

params = copy.deepcopy(self.params)  # keep separate, since they can then be pulled from the lightgbm params
params = self.transcribe_and_filter_params(params, eval_set is not None)

if logger is not None:
    loggerdata(logger, "CatBoost parameters: params_base : %s params: %s catboost_params: %s" % (
        str(self.params_base), str(self.params), str(params)))

if self.num_classes == 1:
    model = CatBoostRegressor(**params)
else:
    model = CatBoostClassifier(**params)

# Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
if self.num_classes == 1:
    # assume the loss is not MAE, which would use the median instead of the mean
    # baseline = [np.mean(y)] * len(y)
    baseline = None
else:
    baseline = None

kargs = dict(X=X, y=y,
             sample_weight=sample_weight,
             baseline=baseline,
             eval_set=eval_set)
pickle_path = None
if config.debug_daimodel_level >= 2:
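# The snippet above is truncated before the training call; a minimal standalone
# sketch of the same regressor-vs-classifier fit pattern (a hypothetical wrapper,
# not the original project's code):
from catboost import CatBoostClassifier, CatBoostRegressor

def fit_catboost(X, y, num_classes, params, sample_weight=None, eval_set=None):
    # Pick the estimator the same way the snippet does, then fit with the assembled kwargs
    model = CatBoostRegressor(**params) if num_classes == 1 else CatBoostClassifier(**params)
    kargs = dict(X=X, y=y, sample_weight=sample_weight, baseline=None, eval_set=eval_set)
    model.fit(**kargs)
    return model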
def predict(self, X, **kwargs):
    model, features, importances, iterations = self.get_model_properties()
    if not self._save_by_pickle:
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
        if self.num_classes >= 2:
            from_file = CatBoostClassifier()
        else:
            from_file = CatBoostRegressor()
        # `model` holds the raw serialized bytes; write them out and reload via catboost
        with open(self.model_path, mode='wb') as f:
            f.write(model)
        model = from_file.load_model(self.model_path)

    # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # datatable -> lightgbm internally leaks buffers, so convert here;
        # assume predict runs after pipeline collection or in a subprocess, so it needs no protection
        X = X.to_numpy()  # don't assign back to X so we don't damage it during predict
        X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)

    X, eval_set = self.process_cats(X, None, self.feature_names_fitted)
    pred_contribs = kwargs.get('pred_contribs', None)
    output_margin = kwargs.get('output_margin', None)
    fast_approx = kwargs.pop('fast_approx', False)
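# For reference, the SHAP-style contributions implied by `pred_contribs` can be
# computed with CatBoost's own API (a sketch, assuming a fitted `model` and a
# catboost.Pool named `pool`):
from catboost import EFstrType

contribs = model.get_feature_importance(data=pool, type=EFstrType.ShapValues)
# contribs has shape (n_rows, n_features + 1); the last column is the bias (expected value)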
#################### 1. Perform Experiments ####################
# *Note: If this is your first HyperparameterHunter example, the CatBoost classification example may be a better starting point.*
# In this Experiment, we're also going to use `model_extra_params` to provide arguments to
# ... `CatBoostRegressor`'s `fit` method, just like we would if we weren't using HyperparameterHunter.
# We'll be using the `verbose` argument to print evaluations of our `CatBoostRegressor` every 50 iterations,
# ... and we'll also be using the dataset sentinels offered by `Environment`. You can read more about
# ... the exciting things you can do with the `Environment` sentinels in the documentation and in the
# ... example dedicated to them. For now, though, we'll be using them to provide each fold's
# ... `env.validation_input` and `env.validation_target` to `CatBoostRegressor.fit` via its `eval_set` argument.
# You could also easily add `CatBoostRegressor.fit`'s `early_stopping_rounds` argument to `model_extra_params["fit"]`
# ... to use early stopping (see the sketch after this snippet), but doing so here with only `iterations=100` doesn't make much sense.
experiment = CVExperiment(
    model_initializer=CatBoostRegressor,
    model_init_params=dict(
        iterations=100,
        learning_rate=0.05,
        depth=5,
        bootstrap_type="Bayesian",
        save_snapshot=False,
        allow_writing_files=False,
    ),
    model_extra_params=dict(
        fit=dict(verbose=50, eval_set=[(env.validation_input, env.validation_target)])
    ),
)
# Notice above that CatBoost printed scores for our `eval_set` every 50 iterations, just like we said
# ... in `model_extra_params["fit"]`; although it made our results rather difficult to read, so
# ... we'll switch back to `verbose=False` during optimization.

#################### 2. Hyperparameter Optimization ####################
# Notice below that `optimizer` still recognizes the results of `experiment` as valid learning material, even
# ... though their `verbose` values differ. This is because it knows that `verbose` has no effect on actual results.
optimizer = DummyOptPro(iterations=10, random_state=777)
optimizer.forge_experiment(
    model_initializer=CatBoostRegressor,
    model_init_params=dict(
        iterations=100,
        learning_rate=Real(0.001, 0.2),
        depth=Integer(3, 7),
        bootstrap_type=Categorical(["Bayesian", "Bernoulli"]),
        save_snapshot=False,
        allow_writing_files=False,
    ),
    model_extra_params=dict(
        fit=dict(verbose=False, eval_set=[(env.validation_input, env.validation_target)])
    ),
)
optimizer.go()
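# A sketch of the early-stopping variant mentioned above (assuming a larger
# iteration budget, where early stopping is actually useful):
model_extra_params = dict(
    fit=dict(
        verbose=False,
        early_stopping_rounds=10,  # stop if the eval_set score hasn't improved in 10 rounds
        eval_set=[(env.validation_input, env.validation_target)],
    )
)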
def must_process(self, obj) -> bool:
    """
    Returns `True` if `obj` is a `catboost.CatBoostClassifier` or `catboost.CatBoostRegressor`
    :param obj: object to check
    :return: `True` or `False`
    """
    return isinstance(obj, (CatBoostClassifier, CatBoostRegressor))
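# Usage sketch (`processor` is a hypothetical instance of the class defining
# must_process; assumes catboost is installed):
from catboost import CatBoostRegressor

assert processor.must_process(CatBoostRegressor()) is True
assert processor.must_process("not a catboost model") is False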