How to use the catboost.Pool function in catboost

To help you get started, we’ve selected a few catboost examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github slundberg / shap / tests / explainers / test_tree.py View on Github external
def test_catboost():
    """Local-accuracy check: CatBoost TreeExplainer SHAP values must sum
    to the model's raw predictions for every sample.

    Skips silently (with a message) when catboost is not installed.
    """
    try:
        import catboost
    except ImportError:  # only skip when the package is genuinely missing
        print("Skipping test_catboost!")
        return
    import shap

    # train catboost model
    X, y = shap.datasets.boston()
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int produces the same platform-default integer dtype
    X["RAD"] = X["RAD"].astype(int)
    model = catboost.CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # explain the model's predictions using SHAP values
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)

    predicted = model.predict(X)

    # per-sample contributions + base value must reconstruct the prediction
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"
github slundberg / shap / tests / explainers / test_tree.py View on Github external
def test_catboost():
    """Verify SHAP local accuracy for a CatBoost regressor: the SHAP values
    plus the expected value must equal the model output per sample.

    Returns early (printing a notice) if catboost is unavailable.
    """
    try:
        import catboost
    except ImportError:  # narrow except: don't hide unrelated import errors
        print("Skipping test_catboost!")
        return
    import shap

    # train catboost model
    X, y = shap.datasets.boston()
    # np.int was removed in NumPy 1.24; builtin int is the equivalent dtype
    X["RAD"] = X["RAD"].astype(int)
    model = catboost.CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # explain the model's predictions using SHAP values
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)

    predicted = model.predict(X)

    # additivity property of SHAP: contributions + base value == prediction
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"
github ybabakhin / kaggle-skeleton / models_zoo.py View on Github external
def fit(self, X_train, y_train):
    """Cross-validate to choose an iteration count, then train the final model.

    The best CV round (argmax of the test-metric mean) is inflated by 1.5x
    before retraining on the full training set.
    """
    cv_results = cv(Pool(X_train, y_train), self.params)

    metric_column = 'test-{}-mean'.format(self.metric)
    best_rounds = int(cv_results[metric_column].idxmax() * 1.5) + 1
    print('Best Iteration: {}'.format(best_rounds))

    # retrain on all of the data with the tuned iteration budget
    self.params['iterations'] = best_rounds
    self.model = CatBoostClassifier(**self.params)
    self.model.fit(X_train, y_train)
github nubank / fklearn / src / fklearn / training / classification.py View on Github external
encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with the name pattern `fklearn_feat__col==val` as feature columns.
    """
    from catboost import Pool, CatBoostClassifier
    import catboost

    weights = df[weight_column].values if weight_column else None
    params = extra_params if extra_params else {}
    params = assoc(params, "eta", learning_rate)
    params = params if "objective" in params else assoc(params, "objective", 'Logloss')

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

    cat_features = params["cat_features"] if "cat_features" in params else None

    dtrain = Pool(df[features].values, df[target].values, weight=weights,
                  feature_names=list(map(str, features)), cat_features=cat_features)

    cat_boost_classifier = CatBoostClassifier(iterations=num_estimators, **params)
    cbr = cat_boost_classifier.fit(dtrain, verbose=0)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

        dtest = Pool(new_df[features].values, feature_names=list(map(str, features)),
                     cat_features=cat_features)

        pred = cbr.predict_proba(dtest)[:, 1]
        if params["objective"] == "MultiClass":
            pred = cbr.predict_proba(dtest)
            col_dict = {prediction_column + "_" + str(key): value
                        for (key, value) in enumerate(pred.T)}
            col_dict.update({prediction_column: pred.argmax(axis=1)})
github pierre-chaville / automlk / automlk / models.py View on Github external
def fit_early_stopping(self, X_train, y_train, X_eval, y_eval):
    """Fit a CatBoost model using its 'iter' overfitting detector as early
    stopping, then record the tree count actually kept.
    """
    # CatBoost Pools want float labels here
    train_pool = Pool(X_train, label=y_train.astype(float))
    eval_pool = Pool(X_eval, label=y_eval.astype(float))

    # enable the overfitting detector for this fit only
    self.params.update(iterations=MAX_ROUNDS, od_type='iter', od_wait=PATIENCE)

    self.model.fit(train_pool, eval_set=eval_pool, use_best_model=True)

    # persist the surviving tree count, then strip the detector settings
    # so subsequent fits are unaffected
    self.num_rounds = self.model.tree_count_
    self.params['iterations'] = self.num_rounds
    for key in ('od_type', 'od_wait'):
        self.params.pop(key)
github pierre-chaville / automlk / automlk / models.py View on Github external
def fit_early_stopping(self, X_train, y_train, X_eval, y_eval):
    """Early-stopped CatBoost fit via the iteration-based overfitting detector."""
    pools = {
        'train': Pool(X_train, label=y_train.astype(float)),
        'eval': Pool(X_eval, label=y_eval.astype(float)),
    }

    # detector config: cap rounds, stop after PATIENCE rounds w/o improvement
    detector = {'iterations': MAX_ROUNDS, 'od_type': 'iter', 'od_wait': PATIENCE}
    self.params.update(detector)

    self.model.fit(pools['train'], eval_set=pools['eval'], use_best_model=True)

    # keep only the rounds the best model actually used
    self.num_rounds = self.model.tree_count_
    self.params['iterations'] = self.num_rounds
    del self.params['od_type']
    del self.params['od_wait']
github catboost / catboost / catboost / python-package / ut / medium / run_catboost.py View on Github external
def test_adult():
    """Smoke-test: train a 5-iteration Logloss classifier on the small
    'adult' sample and predict on its test split.
    """
    cd = data_file('adult', 'train.cd')
    learn_pool, test_pool = (
        Pool(data=data_file('adult', part), column_description=cd)
        for part in ('train_small', 'test_small')
    )

    clf = CatBoostClassifier(iterations=5, loss_function='Logloss')
    clf.fit(learn_pool, eval_set=test_pool)

    predictions = clf.predict(test_pool)
github catboost / catboost / catboost / python-package / catboost / eval / _fold_models_handler.py View on Github external
def _create_pool(fold_file, thread_count=-1):
    """Build a catboost Pool from a fold file's path, column description,
    and separator, using the given thread count (-1 = all cores per catboost).
    """
    from catboost import Pool
    return Pool(
        fold_file.path(),
        column_description=fold_file.column_description(),
        delimiter=fold_file.get_separator(),
        thread_count=thread_count,
    )
github nubank / fklearn / src / fklearn / training / classification.py View on Github external
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

        dtest = Pool(new_df[features].values, feature_names=list(map(str, features)),
                     cat_features=cat_features)

        pred = cbr.predict_proba(dtest)[:, 1]
        if params["objective"] == "MultiClass":
            pred = cbr.predict_proba(dtest)
            col_dict = {prediction_column + "_" + str(key): value
                        for (key, value) in enumerate(pred.T)}
            col_dict.update({prediction_column: pred.argmax(axis=1)})
        else:
            col_dict = {prediction_column: pred}

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(cbr)
            shap_values = explainer.shap_values(dtest)
            shap_expected_value = explainer.expected_value