# Truncated snippet: base_dir, the read_* helpers, and the ensemble classes come
# from the enclosing project. Targets arrive one-hot and are collapsed to class
# indices with argmax; the opening call is reconstructed from the read below.
train_documents, train_targets = read_train_documents_and_one_hot_targets(
    base_dir + 'train_data_catboost_format.tsv'
)
train_targets = np.argmax(train_targets, axis=1)
test_documents, test_targets = read_train_documents_and_one_hot_targets(
    base_dir + 'test_data_catboost_format.tsv'
)
train_dir = base_dir + 'ut_tmp/'
if not isdir(train_dir):
    mkdir(train_dir)
cbc_params = read_json_params(base_dir + 'catboost_params.json')
cbc_params['leaf_estimation_method'] = method
cbc_params['random_seed'] = 10
cbc_params['train_dir'] = train_dir
cbc = CatBoostClassifier(**cbc_params)
cbc.fit(train_documents, train_targets)
cbc.save_model(train_dir + 'model.bin', format='cbm')
export_catboost_to_json(train_dir + 'model.bin', train_dir + 'model.json')
full_model = CBOneStepLeafRefitEnsemble(train_dir + 'model.json', train_documents, train_targets,
                                        learning_rate=0.2, loss_function=BinaryCrossEntropyLoss(),
                                        leaf_method=method,
                                        update_set='AllPoints')
assert np.allclose(full_model(train_documents), cbc.predict(train_documents, prediction_type='RawFormulaVal'),
                   atol=1e-5), \
    (full_model(train_documents), cbc.predict(train_documents, prediction_type='RawFormulaVal'))
assert np.allclose(full_model(test_documents), cbc.predict(test_documents, prediction_type='RawFormulaVal'),
                   atol=1e-5)
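# A minimal sketch of the same model round trip using only the public CatBoost
# API: save_model() can emit JSON directly via format='json', which would avoid
# the separate export_catboost_to_json step above. Names and params here are
# illustrative, not taken from the snippet.
sketch_clf = CatBoostClassifier(iterations=10, depth=3, random_seed=10, verbose=False)
sketch_clf.fit(train_documents, train_targets)
sketch_clf.save_model(train_dir + 'model_direct.json', format='json')  # trees + leaf values as JSON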
# Leaf-influence variant of the setup above (another truncated snippet): train a
# small CatBoost model, export it to JSON, then compare leave-one-out refits
# against a TensorFlow reference. The opening read of the training set is
# reconstructed from the parallel pattern above.
train_documents, train_targets = read_train_documents_and_one_hot_targets(
    base_dir + 'train_data_catboost_format.tsv'
)
train_targets = np.argmax(train_targets, axis=1)
test_documents, test_targets = read_train_documents_and_one_hot_targets(
    base_dir + 'test_data_catboost_format.tsv'
)
test_targets = np.argmax(test_targets, axis=1)
train_dir = base_dir + 'ut_tmp/'
if not isdir(train_dir):
    mkdir(train_dir)
cbc_params = read_json_params(base_dir + 'catboost_params.json')
cbc_params['iterations'] = 2
cbc_params['leaf_estimation_method'] = leaf_method
cbc_params['random_seed'] = 10
cbc_params['train_dir'] = train_dir
cbc = CatBoostClassifier(**cbc_params)
cbc.set_params(boosting_type='Plain')
cbc.fit(train_documents, train_targets)
cbc.save_model(train_dir + 'model.bin', format='cbm')
export_catboost_to_json(train_dir + 'model.bin', train_dir + 'model.json')
full_model = CBLeafInfluenceEnsemble(train_dir + 'model.json', train_documents, train_targets,
                                     leaf_method=leaf_method,
                                     learning_rate=cbc_params['learning_rate'],
                                     loss_function=BinaryCrossEntropyLoss(),
                                     update_set='AllPoints')
retrained_model_our = deepcopy(full_model)
tf_checker = TFGBApplier(full_model, train_documents, train_targets, leaf_method)
# For a random sample of training points, refit the ensemble with that point
# removed (the result goes into retrained_model_our), then compare raw
# predictions across the three implementations. The snippet's original
# indentation and assertions are lost; this reconstruction keeps the
# loop-invariant prediction lines outside the loop.
for remove_idx in np.random.randint(len(train_targets), size=30):
    full_model.fit(remove_idx, retrained_model_our)
pred_ours = full_model(train_documents)
pred_theirs = tf_checker.get_predicts()
pred_cbc = cbc.predict(train_documents, prediction_type='RawFormulaVal')
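# The assertions are cut off in the snippet; a plausible continuation simply
# checks that the three implementations produce the same raw margins
# (tolerance assumed, mirroring the atol used earlier):
assert np.allclose(pred_ours, pred_theirs, atol=1e-5)
assert np.allclose(pred_ours, pred_cbc, atol=1e-5)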
# Unrelated snippet: mapping a fitted estimator to a model-name string, guarded
# by module-level optional-dependency flags. The branch producing 'XGBRegressor'
# is truncated at the top; its guard is reconstructed from the surrounding pattern.
if isinstance(model, XGBRegressor):
    return 'XGBRegressor'
if keras_imported:
    if isinstance(model, KerasRegressor):
        return 'DeepLearningRegressor'
    if isinstance(model, KerasClassifier):
        return 'DeepLearningClassifier'
if lgb_installed:
    if isinstance(model, LGBMClassifier):
        return 'LGBMClassifier'
    if isinstance(model, LGBMRegressor):
        return 'LGBMRegressor'
if catboost_installed:
    if isinstance(model, CatBoostClassifier):
        return 'CatBoostClassifier'
    if isinstance(model, CatBoostRegressor):
        return 'CatBoostRegressor'
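# In most branches above the returned string is simply the class name, so a
# compact alternative (hypothetical helper, not part of the source) could be:
def model_type_name(model):
    name = type(model).__name__
    # Keras scikit-learn wrappers are the exception: report them as DeepLearning*.
    return name.replace('Keras', 'DeepLearning', 1) if name.startswith('Keras') else name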
model_map['Perceptron'] = Perceptron()
model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
model_map['SGDRegressor'] = SGDRegressor()
model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()
if xgb_installed:
    model_map['XGBClassifier'] = XGBClassifier()
    model_map['XGBRegressor'] = XGBRegressor()
if lgb_installed:
    model_map['LGBMRegressor'] = LGBMRegressor()
    model_map['LGBMClassifier'] = LGBMClassifier()
if catboost_installed:
    # calc_feature_importance is a constructor flag from older CatBoost releases.
    model_map['CatBoostRegressor'] = CatBoostRegressor(calc_feature_importance=True)
    model_map['CatBoostClassifier'] = CatBoostClassifier(calc_feature_importance=True)
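# Hypothetical usage of the registry built above: fetch an estimator by name and
# clone it, so callers never mutate the shared prototype instance. CatBoost's
# sklearn-compatible estimators work with sklearn.base.clone.
from sklearn.base import clone

def make_model(model_name):
    return clone(model_map[model_name])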
# Lazy Keras/TensorFlow import block from the same model factory: the heavy
# imports only happen the first time a DeepLearning model is requested.
if model_name[:12] == 'DeepLearning':
    if not keras_imported:
        # Suppress some level of logs if TF is installed (but allow it to not
        # be installed, and use Theano instead)
        try:
            os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
            from tensorflow import logging
            logging.set_verbosity(logging.INFO)
        except Exception:
            pass

        global maxnorm
        global Dense, Dropout
        global LeakyReLU, PReLU, ThresholdedReLU, ELU
        global Sequential
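# A sketch of the imports those global declarations refer to, using Keras 2.x
# module paths (assumed; this code predates tf.keras and supported both the
# TensorFlow and Theano backends):
from keras.constraints import maxnorm
from keras.layers import Dense, Dropout
from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
from keras.models import Sequential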
# A separate end-to-end training script: build an XGBoost and a CatBoost
# classifier, train both through the project's data handler `dh`, then blend.
logger.info('Done creating...')
# Fitting
# ==================================================
logger.info('Training XGBoost & CatBoost model...')
model_xgb = xgboost.XGBClassifier(
learning_rate=0.03,
max_depth=7,
nthread=50,
seed=1,
n_estimators=750
)
model_cb = catboost.CatBoostClassifier(
iterations=2000,
learning_rate=0.03,
depth=7,
loss_function='Logloss',
thread_count=50,
random_seed=1
)
dh.train_model(model_xgb, model_cb)
logger.info('Done training...')
# Model Blending
# ==================================================
logger.info('Start models blending...')
p = dh.blend(pct1=0.6, pct2=0.4)
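# blend(pct1, pct2) presumably returns a weighted average of the two models'
# predictions. A standalone sketch of that idea (X_valid is an illustrative
# hold-out matrix, not from the source):
p_xgb = model_xgb.predict_proba(X_valid)[:, 1]
p_cb = model_cb.predict_proba(X_valid)[:, 1]
p_blend = 0.6 * p_xgb + 0.4 * p_cb  # same 60/40 split as pct1/pct2 above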
def prepare(self):
    # NOTE: HACK!!
    # Due to some issue with the CatBoostClassifier class we need to explicitly
    # set the params below to None, or else we get exceptions!
    params = self.params
    params['store_all_simple_ctr'] = None
    params['rsm'] = None
    # CB_THREAD_LIMIT is set to 56 in the catboost source!
    if 'thread_count' in params and params['thread_count'] > 56:
        print("Warning! catboost sets max-thread-count to 56!")
        params['thread_count'] = 56
    self.model = cat.CatBoostClassifier(**params)
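# The same guard expressed as a reusable helper (hypothetical, not from the
# source), which also avoids mutating the caller's dict in place:
def sanitize_catboost_params(params, max_threads=56):
    params = dict(params)
    if params.get('thread_count', 0) > max_threads:
        params['thread_count'] = max_threads
    return params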
# Final snippet pair: hyperparameter_hunter driving CatBoost. This first call is
# truncated at the top; by parallel with the optimizer call below, the enclosing
# constructor is presumably a CVExperiment:
experiment = CVExperiment(
    model_initializer=CatBoostClassifier,
    model_init_params=dict(
        iterations=100,
        learning_rate=0.03,
        depth=6,
        save_snapshot=False,
        allow_writing_files=False,
        loss_function="MultiClass",
        classes_count=7,
    ),
)
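# Both this experiment and the optimizer below assume an active
# hyperparameter_hunter Environment was created first; a minimal sketch
# (argument names per that library's examples; train_df is a placeholder
# DataFrame containing a 'target' column):
from hyperparameter_hunter import Environment

env = Environment(
    train_dataset=train_df,
    results_path='HyperparameterHunterAssets',
    target_column='target',
    metrics=['accuracy_score'],
    cv_type='KFold',
    cv_params=dict(n_splits=5, shuffle=True, random_state=32),
)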
# And/or...
#################### 2. Hyperparameter Optimization ####################
optimizer = GBRT(iterations=8, random_state=42)
optimizer.forge_experiment(
    model_initializer=CatBoostClassifier,
    model_init_params=dict(
        iterations=100,
        learning_rate=Real(low=0.0001, high=0.5),
        depth=Integer(4, 15),
        save_snapshot=False,
        allow_writing_files=False,
        loss_function="MultiClass",
        classes_count=7,
    ),
)
optimizer.go()
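# After go() finishes, the optimizer exposes the winning run; attribute names
# below follow hyperparameter_hunter's documentation and should be treated as
# assumptions:
print(optimizer.best_score)
print(optimizer.best_experiment)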
# Tail of a final snippet: a thin wrapper exposing which estimator class it builds.
def estimator(self):
    return CatBoostClassifier