# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import lightgbm as lgb
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Train a LightGBM DART booster on the sklearn breast-cancer dataset and
# persist the model plus reference outputs (predictions and test matrix)
# for later regression/compatibility checks.
X, y = datasets.load_breast_cancer(return_X_y=True)
# Hold out 10% of the rows as a test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
n_estimators = 10
d_train = lgb.Dataset(X_train, label=y_train)
params = {
'boosting_type': 'dart',
'objective': 'binary',
}
# Third positional argument of lgb.train is the number of boosting rounds.
clf = lgb.train(params, d_train, n_estimators)
y_pred = clf.predict(X_test)
clf.save_model('lg_dart_breast_cancer.model') # save the model in txt format
# Reference predictions and raw test features, written as plain text.
np.savetxt('lg_dart_breast_cancer_true_predictions.txt', y_pred)
np.savetxt('breast_cancer_test.tsv', X_test, delimiter='\t')
# Also dump the full model structure as JSON.
d = clf.dump_model()
import json
with open('lg_dart_breast_cancer.json', 'w') as fout:
json.dump(d, fout, indent=1)
def get_params_to_params_estimation(x: pd.DataFrame, y: pd.Series, config: Config):
    """Select the best candidate parameter set via a single hold-out split.

    Each candidate returned by ``get_sensitive_and_impervious_params`` is
    trained for ``COLD_TOUCH_ITERATIONS`` rounds and scored on a fixed
    33% validation split; the best-scoring candidate is returned.
    """
    candidates = get_sensitive_and_impervious_params(config)
    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.33, random_state=42)
    valid_scores = []
    for candidate in candidates:
        booster = lgb.train(candidate, lgb.Dataset(x_train, label=y_train), COLD_TOUCH_ITERATIONS)
        valid_scores.append(get_score(y_valid, booster.predict(x_valid), config))
    # Classification modes ('c...') maximize the score; otherwise minimize.
    pick = np.argmax if config['mode'][0] == 'c' else np.argmin
    best_params = candidates[pick(valid_scores)]
    log('Estimated params: {}'.format(best_params))
    return best_params
# Fragment: tail of a LightGBM parameter dict — the dict's opening (and its
# name) lie outside this chunk.
'lambda_l2': 10,
'num_class': 12,
'seed': 2019,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 4,
}
# Fragment: K-fold CV loop for a multiclass LightGBM model ending in a
# `return` — the enclosing function, plus `kfold`, `train_x`, `train_y`,
# `test_x`, `lgb_paras` and `eval_f`, are defined outside this chunk.
cate_cols = ['max_dist_mode', 'min_dist_mode', 'max_price_mode',
'min_price_mode', 'max_eta_mode', 'min_eta_mode', 'first_mode', 'weekday', 'hour']
scores = []
result_proba = []
for tr_idx, val_idx in kfold.split(train_x, train_y):
tr_x, tr_y, val_x, val_y = train_x.iloc[tr_idx], train_y[tr_idx], train_x.iloc[val_idx], train_y[val_idx]
train_set = lgb.Dataset(tr_x, tr_y, categorical_feature=cate_cols)
val_set = lgb.Dataset(val_x, val_y, categorical_feature=cate_cols)
# Early stopping on this fold's validation set; `eval_f` is a custom metric.
lgb_model = lgb.train(lgb_paras, train_set,
valid_sets=[val_set], early_stopping_rounds=50, num_boost_round=40000, verbose_eval=50, feval=eval_f)
# argmax over the per-class outputs -> predicted class index per row.
val_pred = np.argmax(lgb_model.predict(
val_x, num_iteration=lgb_model.best_iteration), axis=1)
val_score = f1_score(val_y, val_pred, average='weighted')
# Keep per-fold test-set probabilities so they can be averaged below.
result_proba.append(lgb_model.predict(
test_x, num_iteration=lgb_model.best_iteration))
scores.append(val_score)
print('cv f1-score: ', np.mean(scores))
# Average the fold probabilities, then take the most likely class per row.
pred_test = np.argmax(np.mean(result_proba, axis=0), axis=1)
return pred_test
# Fragment: tail of an unseen classifier branch — fit an already-constructed
# `clf` and return the positive-class probabilities (column 1).
clf.fit(X_train, y_train)
y_test = clf.predict_proba(X_test)[:,1]
return y_test
# Fragment: `elif` branch of a classifier dispatch (the matching `if` is
# outside this chunk). Trains a binary LightGBM model and returns its
# predictions on X_test.
elif classifier == 'lgb':
param = {
'num_leaves':15,
'num_iterations':100,
'max_depth': 5,
'objective':'binary',
'is_unbalance': True,
'metric': ['auc', 'binary_logloss'],
'verbose': -1,
'seed': 848
}
train_data = lgb.Dataset(X_train, label=y_train)
clf = lgb.train(param, train_data)
y_test = clf.predict(X_test)
return y_test
# Fragment: per-site K-fold training loop choosing between LiteMORT and
# LightGBM via `isMORT`. `kf`, `X_train_site`, `y_train_site`, `params`,
# `models`, `cv_scores`, `site_id`, `cv`, `num_rounds`, etc. come from the
# enclosing (unseen) scope.
score = 0
for fold, (train_index, valid_index) in enumerate(kf.split(X_train_site, y_train_site)):
X_train, X_valid = X_train_site.loc[train_index, all_features], X_train_site.loc[valid_index, all_features]
y_train, y_valid = y_train_site.iloc[train_index], y_train_site.iloc[valid_index]
if isMORT:
# Extra-verbose only for the very first site/fold combination.
params['verbose'] = 667 if site_id==0 and fold == 0 else 1
merge_datas = []
model = LiteMORT(params,merge_infos=merge_infos)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], categorical_feature=categorical_features)
else:
dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
dvalid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)
watchlist = [dtrain, dvalid]
model = lgb.train(params, train_set=dtrain, num_boost_round=num_rounds, valid_sets=watchlist, verbose_eval=verbose_eval,
early_stopping_rounds=early_stop)
models[site_id].append(model)
# Out-of-fold predictions are stored into the site-wide prediction vector.
y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred_train_site[valid_index] = y_pred_valid
rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
print("Site Id:", site_id, ", Fold:", fold + 1, ", RMSE:", rmse)
# Running mean of fold RMSEs; `cv` is presumably the fold count — confirm.
score += rmse / cv
del X_train, X_valid,y_train, y_valid
gc.collect()
#input("......")
cv_scores["site_id"].append(site_id)
cv_scores["cv_score"].append(score)
print("\nSite Id:", site_id, ", CV RMSE:", np.sqrt(mean_squared_error(y_train_site, y_pred_train_site)), "\n")
# Fragment: tail of a multiclass LightGBM params dict plus the training and
# evaluation code using it. `lgb_train`, `lgb_val`, `X_train`, `y_train`,
# `num_classes` and `EMBEDDING_SIZE` are defined outside this chunk.
'num_class': num_classes,
'metric': 'multi_logloss',
'num_leaves': 15,
'max_depth': 4,
'learning_rate': 0.05,
'feature_fraction': 0.8,
# 'bagging_fraction': 0.8,
# 'bagging_freq': 5,
'verbose': 0
}
num_boost_round = 2000
# Features are embedding columns named embed_0 .. embed_{EMBEDDING_SIZE-1}.
feature_names = ['embed_' + str(col) for col in range(EMBEDDING_SIZE)]
print("Start training...")
start_time = time.time()
gbm = lgb.train(params,
lgb_train,
num_boost_round=num_boost_round,
valid_sets=lgb_val,
feature_name=feature_names,
early_stopping_rounds=30)
print("Training finished! ^_^")
print("Total seconds: %ds" % (time.time() - start_time))
# Calculate the f1 score and accuracy of training and validation set
probs_train = gbm.predict(X_train, num_iteration=gbm.best_iteration)
preds_train = np.argmax(probs_train, axis=1)
score_train = f1_score(y_train, preds_train, average='weighted')
acc_train = accuracy_score(y_train, preds_train)
print("The f1 score of training set after %d epochs is: %f" % (gbm.best_iteration, score_train))
print("The accuracy of training set after %d epochs is: %f" % (gbm.best_iteration, acc_train))
# Fragment: train a multiclass LightGBM model on preloaded fold data and
# pickle the booster. `load_train_data`, `model_name`, `fold` and `NB_CAT`
# come from the enclosing (unseen) scope.
X, y, video_ids = load_train_data(model_name, fold)
# y appears to be one-hot / per-class rows (argmax over axis=1 collapses it
# to class indices) — confirm against load_train_data.
y_cat = np.argmax(y, axis=1)
print(X.shape, y.shape)
print(np.unique(y_cat))
train_data = lgb.Dataset(X, label=y_cat)
param = {'num_leaves': 50,
'objective': 'multiclass',
'max_depth': 5,
'learning_rate': .1,
'max_bin': 200,
'num_class': NB_CAT,
'metric': ['multi_logloss']}
model = lgb.train(param, train_data, num_boost_round=100)
# NOTE(review): pickling here is fine; only unpickling untrusted files is a risk.
pickle.dump(model, open(Path(__file__).parent.parent / f"output/lgb_{model_name}_{fold}_full.pkl", "wb"))
# Fragment: tail of a training helper — builds train/valid Datasets and
# returns the trained booster. `train_features`, `valid_features`, `train_y`,
# `valid_y`, `lr` and `num_boost_round` come from the enclosing (unseen) scope.
train_data = lgb.Dataset(train_features, train_y)
valid_data = lgb.Dataset(valid_features, valid_y, reference=train_data)
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'learning_rate': lr,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'feature_fraction': 0.9,
'min_data_in_leaf': 20,
'num_leaves': 41,
'scale_pos_weight': 1.2,
'lambda_l2': 1,
}
print(params)
# Early stopping against the single validation set; logs every 10 rounds.
return lgb.train(
params=params,
train_set=train_data,
num_boost_round=num_boost_round,
early_stopping_rounds=20,
valid_sets=[valid_data],
verbose_eval=10,
)
# Fragment: tail of a learning-to-rank experiment method (the enclosing
# `self` context, `train_set`, `file_*` paths, `para_dict`, `eval_dict`,
# `data_id`, `save_dir`, `fold_k` and `YAHOO_L2R` are outside this chunk).
# Optionally builds a validation set, trains, saves the model, and predicts
# on the test split.
if do_validation:
x_valid, y_valid = load_svmlight_file(file_vali_data)
group_valid = np.loadtxt(file_vali_group)
valid_set = Dataset(data=x_valid, label=y_valid, group=group_valid)
x_test, y_test = load_svmlight_file(file_test_data)
group_test = np.loadtxt(file_test_group)
#test_set = Dataset(data=x_test, label=y_test, group=group_test)
params = self.get_paras_LightGBM(para_dict=para_dict, eval_dict=eval_dict)
if do_validation:
# With validation: early stopping after 100 stagnant rounds.
gbm = lgb.train(params=params, train_set=train_set, valid_sets=[valid_set], verbose_eval=10, early_stopping_rounds=100)
else:
# Without validation: fixed 100 boosting rounds.
gbm = lgb.train(params=params, train_set=train_set, verbose_eval=10, num_boost_round=100)
if data_id in YAHOO_L2R:
model_file = save_dir+'model.txt'
else:
# Per-fold model file name for non-Yahoo datasets.
model_file = save_dir+'_'.join(['fold', str(fold_k), 'model'])+'.txt'
gbm.save_model(model_file)
y_pred = gbm.predict(x_test) # fold-wise prediction
return y_test, group_test, y_pred