###############################################################################
# Classification using random forest classifier with and without sampling
###############################################################################
# Reference (imbalanced) random forest to compare against
rf = RandomForestClassifier(n_estimators=50, random_state=0)
brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0)
rf.fit(X_train, y_train)
brf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_pred_brf = brf.predict(X_test)
# Similarly to the previous experiment, the balanced classifier outperforms
# the classifier which learns from imbalanced bootstrap samples. In addition,
# the random forest outperforms the bagging classifier.
print('Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
.format(balanced_accuracy_score(y_test, y_pred_rf),
geometric_mean_score(y_test, y_pred_rf)))
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rf, classes=np.unique(satimage.target), ax=ax[0],
title='Random forest')
print('Balanced Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
.format(balanced_accuracy_score(y_test, y_pred_brf),
geometric_mean_score(y_test, y_pred_brf)))
cm_brf = confusion_matrix(y_test, y_pred_brf)
plot_confusion_matrix(cm_brf, classes=np.unique(satimage.target), ax=ax[1],
title='Balanced random forest')
###############################################################################
# Train a pipeline with balancing and evaluate imbalanced-learning metrics
###############################################################################
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
random_state=RANDOM_STATE)
# Train the classifier with balancing
pipeline.fit(X_train, y_train)
# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)
###############################################################################
# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining the two metrics should account for
# the balancing of the dataset.
print('The geometric mean is {}'.format(geometric_mean_score(
y_test,
y_pred_bal)))
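###############################################################################
# As a sanity check (a minimal sketch, assuming a binary problem with the
# positive class labelled 1), the same value can be recomputed by hand:
# sensitivity is the recall of the positive class, specificity the recall of
# the negative class.
from sklearn.metrics import recall_score
sensitivity = recall_score(y_test, y_pred_bal, pos_label=1)
specificity = recall_score(y_test, y_pred_bal, pos_label=0)
print('Geometric mean recomputed by hand: {}'.format(
    (sensitivity * specificity) ** 0.5))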
###############################################################################
# The index balanced accuracy can transform any metric to be used in
# imbalanced learning problems.
alpha = 0.1
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
geometric_mean_score)
print('The IBA using alpha = {} and the geometric mean: {}'.format(
alpha, geo_mean(
y_test,
y_pred_bal)))
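###############################################################################
# Under the hood, the IBA weights the (here squared) metric by the dominance,
# i.e. the difference between sensitivity and specificity:
# IBA_alpha(M) = (1 + alpha * (sensitivity - specificity)) * M^2.
# A rough manual check, reusing the per-class recalls computed above (a
# sketch that assumes a binary problem and the default averaging):
dominance = sensitivity - specificity
print('IBA recomputed by hand: {}'.format(
    (1 + alpha * dominance) * geometric_mean_score(y_test, y_pred_bal) ** 2))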
###############################################################################
# Boosting classifier
###############################################################################
# In the same manner, the easy ensemble classifier is a bag of balanced
# AdaBoost classifiers. However, it will be slower to train than the random
# forest above and will achieve worse performance.
base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
base_estimator=base_estimator)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
.format(balanced_accuracy_score(y_test, y_pred_eec),
geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0],
title='Easy ensemble classifier')
rusboost = RUSBoostClassifier(n_estimators=10,
base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
.format(balanced_accuracy_score(y_test, y_pred_rusboost),
geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
plot_confusion_matrix(cm_rusboost, classes=np.unique(satimage.target),
ax=ax[1], title='RUSBoost classifier')
# Reference (imbalanced) bagging classifier to compare against
bagging = BaggingClassifier(n_estimators=50, random_state=0)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0)
bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)
y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)
###############################################################################
# Balancing each bootstrap sample significantly increases the balanced
# accuracy and the geometric mean.
print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
.format(balanced_accuracy_score(y_test, y_pred_bc),
geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging, classes=np.unique(satimage.target), ax=ax[0],
title='Bagging')
print('Balanced Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
.format(balanced_accuracy_score(y_test, y_pred_bbc),
geometric_mean_score(y_test, y_pred_bbc)))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(satimage.target),
ax=ax[1], title='Balanced bagging')
plt.show()
import numpy as np
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import (auc, confusion_matrix, precision_recall_curve,
                             roc_curve)


def evaluate_classifier(labels, predictions, predicted_classes,
                        pos_label=1, to_plot=False, data_option=None):
    """Compute AUROC, confusion matrix, geometric mean and AUPR.

    The signature is reconstructed from the truncated snippet.
    """
    AUROC, AUPR = None, None
    # Ranking metrics are only defined when both classes are present
    if 0 < np.count_nonzero(labels) < labels.shape[0]:
        fpr, tpr, thresholds = roc_curve(y_true=labels, y_score=predictions,
                                         pos_label=pos_label)
        AUROC = auc(fpr, tpr)
        precision, recall, thresholds = precision_recall_curve(labels,
                                                               predictions)
        AUPR = auc(recall, precision)
        if to_plot:
            # plot_ROC_AUC is defined elsewhere in the original module
            plot_ROC_AUC(fpr, tpr, AUROC, data_option)
    else:
        print('only one class present')
    g_mean = geometric_mean_score(labels, predicted_classes)
    conf_mat = confusion_matrix(labels, predicted_classes)
    return AUROC, conf_mat, g_mean, AUPR
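# Hypothetical usage sketch for the helper above with synthetic scores (the
# names, sizes, and 0.5 threshold are illustrative only):
rng = np.random.RandomState(0)
labels = rng.randint(0, 2, size=200)
scores = rng.rand(200)
auroc, conf_mat, g_mean, aupr = evaluate_classifier(
    labels, scores, (scores > 0.5).astype(int))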
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import validation_curve
from sklearn.metrics import make_scorer, cohen_kappa_score
from sklearn.datasets import make_classification
from imblearn.pipeline import make_pipeline
from imblearn.metrics import geometric_mean_score
from gsmote import GeometricSMOTE
print(__doc__)
RANDOM_STATE = 10
SCORER = make_scorer(geometric_mean_score)
def generate_imbalanced_data(weights, n_samples, n_features, n_informative):
"""Generate imbalanced data."""
X, y = make_classification(
n_classes=2,
class_sep=2,
weights=weights,
n_informative=n_informative,
n_redundant=1,
flip_y=0,
n_features=n_features,
n_clusters_per_class=2,
n_samples=n_samples,
random_state=RANDOM_STATE,
    )
    return X, y
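###############################################################################
# Hypothetical usage sketch: draw a 95/5 imbalanced binary dataset (the
# argument values are illustrative only).
X, y = generate_imbalanced_data(weights=[0.95, 0.05], n_samples=1000,
                                n_features=20, n_informative=10)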
# Compute the different metrics
# Precision/recall/f1
precision, recall, f1, support = precision_recall_fscore_support(
y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
)
# Specificity
specificity = specificity_score(
y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
)
# Geometric mean
geo_mean = geometric_mean_score(
y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
)
# Index balanced accuracy
iba_gmean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
geometric_mean_score
)
iba = iba_gmean(
y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
)
result = {"targets": {}}
for i, label in enumerate(labels):
result["targets"][target_names[i]] = {
"precision": precision[i],
"recall": recall[i],
"specificity": specificity[i],
"f1": f1[i],
"geo_mean": geo_mean[i],
"iba": iba[i],
"support": support[i],
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import average_precision_score, make_scorer
from sklearn.metrics import check_scoring as check_scoring_sklearn


def check_scoring(estimator, scoring=None, allow_none=False):
    """
    Surrogate for sklearn's check_scoring to enable use of some other
    scoring metrics.
    """
if scoring == 'average_precision_weighted':
scorer = make_scorer(average_precision_score, average='weighted', needs_proba=True)
    elif scoring == 'gmean':
        # geometric_mean_score expects class predictions, not probabilities
        scorer = make_scorer(geometric_mean_score)
else:
scorer = check_scoring_sklearn(estimator, scoring=scoring)
return scorer
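# Hypothetical usage sketch (DecisionTreeClassifier is only an illustrative
# estimator): build a geometric-mean scorer that can then be passed to
# cross_val_score or GridSearchCV through their `scoring` parameter.
from sklearn.tree import DecisionTreeClassifier
gmean_scorer = check_scoring(DecisionTreeClassifier(), scoring='gmean')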