###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to exclude
# samples: all samples which are classified with a low probability will be
# removed.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
class_sep=0.8)
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
sampler = InstanceHardnessThreshold(
random_state=0, estimator=LogisticRegression(solver='lbfgs',
multi_class='auto'))
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()
plt.show()
###############################################################################
# ADASYN focuses on the samples which are difficult to classify with a
# nearest-neighbors rule, while regular SMOTE will not make any distinction.
# Therefore, the decision function will differ depending on the algorithm.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
sampler = SMOTE()
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
sampler = ADASYN()
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax3)
ax3.set_title('Decision function for {}'.format(sampler.__class__.__name__))
fig.tight_layout()
###############################################################################
# Due to their sampling particularities, these algorithms can give rise to some
# specific issues, as illustrated below.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
class_sep=0.8)
ax_arr = ((ax1, ax2), (ax3, ax4))
for ax, sampler in zip(ax_arr, (SMOTE(random_state=0),
                                ADASYN(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__))
def pseudo_label(pipeline, x_lab, y_lab, x_unlab, y_unlab, threshold=None):
    # Fit the pipeline on the labelled data, then pseudo-label the unlabelled
    # data from the predicted probabilities of the positive class.
    model = make_pipeline(*pipeline)
    model.fit(x_lab, y_lab)
    pseudo_lab = pd.DataFrame({
        'actual': y_unlab,
        'predict_proba': model.predict_proba(x_unlab)[:, 1]
    })
    # ``threshold_metrics`` is an external helper: either evaluate the given
    # threshold or select the one maximising the labelled-set geometric mean.
    if threshold:
        results = threshold_metrics(pseudo_lab['actual'],
                                    pseudo_lab['predict_proba'],
                                    threshold=threshold)
    else:
        results = threshold_metrics(pseudo_lab['actual'],
                                    pseudo_lab['predict_proba'],
                                    rank_best='lab_gmean')
    pseudo_lab['predicted'] = (
        pseudo_lab['predict_proba'] > results['lab_threshold']
    ).astype(int)
    y_pseudo = pseudo_lab['predicted'].values
    results['lab_num_pos'] = np.sum(y_pseudo)
    results['lab_num_neg'] = y_pseudo.shape[0] - results['lab_num_pos']
    return results  # assumed return value
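###############################################################################
# A minimal usage sketch for ``pseudo_label``: the synthetic data, the split
# into labelled/unlabelled halves and the pipeline steps are assumptions made
# for illustration, and the external ``threshold_metrics`` helper is assumed
# to be available as in the original script.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_ssl, y_ssl = make_classification(n_samples=500, weights=[0.9, 0.1],
                                   random_state=0)
x_lab, x_unlab, y_lab, y_unlab = train_test_split(X_ssl, y_ssl, test_size=0.5,
                                                  random_state=0)
# Fit on the labelled half and pseudo-label the other half with a fixed
# probability threshold of 0.5.
pseudo_results = pseudo_label(
    (StandardScaler(), LogisticRegression(max_iter=1000)),
    x_lab, y_lab, x_unlab, y_unlab, threshold=0.5,
)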
###############################################################################
# Two methods are usually used in the literature to clean the noisy samples
# generated by SMOTE: (i) Tomek's link and (ii) edited nearest neighbours
# cleaning methods. Imbalanced-learn provides two ready-to-use samplers,
# ``SMOTETomek`` and ``SMOTEENN``. In general, ``SMOTEENN`` cleans more noisy
# data than ``SMOTETomek``.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
figsize=(15, 25))
X, y = create_dataset(n_samples=1000, weights=(0.1, 0.2, 0.7))
ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (
        SMOTE(random_state=0),
        SMOTEENN(random_state=0),
        SMOTETomek(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(
        sampler.__class__.__name__))
fig.tight_layout()
plt.show()
###############################################################################
# Prototype generation: under-sampling by generating new samples
###############################################################################
###############################################################################
# ``ClusterCentroids`` under-samples by replacing the original samples with the
# centroids of the clusters found by a K-means method.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
class_sep=0.8)
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
sampler = ClusterCentroids(random_state=0)
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()
###############################################################################
# Prototype selection: under-sampling by selecting existing samples
###############################################################################
###############################################################################
# The algorithms performing prototype selection can be subdivided into two
# groups: (i) the controlled under-sampling methods and (ii) the cleaning
# under-sampling methods, as illustrated in the sketch below.
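###############################################################################
# As a minimal illustration of this distinction (the samplers and the dataset
# below are chosen for this sketch only): ``RandomUnderSampler`` is a
# controlled method, so the amount of under-sampling can be specified through
# its ``sampling_strategy``, whereas a cleaning method such as
# ``EditedNearestNeighbours`` only removes samples considered noisy and gives
# no control over the final class sizes.
from collections import Counter

from imblearn.under_sampling import EditedNearestNeighbours, RandomUnderSampler

X_demo, y_demo = create_dataset(n_samples=1000, weights=(0.1, 0.2, 0.7))
X_ctrl, y_ctrl = RandomUnderSampler(random_state=0).fit_resample(X_demo, y_demo)
X_clean, y_clean = EditedNearestNeighbours().fit_resample(X_demo, y_demo)
print('Controlled under-sampling:', Counter(y_ctrl))
print('Cleaning under-sampling:  ', Counter(y_clean))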
def create_pipelines(self):
    # Build one scaler/sampler/estimator pipeline for every combination.
    self.model_pipelines = []
    for estimator in self.estimators:
        for sampler in self.samplers:
            for scaler in self.scalers:
                pipeline = make_pipeline(scaler, sampler, estimator)
                self.model_pipelines.append(pipeline)
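###############################################################################
# A hypothetical usage sketch for ``create_pipelines``: the configuration
# object and the concrete estimator/sampler/scaler lists below are assumptions
# made for illustration, and ``make_pipeline`` is assumed to be the
# imbalanced-learn version, as elsewhere in this example.
from types import SimpleNamespace

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

config = SimpleNamespace(
    estimators=[LogisticRegression(max_iter=1000)],
    samplers=[SMOTE(random_state=0), RandomUnderSampler(random_state=0)],
    scalers=[StandardScaler(), MinMaxScaler()],
    model_pipelines=[],
)
create_pipelines(config)  # 1 estimator x 2 samplers x 2 scalers = 4 pipelines
print(len(config.model_pipelines))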
"geometricsmote__selection_strategy",
SCORER,
)
plot_validation_curve(validation_curve_info, scoring_name, 'Selection Strategy')
###############################################################################
# High Imbalance Ratio or low Samples to Features Ratio
###############################################################################
###############################################################################
# When :math:`\text{IR}` (the imbalance ratio) is high or :math:`\text{SFR}`
# (the samples-to-features ratio) is low, the majority or combined selection
# strategies and lower absolute values of the truncation and deformation
# factors dominate as optimal hyperparameters; both ratios are computed for the
# generated dataset in the sketch below.
X, y = generate_imbalanced_data([0.1, 0.9], 2000, 400, 200)
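###############################################################################
# A minimal sketch computing the two ratios named above for the generated
# dataset. Here :math:`\text{IR}` is taken as the majority-to-minority class
# count ratio and :math:`\text{SFR}` as the number of samples divided by the
# number of features; these definitions are assumed from the section heading.
from collections import Counter

class_counts = Counter(y)
imbalance_ratio = max(class_counts.values()) / min(class_counts.values())
samples_to_features_ratio = X.shape[0] / X.shape[1]
print('IR = {:.1f}, SFR = {:.1f}'.format(imbalance_ratio,
                                         samples_to_features_ratio))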
gsmote_gbc = make_pipeline(
    GeometricSMOTE(random_state=RANDOM_STATE),
    LinearSVC(random_state=RANDOM_STATE, max_iter=100_000),
)
scoring_name = 'Geometric Mean Score'
validation_curve_info = generate_validation_curve_info(
gsmote_gbc, X, y, range(1, 8), "geometricsmote__k_neighbors", SCORER
)
plot_validation_curve(validation_curve_info, scoring_name, 'K Neighbors')
validation_curve_info = generate_validation_curve_info(
    gsmote_gbc,
    X,
    y,
    np.linspace(-1.0, 1.0, 9),
    "geometricsmote__truncation_factor",
    SCORER,
)
plot_validation_curve(validation_curve_info, scoring_name, 'Truncation Factor')
df_scores = evaluate_classifier(
lr_clf, df_scores, "LR with class weight"
)
rf_clf.set_params(randomforestclassifier__class_weight="balanced")
df_scores = evaluate_classifier(
rf_clf, df_scores, "RF with class weight"
)
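###############################################################################
# ``make_pipeline_with_sampler`` used below is assumed to be imbalanced-learn's
# ``make_pipeline`` under an alias, since a sampler cannot be placed inside a
# plain scikit-learn ``Pipeline``.
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler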
lr_clf = make_pipeline_with_sampler(
preprocessor_linear,
RandomUnderSampler(random_state=42),
LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(
lr_clf, df_scores, "LR with under-sampling"
)
rf_clf = make_pipeline_with_sampler(
preprocessor_tree,
RandomUnderSampler(random_state=42),
RandomForestClassifier(random_state=42, n_jobs=2)
)
df_scores = evaluate_classifier(
rf_clf, df_scores, "RF with under-sampling"
)
rf_clf = make_pipeline(
preprocessor_tree,
BalancedRandomForestClassifier(random_state=42, n_jobs=2)
)
df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
df_scores = evaluate_classifier(
bag_clf, df_scores, "Balanced bagging"
)
df_scores
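###############################################################################
# For reference, a plausible sketch of the ``evaluate_classifier`` helper used
# above: it cross-validates a classifier on the full dataset and appends the
# mean test scores to ``df_scores``. The scoring metrics, the reliance on the
# global ``X`` and ``y``, and the returned columns are assumptions; the
# original helper may differ.
import pandas as pd
from sklearn.model_selection import cross_validate


def evaluate_classifier(clf, df_scores, clf_name=None):
    # Default to the classifier's class name when no label is given.
    if clf_name is None:
        clf_name = clf.__class__.__name__
    cv_results = cross_validate(
        clf, X, y, scoring=['accuracy', 'balanced_accuracy'], n_jobs=2
    )
    row = pd.DataFrame(
        {
            'Accuracy': cv_results['test_accuracy'].mean(),
            'Balanced accuracy': cv_results['test_balanced_accuracy'].mean(),
        },
        index=[clf_name],
    )
    return pd.concat([df_scores, row])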