Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
from imblearn.under_sampling import TomekLinks
print(__doc__)
# Build a 2-D toy data set: 500 samples of class 0 drawn around the origin
# (scale 1.5) and 50 samples of class 1 drawn around (2, 2) (scale 0.5).
rng = np.random.RandomState(0)
n_samples_1 = 500
n_samples_2 = 50
X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2),
              0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
X_syn, y_syn = shuffle(X_syn, y_syn)
X_syn_train, X_syn_test, y_syn_train, y_syn_test = train_test_split(X_syn,
                                                                    y_syn)

# remove Tomek links
# ``return_indices`` was deprecated in imbalanced-learn 0.4 and removed
# afterwards; the indices of the retained samples are read from the fitted
# ``sample_indices_`` attribute instead.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X_syn, y_syn)
idx_resampled = tl.sample_indices_

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Any original row index that was not retained belongs to a removed sample.
idx_samples_removed = np.setdiff1d(np.arange(X_syn.shape[0]),
                                   idx_resampled)

# Plot the retained samples per class, then the removed samples on top.
idx_class_0 = y_resampled == 0
plt.scatter(X_resampled[idx_class_0, 0], X_resampled[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_resampled[~idx_class_0, 0], X_resampled[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_syn[idx_samples_removed, 0], X_syn[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')
# make nice plotting
def create_sampler(sampler_name, random_state=None):
    """Instantiate the resampler identified by ``sampler_name``.

    Sampler names are matched case-insensitively. ``None`` and the exact
    string ``'None'`` both mean "no resampling".

    :param sampler_name: ``None``, ``'None'``, or one of
        ``'RandomUnderSampler'``, ``'TomekLinks'``, ``'ENN'``, ``'NCL'``,
        ``'RandomOverSampler'``, ``'SMOTE'``, ``'SMOTETomek'``, ``'SMOTEENN'``
        (in any letter case).
    :param random_state: seed forwarded to the samplers that draw random
        numbers. It is not forwarded to ``TomekLinks``,
        ``EditedNearestNeighbours`` or ``NeighbourhoodCleaningRule``: those
        cleaning methods are deterministic and their ``random_state``
        argument was deprecated in imbalanced-learn 0.4 and later removed.
    :return: a new sampler instance, or ``None`` when no resampling is
        requested.
    :raises ValueError: if ``sampler_name`` is not a supported name.
    """
    if sampler_name is None or sampler_name == 'None':
        return None

    # Normalise once instead of lower-casing on every comparison.
    name = sampler_name.lower()

    if name == 'randomundersampler':
        return RandomUnderSampler(random_state=random_state)
    if name == 'tomeklinks':
        return TomekLinks()
    if name == 'enn':
        return EditedNearestNeighbours()
    if name == 'ncl':
        return NeighbourhoodCleaningRule()
    if name == 'randomoversampler':
        return RandomOverSampler(random_state=random_state)
    if name == 'smote':
        return SMOTE(random_state=random_state)
    if name == 'smotetomek':
        return SMOTETomek(random_state=random_state)
    if name == 'smoteenn':
        return SMOTEENN(random_state=random_state)

    raise ValueError('Unsupported value \'%s\' for sampler' % sampler_name)
# Resample with the ``ros`` sampler configured earlier, then report and plot
# the resulting class counts.
X_res, y_res = ros.fit_resample(X, y)
print(
    'Information of the iris data set after making it '
    'balanced by over-sampling: \n sampling_strategy={} \n y: {}'.format(
        sampling_strategy, Counter(y_res)
    )
)
plot_pie(y_res)
###############################################################################
# ``sampling_strategy`` as a ``list``
# ...................................
#
# When ``sampling_strategy`` is a ``list``, the list contains the targeted
# classes. It is used only for **cleaning methods** and raises an error
# otherwise.
# Target classes 0, 1 and 2 with the Tomek-links cleaning method, then report
# and plot the resulting class counts.
sampling_strategy = [0, 1, 2]
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print(
    'Information of the iris data set after making it '
    'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}'.format(
        sampling_strategy, Counter(y_res)
    )
)
plot_pie(y_res)
###############################################################################
# ``sampling_strategy`` as a callable
# ...................................
#
# When ``sampling_strategy`` is a callable, it is a function that takes ``y``
# and returns a ``dict``. The keys correspond to the targeted classes. The
# values correspond to the desired number of samples for each class.
def ratio_multiplier(y):
#kind = ['regular', 'borderline1', 'borderline2', 'svm']
#sm = SMOTE(kind='regular',)
#X_res, y_res = sm.fit_sample(X_all, y_all)
#ros = RandomOverSampler()
#X_res, y_res = ros.fit_sample(X_all, y_all)
#ada = ADASYN()
#X_res, y_res = ada.fit_sample(X_all, y_all)
######################################################
# Undersampling
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, CondensedNearestNeighbour, \
NeighbourhoodCleaningRule, InstanceHardnessThreshold
# remove Tomek links
# ``return_indices`` and the ``fit_sample`` alias were deprecated in
# imbalanced-learn 0.4 and later removed; use ``fit_resample`` and read the
# indices of the retained samples from the fitted ``sample_indices_``.
tl = TomekLinks()
X_res, y_res = tl.fit_resample(X_all, y_all)
idx_resampled = tl.sample_indices_
#enn = EditedNearestNeighbours(random_state=0)
#X_res, y_res = enn.fit_sample(X_all, y_all)
#cnn = CondensedNearestNeighbour(random_state=0)
#X_res, y_res = cnn.fit_sample(X_all, y_all)
#ncr = NeighbourhoodCleaningRule(random_state=0)
#X_res, y_res = ncr.fit_sample(X_all, y_all)
#iht = InstanceHardnessThreshold(random_state=0, estimator=clf)
#X_res, y_res = iht.fit_sample(X_all, y_all)
##################
# Over-sample with the ``'not majority'`` strategy, then report and plot the
# resulting class counts.
sampling_strategy = 'not majority'
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
print(
    'Information of the iris data set after making it '
    'balanced by over-sampling: \n sampling_strategy={} \n y: {}'.format(
        sampling_strategy, Counter(y_res)
    )
)
plot_pie(y_res)
###############################################################################
# With **cleaning method**, the number of samples in each class will not be
# equalized even if targeted.
# Clean with the ``'not minority'`` strategy, then report and plot the
# resulting class counts.
sampling_strategy = 'not minority'
# Pass the strategy by keyword: the sampler constructors take keyword-only
# parameters in current imbalanced-learn releases, and this matches the
# ``list`` example in this file.
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)
###############################################################################
# ``sampling_strategy`` as a ``dict``
# ...................................
#
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This works for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead.
from sklearn.tree import tree
from etl import ETLUtils
from etl import sampler_factory
from nlp import nlp_utils
from topicmodeling.context import review_metrics_extractor
from utils.constants import Constants
# Seed shared by every estimator and sampler that draws random numbers.
RANDOM_STATE = 0
SCORE_METRIC = 'accuracy'
# SCORE_METRIC = 'roc_auc'

# Candidate resampling steps; the leading ``None`` presumably stands for
# "no resampling" (confirm against the code that consumes this list).
# ``TomekLinks``, ``EditedNearestNeighbours`` and ``NeighbourhoodCleaningRule``
# are deterministic: their ``random_state`` argument was deprecated in
# imbalanced-learn 0.4 and later removed, so it is not passed.
resamplers = [
    None,
    RandomUnderSampler(random_state=RANDOM_STATE),
    TomekLinks(),
    EditedNearestNeighbours(),
    NeighbourhoodCleaningRule(),
    RandomOverSampler(random_state=RANDOM_STATE),
    SMOTE(random_state=RANDOM_STATE),
    SMOTETomek(random_state=RANDOM_STATE),
    SMOTEENN(random_state=RANDOM_STATE),
]
PARAM_GRID_MAP = {
'DummyClassifier': {
'resampler': resamplers,
'classifier': [DummyClassifier(random_state=RANDOM_STATE)],
'classifier__strategy': ['most_frequent', 'stratified', 'uniform']
},
'LogisticRegression': {
# highlight the samples of interest
# Overlay a translucent marker on the last minority sample and the second
# majority sample, labelled as the Tomek link.
link_x = [X_minority[-1, 0], X_majority[1, 0]]
link_y = [X_minority[-1, 1], X_majority[1, 1]]
ax.scatter(link_x, link_y, label='Tomek link', s=200, alpha=0.3)

ax.set_title('Illustration of a Tomek link')
make_plot_despine(ax)
fig.tight_layout()
###############################################################################
# We can run the ``TomekLinks`` sampling to remove the corresponding
# samples. If ``sampling_strategy='auto'`` only the sample from the majority
# class will be removed. If ``sampling_strategy='all'`` both samples will be
# removed.
sampler = TomekLinks()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
ax_arr = (ax1, ax2)
title_arr = ('Removing only majority samples',
'Removing all samples')
for ax, title, sampler in zip(ax_arr,
title_arr,
[TomekLinks(sampling_strategy='auto'),
TomekLinks(sampling_strategy='all')]):
X_res, y_res = sampler.fit_resample(np.vstack((X_minority, X_majority)),
np.array([0] * X_minority.shape[0] +
[1] * X_majority.shape[0]))
ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1],
label='Minority class', s=200, marker='_')
ax.scatter(X_res[y_res == 1][:, 0], X_res[y_res == 1][:, 1],
# We can run the ``TomekLinks`` sampling to remove the corresponding
# samples. If ``sampling_strategy='auto'`` only the sample from the majority
# class will be removed. If ``sampling_strategy='all'`` both samples will be
# removed.
# Draw two panels side by side: one per ``sampling_strategy`` value.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

ax_arr = (ax1, ax2)
title_arr = ('Removing only majority samples',
             'Removing all samples')

# Both samplers are fitted on the same input, so build it once outside the
# loop: minority rows first (label 0), then majority rows (label 1).
X_stacked = np.vstack((X_minority, X_majority))
y_stacked = np.array([0] * X_minority.shape[0] +
                     [1] * X_majority.shape[0])

for ax, title, sampler in zip(ax_arr,
                              title_arr,
                              [TomekLinks(sampling_strategy='auto'),
                               TomekLinks(sampling_strategy='all')]):
    X_res, y_res = sampler.fit_resample(X_stacked, y_stacked)

    # Select each class once rather than re-evaluating the mask per column.
    minority_res = X_res[y_res == 0]
    majority_res = X_res[y_res == 1]
    ax.scatter(minority_res[:, 0], minority_res[:, 1],
               label='Minority class', s=200, marker='_')
    ax.scatter(majority_res[:, 0], majority_res[:, 1],
               label='Majority class', s=200, marker='+')

    # highlight the samples of interest
    ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
               [X_minority[-1, 1], X_majority[1, 1]],
               label='Tomek link', s=200, alpha=0.3)

    ax.set_title(title)
    make_plot_despine(ax)

# Adjust the layout once, after both panels are drawn.
fig.tight_layout()