# Note: AllKNN and NearMiss are deterministic; newer imbalanced-learn
# releases no longer accept random_state for them.
elif by == 'ALLKNN':
    sampler = AllKNN(random_state=random_state)
elif by == 'OSS':
    sampler = OneSidedSelection(random_state=random_state)
elif by == 'NM':
    sampler = NearMiss(random_state=random_state)
elif by == 'CC':
    sampler = ClusterCentroids(random_state=random_state)
elif by == 'SMOTE':
    sampler = SMOTE(random_state=random_state)
elif by == 'ADASYN':
    sampler = ADASYN(random_state=random_state)
elif by == 'BorderSMOTE':
    sampler = BorderlineSMOTE(random_state=random_state)
elif by == 'SMOTEENN':
    sampler = SMOTEENN(random_state=random_state)
elif by == 'SMOTETomek':
    sampler = SMOTETomek(random_state=random_state)
elif by == 'ORG':
    sampler = None  # 'ORG' keeps the original, unresampled training set
else:
    raise ValueError("Unexpected 'by' type {}".format(by))

if by != 'ORG':
    X_train, y_train = sampler.fit_resample(X, y)
else:
    X_train, y_train = X, y

if visualize:
    df = pd.DataFrame(X_train)
    df['label'] = y_train
    df.plot.scatter(x=0, y=1, c='label', s=3, colormap='coolwarm',
                    title='{} training set'.format(by))

self.base_estimator.fit(X_train, y_train)
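# For reference, a minimal self-contained sketch of what one branch of the
# dispatcher above does end to end. The dataset and parameters here are
# illustrative assumptions, not taken from the original repository.
from collections import Counter

from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN

X, y = make_classification(n_samples=2000, n_features=10,
                           weights=[0.95, 0.05], random_state=0)
print(Counter(y))       # heavily imbalanced, roughly 19:1

sampler = SMOTEENN(random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
print(Counter(y_res))   # SMOTE oversamples, then ENN cleans noisy points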
# SMOTE can itself generate noisy samples, e.g. when the classes cannot
# be well separated. Hence, it can be beneficial to apply an under-sampling
# algorithm to clean the noisy samples. Two methods are usually used in the
# literature: (i) Tomek's link and (ii) edited nearest neighbours cleaning
# methods. Imbalanced-learn provides two ready-to-use samplers, ``SMOTETomek``
# and ``SMOTEENN``. In general, ``SMOTEENN`` cleans more noisy data than
# ``SMOTETomek``.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=1000, weights=(0.1, 0.2, 0.7))

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (
        SMOTE(random_state=0),
        SMOTEENN(random_state=0),
        SMOTETomek(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(
        sampler.__class__.__name__))
fig.tight_layout()
plt.show()
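# The gallery snippet above assumes three helpers defined earlier in the
# imbalanced-learn example script: create_dataset, plot_decision_function
# and plot_resampling. They are not shown here; the condensed sketch below
# is a reconstruction of what they do, not the verbatim gallery code.
import numpy as np
from sklearn.datasets import make_classification

def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98),
                   n_classes=3, class_sep=0.8):
    # Three-class, two-feature toy problem with a tunable imbalance.
    return make_classification(n_samples=n_samples, n_features=2,
                               n_informative=2, n_redundant=0,
                               n_classes=n_classes, n_clusters_per_class=1,
                               weights=list(weights), class_sep=class_sep,
                               random_state=0)

def plot_decision_function(X, y, clf, ax):
    # Shade the fitted pipeline's decision regions behind the data.
    xx, yy = np.meshgrid(
        np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200),
        np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], c=y, s=8, edgecolor='k')

def plot_resampling(X, y, sampler, ax):
    # Show the training set as it looks after resampling.
    X_res, y_res = sampler.fit_resample(X, y)
    ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, s=8, alpha=0.8,
               edgecolor='k')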
def resample_smote_enn(train_inputs, train_targets):
    sampler = SMOTEENN(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs,
                                                  train_targets)
    return train_inputs, train_targets
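# _sampler_helper is a private helper defined elsewhere in that repository.
# A plausible, purely hypothetical implementation simply delegates to the
# sampler:
def _sampler_helper(sampler, inputs, targets):
    # Hypothetical stand-in: resample and return the rebalanced arrays.
    return sampler.fit_resample(inputs, targets)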
# Note: the cleaning samplers (TomekLinks, EditedNearestNeighbours,
# NeighbourhoodCleaningRule) are deterministic; newer imbalanced-learn
# releases no longer accept random_state for them.
if sampler_name.lower() == 'randomundersampler':
    return RandomUnderSampler(random_state=random_state)
elif sampler_name.lower() == 'tomeklinks':
    return TomekLinks(random_state=random_state)
elif sampler_name.lower() == 'enn':
    return EditedNearestNeighbours(random_state=random_state)
elif sampler_name.lower() == 'ncl':
    return NeighbourhoodCleaningRule(random_state=random_state)
elif sampler_name.lower() == 'randomoversampler':
    return RandomOverSampler(random_state=random_state)
elif sampler_name.lower() == 'smote':
    return SMOTE(random_state=random_state)
elif sampler_name.lower() == 'smotetomek':
    return SMOTETomek(random_state=random_state)
elif sampler_name.lower() == 'smoteenn':
    return SMOTEENN(random_state=random_state)
else:
    raise ValueError("Unsupported value '%s' for sampler" % sampler_name)
def set_samplers(self, *args):
    sampler_db = {
        'smote': SMOTE(),
        'smoteenn': SMOTEENN(),
    }
    self.samplers = [sampler_db[name] for name in args]
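# Usage sketch for set_samplers: callers pick resamplers by name. The host
# class below is a minimal hypothetical stand-in, only so the call runs.
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

class _SamplerHost:
    def set_samplers(self, *args):
        sampler_db = {'smote': SMOTE(), 'smoteenn': SMOTEENN()}
        self.samplers = [sampler_db[name] for name in args]

host = _SamplerHost()
host.set_samplers('smote', 'smoteenn')
print(host.samplers)  # [SMOTE(), SMOTEENN()]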
from utils.constants import Constants

RANDOM_STATE = 0
SCORE_METRIC = 'accuracy'
# SCORE_METRIC = 'roc_auc'

resamplers = [
    None,
    RandomUnderSampler(random_state=RANDOM_STATE),
    TomekLinks(random_state=RANDOM_STATE),
    EditedNearestNeighbours(random_state=RANDOM_STATE),
    NeighbourhoodCleaningRule(random_state=RANDOM_STATE),
    RandomOverSampler(random_state=RANDOM_STATE),
    SMOTE(random_state=RANDOM_STATE),
    SMOTETomek(random_state=RANDOM_STATE),
    SMOTEENN(random_state=RANDOM_STATE)
]

PARAM_GRID_MAP = {
    'DummyClassifier': {
        'resampler': resamplers,
        'classifier': [DummyClassifier(random_state=RANDOM_STATE)],
        'classifier__strategy': ['most_frequent', 'stratified', 'uniform']
    },
    'LogisticRegression': {
        'resampler': resamplers,
        'classifier': [LogisticRegression(random_state=RANDOM_STATE)],
        'classifier__C': [0.1, 1.0, 10, 100, 1000]
        # 'classifier__C': [0.1, 1.0, 10]
    },
    'SVC': {
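# A grid map like the one above is typically consumed by an imbalanced-learn
# Pipeline whose step names match the grid keys. The wiring below is an
# illustrative assumption, not code from the original repository.
from imblearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('resampler', None), ('classifier', DummyClassifier())])
search = GridSearchCV(pipe, PARAM_GRID_MAP['LogisticRegression'],
                      scoring=SCORE_METRIC, cv=5)
# search.fit(X, y)  # each grid point swaps in a resampler/classifier pair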
# Legacy imbalanced-learn API: `ratio` was renamed `sampling_strategy`,
# SMOTE's `kind` variants became BorderlineSMOTE/SVMSMOTE, `fit_sample`
# became `fit_resample`, and EasyEnsemble/BalanceCascade were removed
# in later releases.
elif sampling_method == SamplingMethod.under_nearmiss:
    sampler = NearMiss(version=1)
elif sampling_method == SamplingMethod.under_ncr:
    sampler = NeighbourhoodCleaningRule()
elif sampling_method == SamplingMethod.over_random:
    sampler = RandomOverSampler(ratio=ratio)
elif sampling_method == SamplingMethod.over_smote:
    sampler = SMOTE(ratio=ratio, kind='regular')
elif sampling_method == SamplingMethod.over_smoteb:
    sampler = SMOTE(ratio=ratio, kind='borderline1')
elif sampling_method == SamplingMethod.over_smotesv:
    sampler = SMOTE(ratio=ratio, kind='svm')
elif sampling_method == SamplingMethod.overunder_smote_tomek:
    sampler = SMOTETomek(ratio=ratio)
elif sampling_method == SamplingMethod.overunder_smote_enn:
    sampler = SMOTEENN(ratio=ratio)
elif sampling_method == SamplingMethod.ensemble_easy:
    sampler = EasyEnsemble()
elif sampling_method == SamplingMethod.ensemble_bc:
    sampler = BalanceCascade()
else:
    raise ValueError("Unknown Sampling Method %s" % sampling_method)

# Get the newly sampled features.
X, y = sampler.fit_sample(X_train, y_train)
logger.info("Original Samples : %d", X_train.shape[0])
logger.info("New Samples      : %d", X.shape[0])
# Store the new features in the model.
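# The branch above targets the pre-0.4 imbalanced-learn API. A sketch of the
# same choices against the current API (my mapping, not the original repo's
# code): `ratio` became `sampling_strategy`, SMOTE's `kind` variants became
# dedicated classes, and `fit_sample` became `fit_resample`.
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

X_train, y_train = make_classification(n_samples=500, weights=[0.9, 0.1],
                                       random_state=0)
ratio = 0.5  # plays the role of the old `ratio` argument

samplers = {
    'over_smote': SMOTE(sampling_strategy=ratio),             # kind='regular'
    'over_smoteb': BorderlineSMOTE(sampling_strategy=ratio),  # kind='borderline1'
    'over_smotesv': SVMSMOTE(sampling_strategy=ratio),        # kind='svm'
    'overunder_smote_tomek': SMOTETomek(sampling_strategy=ratio),
    'overunder_smote_enn': SMOTEENN(sampling_strategy=ratio),
}
X, y = samplers['overunder_smote_enn'].fit_resample(X_train, y_train)
# EasyEnsemble survives as the EasyEnsembleClassifier estimator;
# BalanceCascade was removed without a direct sampler replacement.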
def init_SMOTEEN(self, sampling_strategy, ratio, n_jobs):
    """Create a SMOTEENN sampler object."""
    # `ratio` is the deprecated predecessor of `sampling_strategy`;
    # passing both only makes sense on older imbalanced-learn releases.
    self.object = combine.SMOTEENN(random_state=self.random_state,
                                   sampling_strategy=sampling_strategy,
                                   ratio=ratio,
                                   n_jobs=n_jobs)
    self.ratio = ratio
    self.sampling_strategy = sampling_strategy
    self.n_jobs = n_jobs
def __init__(self, operator=None, sampling_strategy='auto',
             random_state=None, smote=None, enn=None):
    if operator is None:
        raise ValueError("Operator is a required argument.")
    self._hyperparams = {
        'sampling_strategy': sampling_strategy,
        'random_state': random_state,
        'smote': smote,
        'enn': enn}
    resampler_instance = OrigModel(**self._hyperparams)
    super(SMOTEENNImpl, self).__init__(
        operator=operator,
        resampler=resampler_instance)
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from imblearn.combine import SMOTEENN

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=100, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE + ENN
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=0.5)