def fit(self, X, y, by, random_state=None, visualize=False):
    '''
    by: String
        The method used to perform re-sampling.
        Currently supported: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
        'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
        'ORG']
    '''
    if by == 'RUS':
        sampler = RandomUnderSampler(random_state=random_state)
    elif by == 'CNN':
        sampler = CondensedNearestNeighbour(random_state=random_state)
    elif by == 'ENN':
        sampler = EditedNearestNeighbours(random_state=random_state)
    elif by == 'NCR':
        sampler = NeighbourhoodCleaningRule(random_state=random_state)
    elif by == 'Tomek':
        sampler = TomekLinks(random_state=random_state)
    elif by == 'ALLKNN':
        sampler = AllKNN(random_state=random_state)
    elif by == 'OSS':
        sampler = OneSidedSelection(random_state=random_state)
    elif by == 'NM':
        sampler = NearMiss(random_state=random_state)
    elif by == 'CC':
        sampler = ClusterCentroids(random_state=random_state)
    elif by == 'SMOTE':
        sampler = SMOTE(random_state=random_state)
    elif by == 'ADASYN':
        sampler = ADASYN(random_state=random_state)
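    # The snippet is truncated above; a minimal sketch of the remaining branches
    # named in the docstring, assuming imbalanced-learn's BorderlineSMOTE,
    # SMOTEENN and SMOTETomek, and treating 'ORG' as "no resampling".
    # These names and the final dispatch are assumptions, not the original code.
    elif by == 'BorderSMOTE':
        sampler = BorderlineSMOTE(random_state=random_state)
    elif by == 'SMOTEENN':
        sampler = SMOTEENN(random_state=random_state)
    elif by == 'SMOTETomek':
        sampler = SMOTETomek(random_state=random_state)
    elif by == 'ORG':
        sampler = None
    else:
        raise ValueError('Unsupported resampling method: {}'.format(by))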

# Dataset generation (the start of this call is missing from the snippet;
# the arguments before n_features are assumed)
X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                           n_features=5, n_clusters_per_class=1,
                           n_samples=100, random_state=10)
# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)
# Four subplots (2 x 2), unpack the axes array immediately
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
c0, c1 = plot_resampling(ax1, X_vis, y, 'Original set')
# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours(return_indices=True)
X_resampled, y_resampled, idx_resampled = enn.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))
print(reduction_str)
c3 = ax2.scatter(X_vis[idx_samples_removed, 0],
                 X_vis[idx_samples_removed, 1],
                 alpha=.2, label='Removed samples', c='g')
plot_resampling(ax2, X_res_vis, y_resampled, 'ENN - ' + reduction_str)
# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours(return_indices=True)
X_resampled, y_resampled, idx_resampled = renn.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)
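# The original snippet stops here; a plausible continuation that mirrors the
# ENN panel above for the RENN result (a sketch, not the original code):
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))
print(reduction_str)
ax3.scatter(X_vis[idx_samples_removed, 0],
            X_vis[idx_samples_removed, 1],
            alpha=.2, label='Removed samples', c='g')
plot_resampling(ax3, X_res_vis, y_resampled, 'RENN - ' + reduction_str)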
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)
# Three subplots, unpack the axes array immediately
f, (ax1, ax2, ax3) = plt.subplots(1, 3)
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')
# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('ENN')
# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

def under_sample_EditedNearestNeighbours(train_inputs, train_targets):
    sampler = EditedNearestNeighbours(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs, train_targets)
    return train_inputs, train_targets
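# _sampler_helper is not shown in this snippet; a minimal sketch of what such a
# helper would typically do (hypothetical implementation, not the original):
def _sampler_helper(sampler, inputs, targets):
    # Delegate to the imbalanced-learn sampler and return the resampled data.
    return sampler.fit_resample(inputs, targets)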

# Fragment that appears to come from SMOTEENN._validate_estimator; the lines
# before sampling_strategy were cut off and are reconstructed here (assumed):
else:
    self.smote_ = SMOTE(
        sampling_strategy=self.sampling_strategy,
        random_state=self.random_state,
        n_jobs=self.n_jobs,
    )
if self.enn is not None:
    if isinstance(self.enn, EditedNearestNeighbours):
        self.enn_ = clone(self.enn)
    else:
        raise ValueError(
            "enn needs to be an EditedNearestNeighbours."
            " Got {} instead.".format(type(self.enn))
        )
# Otherwise create a default EditedNearestNeighbours
else:
    self.enn_ = EditedNearestNeighbours(
        sampling_strategy="all", n_jobs=self.n_jobs
    )
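# Usage sketch for the combined sampler validated above: SMOTEENN first
# over-samples with SMOTE, then cleans with EditedNearestNeighbours. The custom
# `enn` argument is optional; X and y are assumed to exist.
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

sme = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'),
               random_state=0)
X_res, y_res = sme.fit_resample(X, y)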

def __init__(self, operator=None, sampling_strategy='auto', random_state=None,
             n_neighbors=3, kind_sel='all', n_jobs=1):
    if operator is None:
        raise ValueError("Operator is a required argument.")
    self._hyperparams = {
        'sampling_strategy': sampling_strategy,
        'random_state': random_state,
        'n_neighbors': n_neighbors,
        'kind_sel': kind_sel,
        'n_jobs': n_jobs}
    resampler_instance = OrigModel(**self._hyperparams)
    super(EditedNearestNeighboursImpl, self).__init__(
        operator=operator,
        resampler=resampler_instance)

def create_sampler(sampler_name, random_state=None):
    if sampler_name is None or sampler_name == 'None':
        return None
    if sampler_name.lower() == 'randomundersampler':
        return RandomUnderSampler(random_state=random_state)
    if sampler_name.lower() == 'tomeklinks':
        return TomekLinks(random_state=random_state)
    if sampler_name.lower() == 'enn':
        return EditedNearestNeighbours(random_state=random_state)
    if sampler_name.lower() == 'ncl':
        return NeighbourhoodCleaningRule(random_state=random_state)
    if sampler_name.lower() == 'randomoversampler':
        return RandomOverSampler(random_state=random_state)
    if sampler_name.lower() == 'smote':
        return SMOTE(random_state=random_state)
    if sampler_name.lower() == 'smotetomek':
        return SMOTETomek(random_state=random_state)
    if sampler_name.lower() == 'smoteenn':
        return SMOTEENN(random_state=random_state)
    raise ValueError('Unsupported value \'%s\' for sampler' % sampler_name)
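
# Example use of the factory above (X and y are assumed to be the training
# data; the lower-cased names mirror the branches in create_sampler):
sampler = create_sampler('enn', random_state=42)
if sampler is not None:
    # fit_resample is the current imbalanced-learn API (fit_sample in older releases)
    X_res, y_res = sampler.fit_resample(X, y)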
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)
# Four subplots in a row, unpack the axes array immediately
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4)
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')
# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('ENN')
# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)

from etl import ETLUtils
from etl import sampler_factory
from nlp import nlp_utils
from topicmodeling.context import review_metrics_extractor
from utils.constants import Constants

RANDOM_STATE = 0
SCORE_METRIC = 'accuracy'
# SCORE_METRIC = 'roc_auc'

resamplers = [
    None,
    RandomUnderSampler(random_state=RANDOM_STATE),
    TomekLinks(random_state=RANDOM_STATE),
    EditedNearestNeighbours(random_state=RANDOM_STATE),
    NeighbourhoodCleaningRule(random_state=RANDOM_STATE),
    RandomOverSampler(random_state=RANDOM_STATE),
    SMOTE(random_state=RANDOM_STATE),
    SMOTETomek(random_state=RANDOM_STATE),
    SMOTEENN(random_state=RANDOM_STATE)
]

PARAM_GRID_MAP = {
    'DummyClassifier': {
        'resampler': resamplers,
        'classifier': [DummyClassifier(random_state=RANDOM_STATE)],
        'classifier__strategy': ['most_frequent', 'stratified', 'uniform']
    },
    'LogisticRegression': {
        'resampler': resamplers,
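# The grid above is truncated; a sketch of how such a map is typically consumed,
# using imbalanced-learn's Pipeline so resampling is applied only to the
# training folds during cross-validation. The pipeline layout and the data
# X, y are assumptions, not the original project's code.
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('resampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('classifier', LogisticRegression()),
])
search = GridSearchCV(pipeline, PARAM_GRID_MAP['LogisticRegression'],
                      scoring=SCORE_METRIC, cv=5)
# search.fit(X, y) would then evaluate every resampler in `resamplers`.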