Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
_sampling_type = 'bypass'
def _fit_resample(self, X, y):
return X, y
# 2x2 grid of axes: the first panel shows the data through FakeSampler,
# the remaining three show one over-sampler each.
fig, axes_grid = plt.subplots(2, 2, figsize=(15, 15))
(ax1, ax2), (ax3, ax4) = axes_grid
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))

# Reference panel. The pipeline is built here but only the sampler is
# handed to plot_resampling.
sampler = FakeSampler()
clf = make_pipeline(sampler, LinearSVC())
plot_resampling(X, y, sampler, ax1)
ax1.set_title('Original data - y={}'.format(Counter(y)))

# One panel per over-sampling strategy, paired with the remaining axes.
ax_arr = (ax2, ax3, ax4)
over_samplers = (
    RandomOverSampler(random_state=0),
    SMOTE(random_state=0),
    ADASYN(random_state=0),
)
for ax, sampler in zip(ax_arr, over_samplers):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_resampling(X, y, sampler, ax)
    title = 'Resampling using {}'.format(sampler.__class__.__name__)
    ax.set_title(title)
fig.tight_layout()
###############################################################################
# The following plot illustrates the difference between ADASYN and SMOTE. ADASYN
# will focus on the samples which are difficult to classify with a
# nearest-neighbors rule while regular SMOTE will not make any distinction.
# Therefore, the decision function differs depending on the algorithm.
# Single row of three axes, created with figsize=(20, 6).
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
# Rebuild the dataset with n_samples=10000 and weights (0.01, 0.05, 0.94).
# NOTE(review): create_dataset is defined outside this view; presumably the
# weights are per-class proportions — confirm against its definition.
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
self.pos_samples = pos_samples
self.ratio_sampler = None
super(ModifiedRandomOverSampler, self).__init__(random_state=random_state)
def fit(self, X, y):
    """Create the inner ``RandomOverSampler`` and fit it on ``X``, ``y``.

    The sampler is given a ``ratio`` dict asking for ``self.pos_samples``
    entries of class 1 and, for class 0, as many entries as ``y`` already
    contains.  It is stored on ``self.ratio_sampler`` before being fitted.

    Returns ``self``.
    """
    n_positive = self.pos_samples
    negative_labels = y[y == 0]
    target_counts = {0: len(negative_labels), 1: n_positive}
    self.ratio_sampler = RandomOverSampler(
        random_state=self.random_state,
        ratio=target_counts,
    )
    self.ratio_sampler.fit(X, y)
    return self
def sample(self, X, y):
    """Forward ``X`` and ``y`` to ``self.ratio_sampler.sample`` and return its result."""
    delegate = self.ratio_sampler
    return delegate.sample(X, y)
class ModifiedSMOTE(SMOTE):
    """SMOTE subclass that delegates to an inner SMOTE with explicit targets.

    ``fit`` builds an inner ``SMOTE`` whose ``ratio`` dict requests
    ``pos_samples`` entries for class 1 and the observed count for class 0;
    ``sample`` forwards to that inner sampler.
    """

    def __init__(self, pos_samples, random_state=0):
        """Store the class-1 target count and initialise the parent SMOTE."""
        self.pos_samples = pos_samples
        # Set by fit(); stays None until then.
        self.ratio_sampler = None
        super(ModifiedSMOTE, self).__init__(random_state=random_state)

    def fit(self, X, y):
        """Build the inner SMOTE sampler, fit it on ``X``, ``y``; return ``self``."""
        n_positive = self.pos_samples
        n_negative = len(y[y == 0])
        target_counts = {0: n_negative, 1: n_positive}
        inner = SMOTE(random_state=self.random_state, ratio=target_counts)
        self.ratio_sampler = inner
        inner.fit(X, y)
        return self

    def sample(self, X, y):
        """Return whatever the inner sampler's ``sample`` returns for ``X``, ``y``."""
        inner = self.ratio_sampler
        return inner.sample(X, y)
def fit(self, X, y):
    """Prepare and fit the inner ``SMOTE`` sampler, then return ``self``.

    The ``ratio`` dict passed to ``SMOTE`` maps class 1 to
    ``self.pos_samples`` and class 0 to the number of entries of ``y``
    equal to 0.  The sampler is kept on ``self.ratio_sampler``.
    """
    requested_positive = self.pos_samples
    is_negative = y == 0
    ratio = {0: len(y[is_negative]), 1: requested_positive}
    self.ratio_sampler = SMOTE(random_state=self.random_state, ratio=ratio)
    self.ratio_sampler.fit(X, y)
    return self
def fit(self, X, y):
    """Fit ``self.n_estimators`` models, each on a SMOTE-resampled bootstrap.

    In every bagging round the rows labelled 0 and the rows labelled 1 are
    each sampled with replacement, using a fraction that cycles through
    0.1, 0.2, ..., 1.0 as the round index grows.  The two samples are
    stacked, passed through ``SMOTE_IMB(...).fit_resample`` and a new
    ``DT`` model is fitted on the result.

    Parameters
    ----------
    X : data accepted by ``pandas.DataFrame``
        Feature matrix.  A column named ``'label'`` is added to the frame
        built from it.
    y : array-like
        Labels assigned row-wise to that frame; only rows labelled 0 or 1
        are sampled.

    Returns
    -------
    self
        The fitted models are available in ``self.model_list``.
    """
    self.model_list = []
    df = pd.DataFrame(X)
    df['label'] = y
    df_maj = df[df['label'] == 0]
    df_min = df[df['label'] == 1]
    cols = df.columns.tolist()
    cols.remove('label')
    for ibagging in range(self.n_estimators):
        # Bootstrap fraction: 0.1 on round 0, up to 1.0 on round 9, then repeats.
        b = min(0.1 * ((ibagging % 10) + 1), 1)
        train_maj = df_maj.sample(frac=b, replace=True)
        train_min = df_min.sample(frac=b, replace=True)
        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
        # rows in the same order and keeps the original index labels.
        df_k = pd.concat([train_maj, train_min])
        # Cap the neighbour count below the number of sampled label-1 rows.
        k_neighbors = min(5, len(train_min) - 1)
        X_train, y_train = SMOTE_IMB(k_neighbors=k_neighbors).fit_resample(
            df_k[cols], df_k['label'])
        model = DT().fit(X_train, y_train)
        self.model_list.append(model)
    return self
# Colour palette; entries 0 and 2 are used as the face colours of the two classes below.
palette = sns.color_palette()
# Generate a two-class dataset: 5000 samples, 20 features, class weights
# [0.1, 0.9], with a fixed random_state.
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
n_informative=3, n_redundant=1, flip_y=0,
n_features=20, n_clusters_per_class=1,
n_samples=5000, random_state=10)
# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit the PCA on X and project X onto its two components for plotting
X_vis = pca.fit_transform(X)
# Apply Borderline SMOTE 1
sm = SMOTE(kind='borderline1')
X_resampled, y_resampled = sm.fit_sample(X, y)
# Project the resampled data with the PCA already fitted on X, so both
# scatter plots use the same two components.
X_res_vis = pca.transform(X_resampled)
# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
# Left panel: original data, one scatter call per class.
# NOTE(review): almost_black is defined outside this view.
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')
# Right panel: resampled data, class 0 first.
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
label="Class #0", alpha=.5, edgecolor=almost_black,
facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
def __init__(self, **kwargs):
    """Build a ``SMOTE`` from ``kwargs`` and pass it to the parent initialiser.

    ``random_state`` is always taken from ``RANDOM_SEED[BALANCE_SMOTE]``;
    the resulting sampler is handed to the parent ``__init__`` together
    with ``BALANCE_SMOTE``.
    """
    seed = RANDOM_SEED[BALANCE_SMOTE]
    smote = SMOTE(random_state=seed, **kwargs)
    super(SmoteSampling, self).__init__(smote, BALANCE_SMOTE)
[{
'n_neighbors': [3,5,8]
}]
)
]
oversampling_methods = [
('None',None),
('RandomOverSampler', RandomOverSampler()),
(
'SMOTE', SMOTE(),
[{
'k_neighbors': [3,5,20]
}]
),
(
'B1-SMOTE', SMOTE(kind='borderline1'),
[{
'k_neighbors': [3,5,20]
}]
),
(
'B2-SMOTE', SMOTE(kind='borderline2'),
[{
'k_neighbors': [3,5,20]
}]
),
(
'KMeansSMOTE', KMeansSMOTE(),
[
{
'imbalance_ratio_threshold': [1,float('Inf')],
'density_power': [0, 2, None], # None corresponds to n_features