# These imports (and the ``create_dataset`` / ``plot_decision_function``
# helpers sketched after this snippet) are assumed by the example below.
from collections import Counter

import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC

from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.pipeline import make_pipeline

# ADASYN will focus on the samples which are difficult to classify with a
# nearest-neighbors rule, while regular SMOTE will not make any distinction.
# Therefore, the decision function will differ depending on the algorithm.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
sampler = SMOTE()
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
sampler = ADASYN()
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax3)
ax3.set_title('Decision function for {}'.format(sampler.__class__.__name__))
fig.tight_layout()
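# ``create_dataset`` and ``plot_decision_function`` are not defined in this
# fragment; they come from the surrounding example. A minimal sketch of what
# they might look like (two informative features via ``make_classification``
# and a mesh-grid contour of the classifier's predictions) is:
import numpy as np
from sklearn.datasets import make_classification


def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,
                   class_sep=0.8, n_clusters=1):
    # Three-class, two-feature imbalanced dataset suitable for 2-D plotting.
    return make_classification(n_samples=n_samples, n_features=2,
                               n_informative=2, n_redundant=0, n_repeated=0,
                               n_classes=n_classes,
                               n_clusters_per_class=n_clusters,
                               weights=list(weights),
                               class_sep=class_sep, random_state=0)


def plot_decision_function(X, y, clf, ax):
    # Evaluate the fitted classifier on a grid and overlay the samples.
    plot_step = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    ax.contourf(xx, yy, Z.reshape(xx.shape), alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k')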
###############################################################################
# Due to those sampling particularities, it can give rise to some specific
# issues as illustrated below.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)
ax_arr = ((ax1, ax2), (ax3, ax4))
for ax, sampler in zip(ax_arr, (SMOTE(random_state=0),
                                ADASYN(random_state=0))):
    # Fit each sampler in a pipeline and plot its decision function on the
    # first axis of the corresponding pair.
    clf = make_pipeline(sampler, LinearSVC()).fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
def init_ADASYN(self, sampling_strategy, ratio, n_neighbors, n_jobs):
    """Create an ADASYN sampler object."""
    self.object = over_sampling.ADASYN(sampling_strategy=sampling_strategy,
                                       random_state=self.random_state,
                                       ratio=ratio,
                                       n_neighbors=n_neighbors,
                                       n_jobs=n_jobs)
    self.sampling_strategy = sampling_strategy
    self.ratio = ratio
    self.n_neighbors = n_neighbors
    self.n_jobs = n_jobs
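# Note: the wrapper above forwards both ``sampling_strategy`` and ``ratio``.
# In recent imbalanced-learn releases ``ratio`` was deprecated (0.4) and then
# removed (0.6) in favour of ``sampling_strategy``, so a construction along
# these lines (a sketch, not part of the original wrapper) avoids it:
from imblearn import over_sampling

ada = over_sampling.ADASYN(sampling_strategy='auto',
                           random_state=0,
                           n_neighbors=5,
                           n_jobs=1)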
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from imblearn.over_sampling import ADASYN

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)
# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)
# Apply the random over-sampling
ada = ADASYN()
X_resampled, y_resampled = ada.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)
# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5)
def over_sample_ADASYN(train_inputs, train_targets):
    sampler = ADASYN(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs, train_targets)
    return train_inputs, train_targets
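# ``_sampler_helper`` is not shown in this fragment. A minimal, hypothetical
# version would simply delegate to the sampler's ``fit_resample``:
def _sampler_helper(sampler, inputs, targets):
    # Return the over-sampled inputs/targets produced by imbalanced-learn.
    return sampler.fit_resample(inputs, targets)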
def __init__(self, operator=None, sampling_strategy='auto', random_state=None, n_neighbors=5, n_jobs=1):
    if operator is None:
        raise ValueError("Operator is a required argument.")
    self._hyperparams = {
        'sampling_strategy': sampling_strategy,
        'random_state': random_state,
        'n_neighbors': n_neighbors,
        'n_jobs': n_jobs}
    resampler_instance = OrigModel(**self._hyperparams)
    super(ADASYNImpl, self).__init__(
        operator=operator,
        resampler=resampler_instance)
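# ``OrigModel`` is not defined in this fragment; in this wrapper pattern it
# presumably aliases imbalanced-learn's ADASYN, e.g.:
from imblearn.over_sampling import ADASYN as OrigModel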
majority_person = 1871 # 530 photos of George W Bush
minority_person = 531 # 29 photos of Bill Clinton
majority_idxs = np.flatnonzero(data.target == majority_person)
minority_idxs = np.flatnonzero(data.target == minority_person)
idxs = np.hstack((majority_idxs, minority_idxs))
X = data.data[idxs]
y = data.target[idxs]
y[y == majority_person] = 0
y[y == minority_person] = 1
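# Several names in this fragment are defined elsewhere: ``data`` (the LFW
# faces), ``RANDOM_STATE`` and ``DummySampler``. A sketch of the assumed
# setup, with ``DummySampler`` acting as a no-resampling baseline, is:
import numpy as np
from sklearn import neighbors
from sklearn.datasets import fetch_lfw_people

from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE
from imblearn.pipeline import make_pipeline

RANDOM_STATE = 42  # arbitrary choice; the fragment does not show the value used

data = fetch_lfw_people()  # the target ids above assume the unfiltered LFW set


class DummySampler:
    """Pass-through 'sampler' used as the no-resampling baseline."""

    def sample(self, X, y):
        return X, y

    def fit(self, X, y):
        return self

    def fit_resample(self, X, y):
        return self.sample(X, y)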
classifier = ['3NN', neighbors.KNeighborsClassifier(3)]
samplers = [
    ['Standard', DummySampler()],
    ['ADASYN', ADASYN(random_state=RANDOM_STATE)],
    ['ROS', RandomOverSampler(random_state=RANDOM_STATE)],
    ['SMOTE', SMOTE(random_state=RANDOM_STATE)],
]
pipelines = [
    ['{}-{}'.format(sampler[0], classifier[0]),
     make_pipeline(sampler[1], classifier[1])]
    for sampler in samplers
]
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for name, pipeline in pipelines:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
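# The fragment stops mid-loop. A self-contained sketch of the cross-validated
# mean-ROC computation it appears to be setting up (assuming a StratifiedKFold
# and scikit-learn's ``roc_curve``/``auc``) is:
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=3)
for name, pipeline in pipelines:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for train, test in cv.split(X, y):
        # Fit on the training folds, score the held-out fold.
        probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    ax.plot(mean_fpr, mean_tpr, linestyle='--',
            label='{} (area = {:.2f})'.format(name, auc(mean_fpr, mean_tpr)))
ax.legend(loc='lower right')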
elif by == 'NCR':
    sampler = NeighbourhoodCleaningRule(random_state=random_state)
elif by == 'Tomek':
    sampler = TomekLinks(random_state=random_state)
elif by == 'ALLKNN':
    sampler = AllKNN(random_state=random_state)
elif by == 'OSS':
    sampler = OneSidedSelection(random_state=random_state)
elif by == 'NM':
    sampler = NearMiss(random_state=random_state)
elif by == 'CC':
    sampler = ClusterCentroids(random_state=random_state)
elif by == 'SMOTE':
    sampler = SMOTE(random_state=random_state)
elif by == 'ADASYN':
    sampler = ADASYN(random_state=random_state)
elif by == 'BorderSMOTE':
    sampler = BorderlineSMOTE(random_state=random_state)
elif by == 'SMOTEENN':
    sampler = SMOTEENN(random_state=random_state)
elif by == 'SMOTETomek':
    sampler = SMOTETomek(random_state=random_state)
elif by == 'ORG':
    sampler = None
else:
    # ``Error`` is presumably a custom exception defined in the source module.
    raise Error('Unexpected \'by\' type {}'.format(by))
if by != 'ORG':
    X_train, y_train = sampler.fit_resample(X, y)
else:
    X_train, y_train = X, y
if visualize:
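    # The body of this branch is not included in the fragment; a hypothetical
    # sketch (project the resampled training set to 2-D with PCA and scatter
    # each class) might be:
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.decomposition import PCA

    emb = PCA(n_components=2).fit_transform(X_train)
    for label in np.unique(y_train):
        plt.scatter(emb[y_train == label, 0], emb[y_train == label, 1],
                    alpha=0.5, label='Class {}'.format(label))
    plt.legend()
    plt.title('Training set resampled with {}'.format(by))
    plt.show()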