Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Single seed shared by the imbalance step, the split and the classifier.
RANDOM_STATE = 42

# Load the iris data and resample it with an explicit per-class
# ``sampling_strategy`` so that the classes are no longer balanced.
iris = load_iris()
X, y = make_imbalance(
    iris.data,
    iris.target,
    sampling_strategy={0: 25, 1: 50, 2: 50},
    random_state=RANDOM_STATE,
)

# Hold out a test split and report the class counts on each side.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE
)
print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Pipeline: NearMiss-2 under-sampling followed by a linear SVM.
pipeline = make_pipeline(
    NearMiss(version=2),
    LinearSVC(random_state=RANDOM_STATE),
)
pipeline.fit(X_train, y_train)

# Classify the held-out samples and print the imbalanced report.
y_pred = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))
# ``NearMiss`` algorithms implement some heuristic rules in order to select
# samples. NearMiss-1 selects samples from the majority class for which the
# average distance of the :math:`k` nearest samples of the minority class is
# the smallest. NearMiss-2 selects the samples from the majority class for
# which the average distance to the farthest samples of the negative class is
# the smallest. NearMiss-3 is a 2-step algorithm: first, for each minority
# sample, their :math:`m` nearest-neighbors will be kept; then, the majority
# samples selected are the ones for which the average distance to the :math:`k`
# nearest neighbors is the largest.
# One row of axes per NearMiss version: the first axis of each row gets the
# decision function, the second one the resampled data.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(
    3, 2, figsize=(15, 25)
)
X, y = create_dataset(n_samples=5000, weights=(0.1, 0.2, 0.7), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
samplers = tuple(NearMiss(version=v) for v in (1, 2, 3))
for ax, sampler in zip(ax_arr, samplers):
    # Fit a sampler + linear SVM pipeline on the full dataset.
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    sampler_name = sampler.__class__.__name__

    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title(
        'Decision function for {}-{}'.format(sampler_name, sampler.version)
    )

    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title(
        'Resampling using {}-{}'.format(sampler_name, sampler.version)
    )
fig.tight_layout()
###############################################################################
# ``EditedNearestNeighbours`` removes samples of the majority class whose
# class differs from the one of their nearest-neighbors. This sieve can be
# repeated which is the principle of the
def under_sample_NearMiss(train_inputs, train_targets):
    """Under-sample the training data with a ``NearMiss`` sampler.

    A ``NearMiss(random_state=32)`` instance is handed, together with the
    inputs and targets, to ``_sampler_helper``; the two values that helper
    returns are passed back as an ``(inputs, targets)`` tuple.
    """
    # NOTE(review): ``NearMiss`` may only accept ``random_state`` in older
    # imbalanced-learn releases -- confirm against the pinned version.
    resampled_inputs, resampled_targets = _sampler_helper(
        NearMiss(random_state=32), train_inputs, train_targets
    )
    return resampled_inputs, resampled_targets
# Locate the first position in ``uv`` that equals the target value and the
# first position that differs from it.
# NOTE(review): presumably ``uv`` / ``uc`` are the unique labels and their
# counts (e.g. from ``np.unique(..., return_counts=True)``) -- confirm against
# the code above this fragment.
target_index = np.where(uv == target_value)[0][0]
nontarget_index = np.where(uv != target_value)[0][0]
# Count at the first non-target position relative to the count at the target
# position, minus one.
ratio = (uc[nontarget_index] / uc[target_index]) - 1.0
logger.info("Sampling Ratio for target %s [%r]: %f",
target, target_value, ratio)
# Choose the sampling method.
# Only the over-sampling and combined over/under branches pass ``ratio``; the
# under-sampling and ensemble branches build their samplers with defaults.
# NOTE(review): ``ratio=`` and SMOTE's ``kind=`` look like the pre-0.4
# imbalanced-learn API -- confirm the pinned version before upgrading.
# NOTE(review): no ``else`` branch is visible in this fragment, so ``sampler``
# is not assigned here when none of the methods match.
if sampling_method == SamplingMethod.under_random:
sampler = RandomUnderSampler()
elif sampling_method == SamplingMethod.under_tomek:
sampler = TomekLinks()
elif sampling_method == SamplingMethod.under_cluster:
sampler = ClusterCentroids()
elif sampling_method == SamplingMethod.under_nearmiss:
sampler = NearMiss(version=1)
elif sampling_method == SamplingMethod.under_ncr:
sampler = NeighbourhoodCleaningRule()
elif sampling_method == SamplingMethod.over_random:
sampler = RandomOverSampler(ratio=ratio)
elif sampling_method == SamplingMethod.over_smote:
sampler = SMOTE(ratio=ratio, kind='regular')
elif sampling_method == SamplingMethod.over_smoteb:
sampler = SMOTE(ratio=ratio, kind='borderline1')
elif sampling_method == SamplingMethod.over_smotesv:
sampler = SMOTE(ratio=ratio, kind='svm')
elif sampling_method == SamplingMethod.overunder_smote_tomek:
sampler = SMOTETomek(ratio=ratio)
elif sampling_method == SamplingMethod.overunder_smote_enn:
sampler = SMOTEENN(ratio=ratio)
elif sampling_method == SamplingMethod.ensemble_easy:
sampler = EasyEnsemble()
palette = sns.color_palette()

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply NearMiss-3. ``fit_resample`` is used instead of the older
# ``fit_sample`` spelling so that this snippet matches the other
# ``fit_resample`` calls in this file.
nm3 = NearMiss(version=3)
X_resampled, y_resampled = nm3.fit_resample(X, y)
# Project the resampled points with the PCA fitted on the original data so
# both panels share the same 2D space.
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class.
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: the resampled data, class 0.
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
palette = sns.color_palette()

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply NearMiss-1. ``fit_resample`` is used instead of the older
# ``fit_sample`` spelling so that this snippet matches the other
# ``fit_resample`` calls in this file.
nm1 = NearMiss(version=1)
X_resampled, y_resampled = nm1.fit_resample(X, y)
# Project the resampled points with the PCA fitted on the original data so
# both panels share the same 2D space.
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class.
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: the resampled data, class 0.
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
def check_samplers_pandas(name, Sampler):
    """Check that a sampler handles pandas DataFrame and Series inputs.

    Each sampler is run once on pandas containers and once on the underlying
    NumPy arrays. The pandas run must return a DataFrame / Series that keeps
    the column names and the series name, and whose values match the NumPy
    run for both the features and the target.

    Parameters
    ----------
    name : str
        Name of the sampler; not used in the body, kept for the existing
        call signature.
    Sampler : callable
        Sampler class under test; called with no arguments, or with
        ``version=...`` when it produces a ``NearMiss`` instance.
    """
    pd = pytest.importorskip("pandas")
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y_pd = pd.Series(y, name="class")

    # NearMiss is exercised once per version; any other sampler once with
    # its default parameters.
    if isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    else:
        samplers = [Sampler()]

    for sampler in samplers:
        set_random_state(sampler)
        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
        X_res, y_res = sampler.fit_resample(X, y)

        # check that we return a pandas dataframe if a dataframe was given in
        assert isinstance(X_res_pd, pd.DataFrame)
        assert isinstance(y_res_pd, pd.Series)
        assert X_pd.columns.to_list() == X_res_pd.columns.to_list()
        assert y_pd.name == y_res_pd.name
        assert_allclose(X_res_pd.to_numpy(), X_res)
        # The resampled target must match as well, not only the features.
        assert_allclose(y_res_pd.to_numpy(), y_res)
def init_NearMiss(self, sampling_strategy, n_jobs):
    """Create a NearMiss sampler object and record its settings.

    An ``under_sampling.NearMiss`` built from ``sampling_strategy``,
    ``self.random_state`` and ``n_jobs`` is stored on ``self.object``;
    ``sampling_strategy`` and ``n_jobs`` are then kept on the instance.
    """
    sampler_kwargs = {
        "sampling_strategy": sampling_strategy,
        "random_state": self.random_state,
        "n_jobs": n_jobs,
    }
    self.object = under_sampling.NearMiss(**sampler_kwargs)
    self.sampling_strategy, self.n_jobs = sampling_strategy, n_jobs
# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply NearMiss, one sampler per version.
version = [1, 2, 3]
nm = [NearMiss(version=v) for v in version]

X_resampled = []
y_resampled = []
X_res_vis = []
idx_samples_removed = []
for method in nm:
    # The kept indices are read from the fitted ``sample_indices_`` attribute
    # rather than requested through ``return_indices=True``, so
    # ``fit_resample`` returns the usual ``(X, y)`` pair.
    X_res, y_res = method.fit_resample(X, y)
    idx_res = method.sample_indices_
    X_resampled.append(X_res)
    y_resampled.append(y_res)
    X_res_vis.append(pca.transform(X_res))
    # NOTE(review): this rebinds ``idx_samples_removed`` on every iteration,
    # so only the last version's removed indices survive and the empty list
    # above is never used -- confirm whether a per-version append was meant.
    idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                       idx_res)

# Two subplots, unpack the axes array immediately
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

# ax1 is left out of ``ax_res``; the remaining three axes match the three
# entries of ``version``.
ax_res = [ax2, ax3, ax4]