###############################################################################
# Balancing the class before classification
###############################################################################
###############################################################################
# To improve the prediction of class \#3, it could be interesting to apply
# balancing before training the naive Bayes classifier. Therefore, we will
# use a ``RandomUnderSampler`` to equalize the number of samples in all the
# classes before training.
#
# It is also important to note that we are using the ``make_pipeline`` function
# implemented in imbalanced-learn (imported here as ``make_pipeline_imb``) to
# properly handle the samplers.
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

pipe = make_pipeline_imb(TfidfVectorizer(),
                         RandomUnderSampler(),
                         MultinomialNB())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
###############################################################################
# Although the results are almost identical, it can be seen that the resampling
# corrected the poor recall of class \#3 at the cost of slightly reducing the
# metrics of the other classes. Overall, however, the results are slightly
# better.
print(classification_report_imbalanced(y_test, y_pred))
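###############################################################################
# A minimal sketch (with synthetic data, not the corpus used above) of why the
# imbalanced-learn pipeline is needed: samplers such as ``RandomUnderSampler``
# resample only during ``fit``, so the data passed to ``predict`` is left
# untouched.
from collections import Counter

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_toy, y_toy = make_classification(n_samples=200, weights=[0.9, 0.1],
                                   random_state=0)
print(Counter(y_toy))  # imbalanced, roughly {0: 180, 1: 20}

X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X_toy, y_toy)
print(Counter(y_res))  # both classes reduced to the minority count

# Inside an imbalanced-learn pipeline the same resampling happens at fit time
# only, while predict() sees the incoming data unchanged.
toy_pipe = make_pipeline_imb(RandomUnderSampler(random_state=0),
                             LogisticRegression())
toy_pipe.fit(X_toy, y_toy)
print(toy_pipe.predict(X_toy[:5]))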
        # Hold out a test split before any balancing.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=(1. - share_train),
            stratify=stratify
        )
        self.dependent = y.name
        if X_label:
            self.independent = X_label
        else:
            self.independent = list(X.columns.values)
        self.balance = balance
        if balance == 'upsample':
            ros = RandomOverSampler()
            X_resample, y_resample = ros.fit_resample(X_train, y_train)
        elif balance == 'downsample':
            rus = RandomUnderSampler()
            X_resample, y_resample = rus.fit_resample(X_train, y_train)
        else:
            X_resample = X
            y_resample = y
        self.X_train, X_test, self.y_train, y_test = train_test_split(
            X_resample,
            y_resample,
            test_size=(1. - share_train),
            stratify=stratify
        )
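# A minimal sketch (hypothetical toy data) of what the 'upsample' and
# 'downsample' branches above do: RandomOverSampler duplicates minority rows,
# while RandomUnderSampler drops majority rows; both end with equal class
# counts.
from collections import Counter

import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

X_demo = pd.DataFrame({"feature": range(10)})
y_demo = pd.Series([0] * 8 + [1] * 2, name="label")

X_up, y_up = RandomOverSampler(random_state=0).fit_resample(X_demo, y_demo)
print(Counter(y_up))    # 8 of each class: minority duplicated

X_down, y_down = RandomUnderSampler(random_state=0).fit_resample(X_demo, y_demo)
print(Counter(y_down))  # 2 of each class: majority trimmed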
from sklearn.decomposition import KernelPCA, TruncatedSVD
from sklearn.kernel_approximation import RBFSampler
from sklearn.preprocessing import Normalizer
pipe = make_pipeline_imb(
    # pipe = make_pipeline(
    TfidfVectorizer(
        min_df=f_min,
        encoding="latin-1",
        decode_error="replace",
        lowercase=True,
        binary=True,           # if args.tf.startswith("bin") else False
        sublinear_tf=False,    # if args.tf.startswith("subl") else False
        stop_words="english",  # if args.stop else None
    ),
    # RandomOverSampler(),
    RandomUnderSampler(),
    # SMOTEENN(random_state=0),
    # SMOTETomek(random_state=42),
    Normalizer(),
    TruncatedSVD(200),
    # KernelPCA(n_components=75, kernel="poly", gamma=10, degree=3, n_jobs=-1),
    # KernelPCA(kernel="rbf", gamma=10, degree=3, n_jobs=-1),
    # RBFSampler(gamma=0.1, random_state=1),
    # SGDClassifier(alpha=.0001,
    #               n_iter=100,
    #               n_jobs=-1,
    #               verbose=100,
    #               epsilon=1,
    #               class_weight='balanced',
    #               # warm_start=True,
    #               penalty='l1'),
    # GaussianProcessClassifier(n_jobs=-1),
)
    def __init__(self, lemmatization=False):
        BugModel.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords({"qawanted"}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
        ]
    def init_RandomUnderSampling(self, sampling_strategy):
        """Create a random under-sampler object."""
        self.object = under_sampling.RandomUnderSampler(
            sampling_strategy=sampling_strategy,
            random_state=self.random_state)
        self.sampling_strategy = sampling_strategy
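# A minimal sketch (toy data) of the ``sampling_strategy`` values the helper
# above passes through; see the imbalanced-learn docs for the full set of
# accepted forms.
from collections import Counter

from imblearn import under_sampling

X_demo = [[i] for i in range(12)]
y_demo = [0] * 9 + [1] * 3

# 'auto' (the default): resample all classes except the minority.
rus = under_sampling.RandomUnderSampler(sampling_strategy="auto", random_state=0)
print(Counter(rus.fit_resample(X_demo, y_demo)[1]))  # {0: 3, 1: 3}

# A float (binary problems only): desired minority/majority ratio after
# resampling.
rus = under_sampling.RandomUnderSampler(sampling_strategy=0.5, random_state=0)
print(Counter(rus.fit_resample(X_demo, y_demo)[1]))  # {0: 6, 1: 3}

# A dict: explicit number of samples to keep per class.
rus = under_sampling.RandomUnderSampler(sampling_strategy={0: 4, 1: 3},
                                        random_state=0)
print(Counter(rus.fit_resample(X_demo, y_demo)[1]))  # {0: 4, 1: 3}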
        if self.n_estimators <= 0:
            raise ValueError(
                "n_estimators must be greater than zero, "
                "got {}.".format(self.n_estimators)
            )

        if self.base_estimator is not None:
            base_estimator = clone(self.base_estimator)
        else:
            base_estimator = clone(default)

        self.base_estimator_ = Pipeline(
            [
                (
                    "sampler",
                    RandomUnderSampler(
                        sampling_strategy=self.sampling_strategy,
                        replacement=self.replacement,
                    ),
                ),
                ("classifier", base_estimator),
            ]
        )
    # First move the bands to the last axis.
    image_data2 = np.transpose(image_data, (1, 2, 0))
    # Check the data shape again; the bands should now be last.
    print('Array shape after transpose, 3D: ', image_data2.shape)
    # Then reshape to 2D: one row per pixel, one column per band.
    pixels = image_data2.reshape(-1, 3)
    print('Array shape after transpose and reshape, 2D: ', pixels.shape)
    # For the labels a reshape to 1D is enough.
    labels_data = labels_dataset.read()
    input_labels = labels_data.reshape(-1)
    print('Labels shape after reshape, 1D: ', input_labels.shape)

    # The forest classes are very imbalanced in the dataset, so undersample
    # the majority classes.
    rus = RandomUnderSampler(random_state=63)
    pixels_resampled, labels_resampled = rus.fit_resample(pixels, input_labels)
    print('Array shape after undersampling of majority classes, 2D: ',
          pixels_resampled.shape)
    return pixels_resampled, labels_resampled
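# A minimal sketch (synthetic array, hypothetical sizes) of the
# transpose/reshape step above: a (bands, rows, cols) raster, as e.g. rasterio
# returns it, becomes a (pixels, bands) table that samplers and classifiers
# can consume.
import numpy as np

image_data = np.zeros((3, 4, 5))             # (bands, rows, cols)
image_data2 = np.transpose(image_data, (1, 2, 0))
print(image_data2.shape)                     # (4, 5, 3): bands moved last
pixels = image_data2.reshape(-1, 3)
print(pixels.shape)                          # (20, 3): one row per pixel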
    def _sample(self):
        random_state = check_random_state(self.random_state)
        if self.sampler is None:
            self.sampler_ = RandomUnderSampler(random_state=random_state)
        else:
            self.sampler_ = clone(self.sampler)
        self.sampler_.fit_resample(self.X, self.y)
        if not hasattr(self.sampler_, "sample_indices_"):
            raise ValueError(
                "'sampler' needs to have an attribute 'sample_indices_'."
            )
        self.indices_ = self.sampler_.sample_indices_
        # Shuffle the indices since the sampler packs them by class.
        random_state.shuffle(self.indices_)
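# A minimal sketch (toy data) of the ``sample_indices_`` attribute relied on
# above: after ``fit_resample`` it holds the positions of the rows the sampler
# kept, packed class by class, hence the shuffle.
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

X_demo = np.arange(10).reshape(-1, 1)
y_demo = np.array([0] * 7 + [1] * 3)

sampler = RandomUnderSampler(random_state=0)
sampler.fit_resample(X_demo, y_demo)

print(sampler.sample_indices_)          # positions of kept rows, grouped by class
print(X_demo[sampler.sample_indices_])  # the retained samples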
    def transform(self, X, y=None):
        """Transform the dataframe."""
        # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
        # TODO simple trainer in the correct order and leave this to advanced users?

        # Extract the predicted column
        y = np.squeeze(X[[self.predicted_column]])

        # Copy the dataframe without the predicted column
        temp_dataframe = X.drop([self.predicted_column], axis=1)

        # Initialize and fit the under-sampler
        under_sampler = RandomUnderSampler(random_state=self.random_seed)
        x_under_sampled, y_under_sampled = under_sampler.fit_resample(temp_dataframe, y)

        # Build the resulting under-sampled dataframe
        result = pd.DataFrame(x_under_sampled)

        # Restore the column names
        result.columns = temp_dataframe.columns

        # Restore the y values
        y_under_sampled = pd.Series(y_under_sampled)
        result[self.predicted_column] = y_under_sampled

        return result
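# A minimal, self-contained sketch of the same DataFrame round trip
# (hypothetical column names). Older imbalanced-learn versions returned plain
# NumPy arrays from fit_resample, which is why the method above rebuilds the
# DataFrame and restores its column names; recent versions preserve pandas
# inputs, and the rebuild is then a harmless no-op.
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

df = pd.DataFrame({
    "age": [25, 32, 47, 51, 62, 23, 44, 36],
    "outcome": [0, 0, 0, 0, 0, 0, 1, 1],
})

y = np.squeeze(df[["outcome"]])
features = df.drop(["outcome"], axis=1)

x_res, y_res = RandomUnderSampler(random_state=0).fit_resample(features, y)

balanced = pd.DataFrame(x_res)
balanced.columns = features.columns
balanced["outcome"] = pd.Series(y_res)
print(balanced)  # two rows per class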
    def __init__(self):
        super(DownSampling, self).__init__(
            RandomUnderSampler(random_state=RANDOM_SEED[BALANCE_DOWN_SAMPLING]),
            BALANCE_DOWN_SAMPLING)