How to use the imblearn.under_sampling.RandomUnderSampler function in imblearn

To help you get started, we’ve selected a few imblearn examples, based on popular ways it is used in public projects.

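As a quick orientation before the project snippets, here is a minimal, self-contained sketch of the basic API (the dataset is synthetic and purely illustrative):

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

# Build a deliberately imbalanced toy dataset (roughly 9:1).
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
print(Counter(y))

# Randomly drop majority-class samples until the classes are balanced.
rus = RandomUnderSampler(random_state=0)
X_res, y_res = rus.fit_resample(X, y)
print(Counter(y_res))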

github scikit-learn-contrib / imbalanced-learn / examples / applications / plot_topic_classication.py
###############################################################################
# Balancing the class before classification
###############################################################################

###############################################################################
# To improve the prediction of class \#3, it can help to balance the classes
# before training the naive Bayes classifier. Therefore, we will use a
# ``RandomUnderSampler`` to equalize the number of samples in all the classes
# before training.
#
# It is also important to note that we are using the ``make_pipeline`` function
# from imbalanced-learn (imported here as ``make_pipeline_imb``) so that the
# sampler is applied during ``fit`` only and bypassed at prediction time.

from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

pipe = make_pipeline_imb(TfidfVectorizer(),
                         RandomUnderSampler(),
                         MultinomialNB())

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

###############################################################################
# Although the results are almost identical, the resampling corrects the poor
# recall of class \#3 at the cost of slightly reducing the metrics for the
# other classes. Overall, the results are slightly better.

print(classification_report_imbalanced(y_test, y_pred))
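
For contrast, building the same pipeline with scikit-learn's own make_pipeline would fail at fit time, because samplers expose fit_resample rather than transform; this is exactly why the imbalanced-learn pipeline is needed. A small sketch, reusing the names from the snippet above:

from sklearn.pipeline import make_pipeline

try:
    # scikit-learn's Pipeline rejects intermediate steps without a transform method.
    make_pipeline(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB()).fit(X_train, y_train)
except TypeError as exc:
    print(exc)  # "All intermediate steps should be transformers ..."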
github vc1492a / henosis / Henosis / model.py
        # Initial stratified train/test split of the full data.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=(1. - share_train),
            stratify=stratify
        )

        self.dependent = y.name
        if X_label:
            self.independent = X_label
        else:
            self.independent = list(X.columns.values)
        self.balance = balance

        # Balance the training data if requested (see the modern-API sketch below).
        if balance == 'upsample':
            ros = RandomOverSampler()
            X_resample, y_resample = ros.fit_sample(X_train, y_train)
        elif balance == 'downsample':
            rus = RandomUnderSampler()
            X_resample, y_resample = rus.fit_sample(X_train, y_train)
        else:
            # No balancing: fall back to the full, unsplit data.
            X_resample = X
            y_resample = y

        self.X_train, X_test, self.y_train, y_test = train_test_split(
            X_resample,
            y_resample,
            test_size=(1. - share_train),
            stratify=stratify
        )
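
fit_sample above is the pre-0.4 spelling of the resampling method; it was renamed fit_resample and later removed. A minimal sketch of the same branching against the current API, with variable names as in the snippet:

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

if balance == 'upsample':
    X_resample, y_resample = RandomOverSampler().fit_resample(X_train, y_train)
elif balance == 'downsample':
    X_resample, y_resample = RandomUnderSampler().fit_resample(X_train, y_train)
else:
    # As in the original, fall back to the full, unsplit data.
    X_resample, y_resample = X, y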
github iarroyof / sentence_embedding / classify_lsa_winds.py
from sklearn.decomposition import KernelPCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.kernel_approximation import RBFSampler
from sklearn.preprocessing import Normalizer
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import RandomUnderSampler

pipe = make_pipeline_imb(
# pipe = make_pipeline(
                        TfidfVectorizer(
                          min_df=f_min,
                          encoding="latin-1",
                          decode_error="replace",
                          lowercase=True,
                          binary=True,           # if args.tf.startswith("bin") else False,
                          sublinear_tf=False,    # if args.tf.startswith("subl") else False,
                          stop_words="english",  # if args.stop else None
                        ),
                        # RandomOverSampler(),
                        RandomUnderSampler(),
                        # SMOTEENN(random_state=0),
                        # SMOTETomek(random_state=42),
                        Normalizer(),
                        TruncatedSVD(200),
                        # KernelPCA(n_components=75, kernel="poly", gamma=10, degree=3, n_jobs=-1),
                        # KernelPCA(kernel="rbf", gamma=10, degree=3, n_jobs=-1),
                        # RBFSampler(gamma=0.1, random_state=1),
                        # SGDClassifier(alpha=.0001,
                        #               n_iter=100,
                        #               n_jobs=-1,
                        #               verbose=100,
                        #               epsilon=1,
                        #               class_weight='balanced',
                        #               # warm_start=True,
                        #               penalty='l1')
                        # GaussianProcessClassifier(n_jobs=-1)
github mozilla / bugbug / bugbug / models / qaneeded.py
def __init__(self, lemmatization=False):
        BugModel.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords({"qawanted"}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
        ]
github MStarmans91 / WORC / WORC / classification / ObjectSampler.py
def init_RandomUnderSampling(self, sampling_strategy):
        """Creata a random under sampler object."""
        self.object = under_sampling.RandomUnderSampler(sampling_strategy=sampling_strategy,
                                                        random_state=self.random_state)
        self.sampling_strategy = sampling_strategy
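
The sampling_strategy argument passed through here accepts several forms; a short sketch of the common ones (the class labels and counts are illustrative):

from imblearn.under_sampling import RandomUnderSampler

# A float: desired minority/majority ratio after resampling (binary problems only).
RandomUnderSampler(sampling_strategy=0.5)

# A string: which classes to resample, e.g. 'majority', 'not minority', 'all'.
RandomUnderSampler(sampling_strategy='majority')

# A dict: target number of samples per class after resampling.
RandomUnderSampler(sampling_strategy={0: 100, 1: 100})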
github scikit-learn-contrib / imbalanced-learn / imblearn / ensemble / _easy_ensemble.py
        if self.n_estimators <= 0:
            raise ValueError(
                "n_estimators must be greater than zero, "
                "got {}.".format(self.n_estimators)
            )

        if self.base_estimator is not None:
            base_estimator = clone(self.base_estimator)
        else:
            base_estimator = clone(default)

        self.base_estimator_ = Pipeline(
            [
                (
                    "sampler",
                    RandomUnderSampler(
                        sampling_strategy=self.sampling_strategy,
                        replacement=self.replacement,
                    ),
                ),
                ("classifier", base_estimator),
            ]
        )
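
The replacement flag forwarded here controls whether majority samples are drawn with or without replacement; with replacement=True the same row can be picked more than once, which matters for bagging-style ensembles such as this one. A toy sketch:

import numpy as np
from imblearn.under_sampling import RandomUnderSampler

X = np.arange(20).reshape(-1, 1)
y = np.array([0] * 15 + [1] * 5)

rus = RandomUnderSampler(replacement=True, random_state=0)
rus.fit_resample(X, y)
idx = rus.sample_indices_
# With replacement, duplicate indices are possible; without, never.
print(len(idx), len(set(idx)))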
github csc-training / geocomputing / machineLearning / 03_deep / 07_deepClassification.py
    # First move the bands to the last axis.
    image_data2 = np.transpose(image_data, (1, 2, 0))
    # Check the data shape again; the bands should now be last.
    print('Dataframe shape after transpose, 3D:', image_data2.shape)

    # Then reshape to 2D: one row per pixel, one column per band.
    pixels = image_data2.reshape(-1, 3)
    print('Dataframe shape after transpose and reshape, 2D:', pixels.shape)

    # For the labels, a reshape to 1D is enough.
    labels_data = labels_dataset.read()
    input_labels = labels_data.reshape(-1)
    print('Labels shape after reshape, 1D:', input_labels.shape)

    # The forest classes are very imbalanced in the dataset, so undersample the majority classes.
    rus = RandomUnderSampler(random_state=63)
    pixels_resampled, labels_resampled = rus.fit_resample(pixels, input_labels)
    print('Dataframe shape after undersampling of majority classes, 2D:', pixels_resampled.shape)

    return pixels_resampled, labels_resampled
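
A cheap sanity check for a step like this is to compare the class counts before and after resampling; a sketch, assuming the variables from the snippet above:

from collections import Counter

print('Class counts before:', Counter(input_labels))
print('Class counts after: ', Counter(labels_resampled))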
github scikit-learn-contrib / imbalanced-learn / imblearn / keras / _generator.py
def _sample(self):
        random_state = check_random_state(self.random_state)
        if self.sampler is None:
            self.sampler_ = RandomUnderSampler(random_state=random_state)
        else:
            self.sampler_ = clone(self.sampler)
        self.sampler_.fit_resample(self.X, self.y)
        if not hasattr(self.sampler_, "sample_indices_"):
            raise ValueError(
                "'sampler' needs to have an attribute 'sample_indices_'."
            )
        self.indices_ = self.sampler_.sample_indices_
        # Shuffle the indices, since the sampler packs them by class.
        random_state.shuffle(self.indices_)
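
The shuffle at the end matters because RandomUnderSampler returns its kept indices grouped class by class. A toy sketch showing what sample_indices_ looks like:

import numpy as np
from imblearn.under_sampling import RandomUnderSampler

X = np.arange(12).reshape(-1, 1)
y = np.array([0] * 8 + [1] * 4)

sampler = RandomUnderSampler(random_state=0)
sampler.fit_resample(X, y)
# Indices of the retained rows, packed per class (class 0 first, then class 1).
print(sampler.sample_indices_)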
github HealthCatalyst / healthcareai-py / healthcareai / common / transformers.py
def transform(self, X, y=None):
        """Transform the dataframe."""
        # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
        # TODO      simple trainer in the correct order and leave this to advanced users?

        # Extract predicted column
        y = np.squeeze(X[[self.predicted_column]])

        # Copy the dataframe without the predicted column
        temp_dataframe = X.drop([self.predicted_column], axis=1)

        # Initialize and fit the under sampler
        # (fit_sample here is the legacy pre-0.4 name of fit_resample).
        under_sampler = RandomUnderSampler(random_state=self.random_seed)
        x_under_sampled, y_under_sampled = under_sampler.fit_sample(temp_dataframe, y)

        # Build the resulting under sampled dataframe
        result = pd.DataFrame(x_under_sampled)

        # Restore the column names
        result.columns = temp_dataframe.columns

        # Restore the y values
        y_under_sampled = pd.Series(y_under_sampled)
        result[self.predicted_column] = y_under_sampled

        return result
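
On current imbalanced-learn, fit_sample is spelled fit_resample, and recent releases (roughly 0.8 onwards) return pandas objects when given pandas input, which makes the manual DataFrame reconstruction above unnecessary. A hedged sketch of the same transform under those assumptions:

import numpy as np
from imblearn.under_sampling import RandomUnderSampler

def transform(self, X, y=None):
    """Transform the dataframe (sketch of the method above on the current API)."""
    # Extract the predicted column and drop it from the features.
    y = np.squeeze(X[[self.predicted_column]])
    temp_dataframe = X.drop([self.predicted_column], axis=1)

    # fit_resample preserves the DataFrame type of its input on recent releases.
    under_sampler = RandomUnderSampler(random_state=self.random_seed)
    x_under_sampled, y_under_sampled = under_sampler.fit_resample(temp_dataframe, y)

    # Reattach the predicted column and return the under-sampled frame.
    result = x_under_sampled.copy()
    result[self.predicted_column] = y_under_sampled
    return result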
github salan668 / FAE / FAE / FeatureAnalysis / DataBalance.py
def __init__(self):
        super(DownSampling, self).__init__(RandomUnderSampler(random_state=RANDOM_SEED[BALANCE_DOWN_SAMPLING]),
                                           BALANCE_DOWN_SAMPLING)