from ..utils import Substitution
from ..utils._docstring import _random_state_docstring
@Substitution(random_state=_random_state_docstring)
def balanced_batch_generator(
X,
y,
sample_weight=None,
sampler=None,
batch_size=32,
keep_sparse=False,
random_state=None,
):
"""Create a balanced batch generator to train keras model.
Returns a generator --- as well as the number of step per epoch --- which
is given to ``fit_generator``. The sampler defines the sampling strategy
used to balance the dataset ahead of creating the batch. The sampler should
have an attribute ``sample_indices_``.
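# A minimal usage sketch (not part of the library source): it assumes the
# public entry point ``imblearn.keras.balanced_batch_generator`` and a
# compiled keras ``model``; the generator and step count feed ``fit_generator``.
from sklearn.datasets import make_classification
from imblearn.keras import balanced_batch_generator
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
training_generator, steps_per_epoch = balanced_batch_generator(
    X, y, sampler=RandomUnderSampler(), batch_size=32, random_state=42
)
# model.fit_generator(training_generator, steps_per_epoch=steps_per_epoch, epochs=10)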
# Mark, per row, every column tied for the row-wise maximum of ``col_maxs``.
is_max = np.isclose(col_maxs, col_maxs.max(axis=1, keepdims=True))
# Shuffle the (row, col) index pairs so ties are broken at random.
max_idxs = rng.permutation(np.argwhere(is_max))
# Keep the first (hence random) tied column for each row.
xs, idx_sels = np.unique(max_idxs[:, 0], return_index=True)
col_sels = max_idxs[idx_sels, 1]
ys = start_idx + col_sels
# Reset the one-hot block and set a single 1 at the selected column.
X_new[:, start_idx:end_idx] = 0
X_new[xs, ys] = 1
return X_new
@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class KMeansSMOTE(BaseSMOTE):
"""Apply a KMeans clustering before to over-sample using SMOTE.
This is an implementation of the algorithm described in [1]_.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
{random_state}
k_neighbors : int or object, default=2
If ``int``, number of nearest neighbours used to construct synthetic
samples.
"""
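# A minimal usage sketch (not part of the library source), with data adapted
# from the library's documented example; assumes
# ``imblearn.over_sampling.KMeansSMOTE`` exposing ``fit_resample``.
import numpy as np
from sklearn.datasets import make_blobs
from imblearn.over_sampling import KMeansSMOTE

X, y = make_blobs(n_samples=[100, 800, 100], centers=[(-10, 0), (0, 0), (10, 0)], random_state=0)
X = np.concatenate([X, [[0, 0]]])  # add one minority sample in the middle blob
y = np.append(y, 0)
y = y == 1  # binary problem: middle blob vs the rest
X_res, y_res = KMeansSMOTE(random_state=42).fit_resample(X, y)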
import numpy as np
from sklearn.utils import check_array
from sklearn.utils import check_consistent_length
from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing
from ..base import BaseUnderSampler
from ...utils import check_target_type
from ...utils import Substitution
from ...utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
random_state=_random_state_docstring,
)
class RandomUnderSampler(BaseUnderSampler):
"""Class to perform random under-sampling.
Under-sample the majority class(es) by randomly picking samples
with or without replacement.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
{random_state}
replacement : bool, default=False
Whether the sample is with or without replacement.
"""
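# A minimal usage sketch (not part of the library source): shows the
# ``fit_resample`` API on a synthetic imbalanced dataset.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)
print(sorted(Counter(y_res).items()))  # majority class down-sampled to the minority count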
from sklearn.base import clone
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from ..pipeline import Pipeline
from ..under_sampling import RandomUnderSampler
from ..under_sampling.base import BaseUnderSampler
from ..utils import Substitution, check_target_type
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class BalancedBaggingClassifier(BaggingClassifier):
"""A Bagging classifier with additional balancing.
This implementation of Bagging is similar to the scikit-learn
implementation. It includes an additional step to balance the training set
at fit time using a ``RandomUnderSampler``.
Read more in the :ref:`User Guide `.
Parameters
----------
base_estimator : object, default=None
The base estimator to fit on random subsets of the dataset.
If None, then the base estimator is a decision tree.
"""
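# A minimal usage sketch (not part of the library source): assumes
# ``imblearn.ensemble.BalancedBaggingClassifier`` with its scikit-learn
# compatible ``fit``/``score`` API.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedBaggingClassifier

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
bbc = BalancedBaggingClassifier(n_estimators=10, random_state=42)
bbc.fit(X_train, y_train)
print(bbc.score(X_test, y_test))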
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import _safe_indexing
from ..base import BaseUnderSampler
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._docstring import _random_state_docstring
VOTING_KIND = ("auto", "hard", "soft")
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class ClusterCentroids(BaseUnderSampler):
"""Undersample by generating centroids based on clustering methods.
Under-samples the majority class by replacing clusters of majority
samples with the cluster centroids of a KMeans algorithm. This
algorithm keeps N majority samples by fitting KMeans with N clusters
to the majority class and using the coordinates of the N cluster
centroids as the new majority samples.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
"""
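# A minimal usage sketch (not part of the library source), mirroring the
# library's documented example for ``ClusterCentroids``.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
cc = ClusterCentroids(random_state=42)
X_res, y_res = cc.fit_resample(X, y)
print(sorted(Counter(y_res).items()))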
from scipy import sparse
from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing
from .base import BaseOverSampler
from ..utils import check_neighbors_object
from ..utils import Substitution
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class ADASYN(BaseOverSampler):
"""Oversample using Adaptive Synthetic (ADASYN) algorithm.
This method is similar to SMOTE but it generates different number of
samples depending on an estimate of the local distribution of the class
to be oversampled.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
{random_state}
"""
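# A minimal usage sketch (not part of the library source): assumes
# ``imblearn.over_sampling.ADASYN`` exposing ``fit_resample``.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X, y)
print(sorted(Counter(y_res).items()))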
from sklearn.utils import check_X_y
from ..base import BaseSampler
from ..over_sampling import SMOTE
from ..over_sampling.base import BaseOverSampler
from ..under_sampling import EditedNearestNeighbours
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class SMOTEENN(BaseSampler):
"""Over-sampling using SMOTE and cleaning using ENN.
Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
{random_state}
smote : object, default=None
The :class:`imblearn.over_sampling.SMOTE` object to use. If not given,
a :class:`imblearn.over_sampling.SMOTE` object with default parameters
will be given.
"""
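# A minimal usage sketch (not part of the library source): assumes
# ``imblearn.combine.SMOTEENN`` exposing ``fit_resample``.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
sme = SMOTEENN(random_state=42)
X_res, y_res = sme.fit_resample(X, y)
print(sorted(Counter(y_res).items()))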
from sklearn.ensemble._base import _set_random_states
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing
from ..base import BaseUnderSampler
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class InstanceHardnessThreshold(BaseUnderSampler):
"""Undersample based on the instance hardness threshold.
Read more in the :ref:`User Guide `.
Parameters
----------
estimator : object, default=None
Classifier to be used to estimate instance hardness of the samples. By
default a :class:`sklearn.ensemble.RandomForestClassifier` will be used.
If ``str``, the available choices are: ``'knn'``,
``'decision-tree'``, ``'random-forest'``, ``'adaboost'``,
``'gradient-boosting'`` and ``'linear-svm'``. If object, an estimator
inheriting from :class:`sklearn.base.ClassifierMixin` that exposes a
:func:`predict_proba` method.
"""
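# A minimal usage sketch (not part of the library source): the choice of
# ``LogisticRegression`` is illustrative; any classifier with
# ``predict_proba`` works.
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
iht = InstanceHardnessThreshold(estimator=LogisticRegression(), random_state=42)
X_res, y_res = iht.fit_resample(X, y)
print(sorted(Counter(y_res).items()))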
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring
@Substitution(random_state=_random_state_docstring)
def balanced_batch_generator(
X,
y,
sample_weight=None,
sampler=None,
batch_size=32,
keep_sparse=False,
random_state=None,
):
"""Create a balanced batch generator to train tensorflow model.
Returns a generator --- as well as the number of step per epoch --- which
is given to ``fit_generator``. The sampler defines the sampling strategy
used to balance the dataset ahead of creating the batch. The sampler should
have an attribute ``sample_indices_``.
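# A minimal usage sketch (not part of the library source): consumes the
# generator in a plain training loop; ``train_step`` is a hypothetical
# user-supplied optimisation step.
from sklearn.datasets import make_classification
from imblearn.tensorflow import balanced_batch_generator
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
training_generator, steps_per_epoch = balanced_batch_generator(
    X, y, sampler=RandomUnderSampler(), batch_size=32, random_state=42
)
for _ in range(steps_per_epoch):  # one epoch of balanced mini-batches
    X_batch, y_batch = next(training_generator)
    # train_step(X_batch, y_batch)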
import numpy as np
from sklearn.ensemble import BaggingClassifier
from ..under_sampling import RandomUnderSampler
from ..under_sampling.base import BaseUnderSampler
from ..utils import Substitution, check_target_type
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
from ..pipeline import Pipeline
MAX_INT = np.iinfo(np.int32).max
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class EasyEnsembleClassifier(BaggingClassifier):
"""Bag of balanced boosted learners also known as EasyEnsemble.
This algorithm is known as EasyEnsemble [1]_. The classifier is an
ensemble of AdaBoost learners trained on different balanced bootstrap
samples. The balancing is achieved by random under-sampling.
Read more in the :ref:`User Guide `.
Parameters
----------
n_estimators : int, default=10
Number of AdaBoost learners in the ensemble.
base_estimator : object, default=AdaBoostClassifier()
The base AdaBoost classifier used in the inner ensemble.
"""
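# A minimal usage sketch (not part of the library source): assumes
# ``imblearn.ensemble.EasyEnsembleClassifier`` with scikit-learn's
# ``fit``/``score`` API.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.ensemble import EasyEnsembleClassifier

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
eec = EasyEnsembleClassifier(n_estimators=10, random_state=42)
eec.fit(X_train, y_train)
print(eec.score(X_test, y_test))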