Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import numpy as np
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import check_random_state, _safe_indexing
from ..base import BaseCleaningSampler
from ._tomek_links import TomekLinks
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class OneSidedSelection(BaseCleaningSampler):
"""Class to perform under-sampling based on one-sided selection method.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
{random_state}
n_neighbors : int or object, default=None
If ``int``, size of the neighbourhood to consider to compute the
nearest neighbors. If object, an estimator that inherits from
from scipy.stats import mode
from sklearn.utils import _safe_indexing
from ..base import BaseCleaningSampler
from ._edited_nearest_neighbours import EditedNearestNeighbours
from ...utils import check_neighbors_object
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
SEL_KIND = ("all", "mode")
@Substitution(
sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
)
class NeighbourhoodCleaningRule(BaseCleaningSampler):
"""Undersample based on the neighbourhood cleaning rule.
This class uses ENN and a k-NN to remove noisy samples from the datasets.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
n_neighbors : int or object, default=3
If ``int``, size of the neighbourhood to consider to compute the
nearest neighbors. If object, an estimator that inherits from
:class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
# Fernando Nogueira
# Christos Aridas
# License: MIT
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import _safe_indexing
from ..base import BaseCleaningSampler
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
@Substitution(
sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
)
class TomekLinks(BaseCleaningSampler):
"""Under-sampling by removing Tomek's links.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
{n_jobs}
Attributes
----------
sample_indices_ : ndarray of shape (n_new_samples)
Indices of the samples selected.
from sklearn.base import clone
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from ..pipeline import Pipeline
from ..under_sampling import RandomUnderSampler
from ..under_sampling.base import BaseUnderSampler
from ..utils import Substitution, check_target_type
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class BalancedBaggingClassifier(BaggingClassifier):
"""A Bagging classifier with additional balancing.
This implementation of Bagging is similar to the scikit-learn
implementation. It includes an additional step to balance the training set
at fit time using a ``RandomUnderSampler``.
Read more in the :ref:`User Guide `.
Parameters
----------
base_estimator : object, default=None
The base estimator to fit on random subsets of the dataset.
If None, then the base estimator is a decision tree.
forest,
X_resampled,
y_resampled,
sample_weight,
tree_idx,
n_trees,
verbose=verbose,
class_weight=class_weight,
n_samples_bootstrap=n_samples_bootstrap,
)
return sampler, tree
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class BalancedRandomForestClassifier(RandomForestClassifier):
"""A balanced random forest classifier.
A balanced random forest randomly under-samples each bootstrap sample to
balance it.
Read more in the :ref:`User Guide `.
Parameters
----------
n_estimators : int, default=100
The number of trees in the forest.
criterion : str, default="gini"
# tie breaking argmax
is_max = np.isclose(col_maxs, col_maxs.max(axis=1, keepdims=True))
max_idxs = rng.permutation(np.argwhere(is_max))
xs, idx_sels = np.unique(max_idxs[:, 0], return_index=True)
col_sels = max_idxs[idx_sels, 1]
ys = start_idx + col_sels
X_new[:, start_idx:end_idx] = 0
X_new[xs, ys] = 1
return X_new
@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class KMeansSMOTE(BaseSMOTE):
"""Apply a KMeans clustering before to over-sample using SMOTE.
This is an implementation of the algorithm described in [1]_.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
{random_state}
k_neighbors : int or object, default=2
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from ..under_sampling import RandomUnderSampler
from ..under_sampling.base import BaseUnderSampler
from ..utils import Substitution, check_target_type
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
from ..pipeline import Pipeline
MAX_INT = np.iinfo(np.int32).max
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class EasyEnsembleClassifier(BaggingClassifier):
"""Bag of balanced boosted learners also known as EasyEnsemble.
This algorithm is known as EasyEnsemble [1]_. The classifier is an
ensemble of AdaBoost learners trained on different balanced bootstrap
samples. The balancing is achieved by random under-sampling.
Read more in the :ref:`User Guide `.
Parameters
----------
n_estimators : int, default=10
Number of AdaBoost learners in the ensemble.
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import _safe_indexing
from ..base import BaseUnderSampler
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._docstring import _random_state_docstring
VOTING_KIND = ("auto", "hard", "soft")
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class ClusterCentroids(BaseUnderSampler):
"""Undersample by generating centroids based on clustering methods.
Method that under-samples the majority class by replacing a
cluster of majority samples by the cluster centroid of a KMeans
algorithm. This algorithm keeps N majority samples by fitting the
KMeans algorithm with N clusters to the majority class and using
the coordinates of the N cluster centroids as the new majority
samples.
Read more in the :ref:`User Guide `.
Parameters
----------
from scipy.sparse import issparse
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import check_random_state, _safe_indexing
from ..base import BaseCleaningSampler
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class CondensedNearestNeighbour(BaseCleaningSampler):
"""Undersample based on the condensed nearest neighbour method.
Read more in the :ref:`User Guide `.
Parameters
----------
{sampling_strategy}
{random_state}
n_neighbors : int or object, default=\
KNeighborsClassifier(n_neighbors=1)
If ``int``, size of the neighbourhood to consider to compute the
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble._base import _set_random_states
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing
from ..base import BaseUnderSampler
from ...utils import Substitution
from ...utils._docstring import _n_jobs_docstring
from ...utils._docstring import _random_state_docstring
@Substitution(
sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class InstanceHardnessThreshold(BaseUnderSampler):
"""Undersample based on the instance hardness threshold.
Read more in the :ref:`User Guide `.
Parameters
----------
estimator : object, default=None
Classifier to be used to estimate instance hardness of the samples. By
default a :class:`sklearn.ensemble.RandomForestClassifier` will be used.
If ``str``, the choices using a string are the following: ``'knn'``,
``'decision-tree'``, ``'random-forest'``, ``'adaboost'``,
``'gradient-boosting'`` and ``'linear-svm'``. If object, an estimator
inherited from :class:`sklearn.base.ClassifierMixin` and having an