Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_fit_auto_pandas(self):
class FE(FeatureExtractor):
_detect_features = Mock(return_value=['a', 'b'])
_fit = Mock()
fe = FE(features='auto')
X = pd.DataFrame({
'a': ['a', 'b', 'c'],
'b': ['d', 'e', 'f'],
'c': [1, 2, 3]
})
fe.fit(X)
assert fe._features == ['a', 'b']
assert fe._detect_features.called_once_with(X)
expected_calls = [
((pd.Series(['a', 'b', 'c']), ), {}),
self.vectorizers = dict()
super(StringVectorizer, self).fit(X)
def _fit(self, x):
    """Fit a fresh ``CountVectorizer`` on a single column and remember it.

    Args:
        x (pandas.Series):
            Column to learn the vocabulary from. Missing values are replaced
            by empty strings and every value is cast to ``str`` before fitting.
    """
    fitted = CountVectorizer(**self.kwargs)
    texts = x.fillna('').astype(str)
    fitted.fit(texts)

    # Keyed by column name so ``_transform`` can look the vectorizer up again.
    self.vectorizers[x.name] = fitted
def _transform(self, x):
    """Vectorize one text column with the vectorizer fitted for it.

    Args:
        x (pandas.Series):
            Column to transform. ``x.name`` must be a key of
            ``self.vectorizers``. Missing values are replaced by empty
            strings and every value is cast to ``str`` before transforming.

    Returns:
        pandas.DataFrame:
            One column per vocabulary term, named ``<x.name>_<term>``,
            sharing the index of ``x``.

    Raises:
        KeyError:
            If no vectorizer is stored under ``x.name``.
    """
    vectorizer = self.vectorizers[x.name]
    bow = vectorizer.transform(x.fillna('').astype(str))

    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back to the old name on older releases.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    bow_columns = ['{}_{}'.format(x.name, f) for f in get_names()]
    return pd.DataFrame(bow.toarray(), columns=bow_columns, index=x.index)
class DatetimeFeaturizer(FeatureExtractor):
    """Extract calendar components from datetime features.

    When autodetecting features, the columns picked are the ones returned by
    ``X.select_dtypes('datetime')``.

    Each datetime feature is expanded into ``year``, ``month``, ``day``,
    ``weekday`` and ``hour`` columns, named with the feature name followed by
    an underscore as prefix.
    """

    def _detect_features(self, X):
        """Return the names of the datetime columns of ``X`` as a list."""
        return list(X.select_dtypes('datetime').columns)

    def _transform(self, x):
        """Expand one datetime column into its calendar components.

        Args:
            x (pandas.Series):
                Datetime column. Its ``name`` is used as the column prefix,
                so it must support concatenation with a ``str``.

        Returns:
            pandas.DataFrame:
                Columns ``<name>_year``, ``<name>_month``, ``<name>_day``,
                ``<name>_weekday`` and ``<name>_hour``.
        """
        prefix = x.name + '_'
        features = {
            prefix + 'year': x.dt.year,
            prefix + 'month': x.dt.month,
            prefix + 'day': x.dt.day,
            # Day of the week (Monday=0 ... Sunday=6), not the day of the month.
            prefix + 'weekday': x.dt.weekday,
            prefix + 'hour': x.dt.hour,
        }
        return pd.DataFrame(features)
def fit(self, X, y=None):
    """Reset the stored encoders and delegate fitting to the parent class.

    Args:
        X:
            Data to fit on; passed unchanged to the parent ``fit``.
        y:
            Accepted for signature compatibility; not used here and not
            forwarded to the parent ``fit``.
    """
    # Start from an empty registry so encoders from a previous fit are dropped.
    self.encoders = {}
    super(CategoricalEncoder, self).fit(X)
def _fit(self, x):
    """Fit a ``OneHotLabelEncoder`` on a single column and remember it.

    Args:
        x (pandas.Series):
            Column to fit the encoder on. Its ``name`` is passed to the
            encoder and used as the key under which the encoder is stored.
    """
    column = x.name
    fitted = OneHotLabelEncoder(column, self.max_labels, self.dropna)
    fitted.fit(x)
    self.encoders[column] = fitted
def _transform(self, x):
    """Encode one column with the encoder stored under its name.

    Args:
        x (pandas.Series):
            Column to encode. ``x.name`` must be a key of ``self.encoders``.

    Returns:
        Whatever the stored encoder's ``transform`` returns for ``x``.
    """
    return self.encoders[x.name].transform(x)
class StringVectorizer(FeatureExtractor):
"""FeatureExtractor that encodes text features using a scikit-learn CountVectorizer.
When autodetecting features, only features with dtype ``object`` are considered.
Optionally, a ``min_words`` can be passed, which allows ignoring features
that have fewer than the given number of words in all their occurrences.
Args:
copy (bool):
Whether to make a copy of the input data or modify it in place.
Defaults to ``True``.
features (list or str):
List of features to apply the feature extractor to. If ``'auto'`` is passed,
try to detect the feature automatically. Defaults to an empty list.
keep (bool):
Whether to keep the original features instead of replacing them.
if self.keep:
x = X[feature]
else:
x = X.pop(feature)
extracted = self._transform(x)
X = pd.concat([X, extracted], axis=1)
return X
def fit_transform(self, X, y=None):
    """Fit on ``X`` and then return the transformed ``X``.

    Args:
        X:
            Data to fit on and then transform.
        y:
            Forwarded to ``fit`` as its second argument. Defaults to ``None``.

    Returns:
        The result of calling ``self.transform(X)`` after fitting.
    """
    self.fit(X, y)
    transformed = self.transform(X)
    return transformed
class CategoricalEncoder(FeatureExtractor):
"""FeatureExtractor that encodes categorical features using OneHotLabelEncoder.
When autodetecting features, only features with dtype ``category`` or ``object``
are considered.
Optionally, a ``max_unique_ratio`` can be passed, which allows ignoring features
that have a high number of unique values, such as primary keys.
Args:
max_labels (int or None):
Maximum number of labels to use by feature. Defaults to ``None``.
max_unique_ratio (int):
Max proportion of unique values that a feature must have in order
to be considered a categorical feature. If ``0`` is given, the ratio is ignored.
Defaults to ``0``.
dropna (bool):