import featuretools as ft
import numpy as np
from featuretools.selection import remove_low_information_features
from sklearn.model_selection import StratifiedKFold


def score_model_baseline(fm, labels, fl, hyperparams):
    # Keep one row per customer (the latest), then encode and prune features.
    baseline_fm = (fm.reset_index('customer_id', drop=False)
                     .drop_duplicates('customer_id', keep='last')
                     .set_index('customer_id'))
    baseline_fm, baseline_fl = ft.encode_features(baseline_fm, fl)
    baseline_fm, baseline_fl = remove_low_information_features(baseline_fm, baseline_fl)

    hyperparams = parse_hyperparams_baseline(hyperparams)
    print("HYPERPARAMS:", hyperparams)

    cv_score = []
    n_splits = 5
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True)
    for train_index, test_index in splitter.split(labels, labels):
        baseline_train_labels = labels.iloc[train_index]
        baseline_test_labels = labels.iloc[test_index]
        baseline_train_fm = baseline_fm.loc[baseline_train_labels.index, :]
        baseline_test_fm = baseline_fm.loc[baseline_test_labels.index, :]

        score = score_baseline_pipeline(baseline_train_fm, baseline_train_labels,
                                        baseline_test_fm, baseline_test_labels,
                                        **hyperparams)
        cv_score.append(score)

    return np.mean(cv_score)
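For context, a hedged sketch of how this helper might be invoked; the feature matrix and feature definitions are assumed to come from an earlier ft.dfs run, and the hyperparameter payload below is purely illustrative, since its real shape is defined by parse_hyperparams_baseline.

# Hypothetical inputs: `feature_matrix`/`feature_defs` from a prior ft.dfs call on a
# customer entityset, `labels` a pandas Series of outcomes indexed by customer_id.
hyperparams = {"n_estimators": 100, "max_depth": 5}  # illustrative only
mean_score = score_model_baseline(feature_matrix, labels, feature_defs, hyperparams)
print("5-fold baseline score:", mean_score)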
def test_all_primitives(entityset, parameters):
    # `primitives` is a module-level dict mapping primitive name -> primitive class;
    # `parameters` optionally supplies constructor kwargs per primitive name.
    is_agg_primitive = lambda name: issubclass(primitives[name], ft.primitives.AggregationPrimitive)
    construct_primitive = lambda name: primitives[name](**parameters.get(name, {}))
    agg_primitives = [construct_primitive(name) for name in primitives if is_agg_primitive(name)]

    feature_matrix, features = ft.dfs(entityset=entityset, target_entity='sessions', agg_primitives=agg_primitives)
    assert not feature_matrix.empty
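A hedged sketch of how this test could be exercised end to end; the toy data, entity names, and the primitives/parameters mappings are assumptions, and the pre-1.0 featuretools API (entity_from_dataframe, target_entity) is used to match the snippet above.

import pandas as pd
import featuretools as ft

sessions_df = pd.DataFrame({"session_id": [1, 2]})
transactions_df = pd.DataFrame({
    "transaction_id": [1, 2, 3, 4],
    "session_id": [1, 1, 2, 2],
    "amount": [10.0, 20.0, 5.0, 7.5],
})

es = ft.EntitySet(id="toy")
es = es.entity_from_dataframe(entity_id="sessions", dataframe=sessions_df, index="session_id")
es = es.entity_from_dataframe(entity_id="transactions", dataframe=transactions_df, index="transaction_id")
es = es.add_relationship(ft.Relationship(es["sessions"]["session_id"],
                                         es["transactions"]["session_id"]))

# Name -> class mapping consumed by test_all_primitives, plus per-primitive kwargs.
primitives = {"mean": ft.primitives.Mean, "n_most_common": ft.primitives.NMostCommon}
parameters = {"n_most_common": {"n": 2}}

test_all_primitives(es, parameters)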
def fit(self, X, **kwargs):
    # Only compute the feature definitions here (features_only=True); the
    # feature matrix itself is calculated later.
    self.features = ft.dfs(
        cutoff_time=X,
        features_only=True,
        max_depth=self.max_depth,
        **kwargs
    )
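The fit() above only builds feature definitions. A hedged sketch of a matching transform step, assuming the same keyword arguments (entityset, etc.) are passed through as in fit:

def transform(self, X, **kwargs):
    # Materialize the saved feature definitions at the new cutoff times.
    feature_matrix = ft.calculate_feature_matrix(
        features=self.features,
        cutoff_time=X,
        **kwargs
    )
    return feature_matrix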
import nltk
import numpy as np
from featuretools.primitives import TransformPrimitive
from featuretools.variable_types import Numeric, Text
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline


class LSA(TransformPrimitive):
    """
    [[0.0, 0.0, 0.01], [0.0, 0.0, 0.0]]

    Now, if the input corpus is changed to something that better resembles the
    given text, the same input text will produce different, more discerning
    output. NaN values are handled, as are strings that contain no words.

    >>> lsa = LSA()
    >>> x = ["the earth is round", "", np.NaN, ".,/"]
    >>> res = lsa(x).tolist()
    >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
    >>> res
    [[0.01, 0.0, nan, 0.0], [0.0, 0.0, nan, 0.0]]
    """
    name = "lsa"
    input_types = [Text]
    return_type = Numeric
    default_value = 0

    def __init__(self):
        # TODO: allow user to use own corpus
        self.number_output_features = 2
        self.n = 2

        try:
            brown = nltk.corpus.brown.sents()
        except LookupError:
            nltk.download('brown')
            brown = nltk.corpus.brown.sents()
        finally:
            self.trainer = make_pipeline(TfidfVectorizer(), TruncatedSVD())
            self.trainer.fit([" ".join(sent) for sent in brown])
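A hedged usage sketch: the primitive can be handed to DFS so that LSA features are generated for every Text column; the entityset `es` and the target entity name are hypothetical, and the pre-1.0 featuretools API is assumed.

import featuretools as ft

feature_matrix, features = ft.dfs(
    entityset=es,               # hypothetical entityset containing Text variables
    target_entity="reviews",    # hypothetical target entity
    trans_primitives=[LSA],
)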
from featuretools.primitives import AggregationPrimitive
from featuretools.variable_types import Numeric
from tsfresh.feature_extraction.feature_calculators import linear_trend


class LinearTrend(AggregationPrimitive):
    """Calculate a linear least-squares regression for the values of the time
    series versus the sequence from 0 to length of the time series minus one.
    This feature assumes the signal to be uniformly sampled. It will not use
    the time stamps to fit the model.

    Args:
        attr (str) : Controls which of the characteristics are returned.
            Possible extracted attributes are:
            ['pvalue', 'rvalue', 'intercept', 'slope', 'stderr'].

    Docstring source:
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.linear_trend
    """
    name = "linear_trend"
    input_types = [Numeric]
    return_type = Numeric
    stack_on_self = False

    def __init__(self, attr):
        self.attr = attr

    def get_function(self):
        def function(x):
            param = [{'attr': self.attr}]
            return list(linear_trend(x, param))[0][1]
        return function
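A quick hedged sanity check of the wrapped calculator (the series values are made up): a perfectly linear series has slope 1.0.

import pandas as pd

slope = LinearTrend(attr="slope").get_function()(pd.Series([1.0, 2.0, 3.0, 4.0]))
print(slope)  # 1.0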
from featuretools.primitives import AggregationPrimitive
from featuretools.variable_types import Numeric
from tsfresh.feature_extraction.feature_calculators import maximum


class Maximum(AggregationPrimitive):
    """Calculates the highest value of the time series x.

    Docstring source:
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.maximum
    """
    name = "maximum"
    input_types = [Numeric]
    return_type = Numeric
    stack_on_self = False

    def get_function(self):
        return maximum
from featuretools.primitives import AggregationPrimitive
from featuretools.variable_types import Numeric
from tsfresh.feature_extraction.feature_calculators import quantile


class Quantile(AggregationPrimitive):
    """Calculates the q quantile of x. This is the value of x greater than q%
    of the ordered values from x.

    Args:
        q (float) : The quantile to calculate.

    Docstring source:
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.quantile
    """
    name = "quantile"
    input_types = [Numeric]
    return_type = Numeric
    stack_on_self = False

    def __init__(self, q):
        self.q = q

    def get_function(self):
        def function(x):
            return quantile(x, q=self.q)
        return function
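A hedged worked example with toy values: q=0.5 returns the median of the series under the default interpolation.

import pandas as pd

print(Quantile(q=0.5).get_function()(pd.Series([1.0, 2.0, 3.0, 4.0])))  # 2.5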
from featuretools.primitives import AggregationPrimitive
from featuretools.variable_types import Numeric
from tsfresh.feature_extraction.feature_calculators import count_below_mean


class CountBelowMean(AggregationPrimitive):
    """Returns the number of values in x that are lower than the mean of x.

    Docstring source:
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.count_below_mean
    """
    name = "count_below_mean"
    input_types = [Numeric]
    return_type = Numeric
    stack_on_self = False

    def get_function(self):
        return count_below_mean
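A hedged worked example with toy values: the mean of [1, 1, 1, 10] is 3.25, and three of the four values fall below it.

import pandas as pd

print(CountBelowMean().get_function()(pd.Series([1.0, 1.0, 1.0, 10.0])))  # 3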
# Object-dtype columns: a long average string length suggests free text,
# otherwise the dtype-based checks below decide the type.
if df[variable].dtype == "object":
    inferred_type = vtypes.Categorical
    # sample drawn from the column (size capped), analogous to the fallback branch below
    sample = df[variable].sample(min(10000, len(df[variable])))

    # catch cases where object dtype cannot be interpreted as a string
    try:
        avg_length = sample.str.len().mean()
        if avg_length > 50:
            inferred_type = vtypes.Text
    except AttributeError:
        pass

elif df[variable].dtype == "bool":
    inferred_type = vtypes.Boolean

elif pdtypes.is_categorical_dtype(df[variable].dtype):
    inferred_type = vtypes.Categorical

elif pdtypes.is_numeric_dtype(df[variable].dtype):
    inferred_type = vtypes.Numeric

elif col_is_datetime(df[variable]):
    inferred_type = vtypes.Datetime

elif len(df[variable]):
    sample = df[variable] \
        .sample(min(10000, df[variable].nunique(dropna=False)))

    unique = sample.unique()
    # fraction of sampled values that are unique
    percent_unique = len(unique) / sample.size

    if percent_unique < .05:
        inferred_type = vtypes.Categorical
    else:
        inferred_type = vtypes.Numeric
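As a hedged illustration of these heuristics, a toy DataFrame (all column names and values are made up) and the type each column would be inferred as:

import pandas as pd

df = pd.DataFrame({
    "review": ["a free-text comment that easily runs past the fifty-character threshold"] * 3,  # object + long strings -> Text
    "active": [True, False, True],                                         # bool -> Boolean
    "tier": pd.Categorical(["a", "b", "a"]),                               # categorical dtype -> Categorical
    "amount": [10.5, 11.0, 9.75],                                          # numeric dtype -> Numeric
    "signup": pd.to_datetime(["2020-01-01", "2020-02-01", "2020-03-01"]),  # datetime -> Datetime
})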