# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def make_es(data):
    """Build a Featuretools EntitySet for the flight-delay data.

    Creates a 'trip_logs' entity from ``data`` (generating a
    'trip_log_id' index, with 'date_scheduled' as the primary time
    index and 'arr_time' as a secondary one), then normalizes out
    'flights' and 'airlines' entities.
    """
    entity_set = ft.EntitySet('Flight Data')

    # Columns only known once the flight has arrived; they hang off the
    # secondary time index 'arr_time' instead of 'date_scheduled'.
    known_at_arrival = [
        'arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
        'national_airspace_delay', 'security_delay',
        'late_aircraft_delay', 'canceled', 'diverted',
        'taxi_in', 'taxi_out', 'air_time', 'dep_time',
    ]
    type_overrides = {
        'flight_num': vtypes.Categorical,
        'distance_group': vtypes.Ordinal,
        'canceled': vtypes.Boolean,
        'diverted': vtypes.Boolean,
    }
    entity_set.entity_from_dataframe(
        'trip_logs',
        data,
        index='trip_log_id',
        make_index=True,
        time_index='date_scheduled',
        secondary_time_index={'arr_time': known_at_arrival},
        variable_types=type_overrides)

    # One row per flight, carrying the flight-level attributes.
    entity_set.normalize_entity(
        'trip_logs', 'flights', 'flight_id',
        additional_variables=['origin', 'origin_city', 'origin_state',
                              'dest', 'dest_city', 'dest_state',
                              'distance_group', 'carrier', 'flight_num'])
    # One row per carrier; flights already carry a time index.
    entity_set.normalize_entity('flights', 'airlines', 'carrier',
                                make_time_index=False)
def __mul__(self, other):
    """Return the feature produced by multiplying ``self`` by ``other``.

    Two Boolean features multiply via the MultiplyBoolean primitive;
    every other combination is dispatched through the generic binary
    handler, which chooses MultiplyNumeric or MultiplyNumericScalar.
    """
    # `other.variable_type` is only touched after the isinstance guard,
    # so the short-circuit chain is safe for scalar operands.
    both_boolean = (isinstance(other, FeatureBase)
                    and self.variable_type == Boolean
                    and other.variable_type == Boolean)
    if both_boolean:
        return Feature([self, other], primitive=primitives.MultiplyBoolean)
    return self._handle_binary_comparision(other,
                                           primitives.MultiplyNumeric,
                                           primitives.MultiplyNumericScalar)
else:
# NOTE(review): fragment of a dtype-inference routine; `df`, `variable`
# and the preceding `if`/`elif` arms are outside this view.
# Object dtype: assume categorical unless the strings look like free text.
inferred_type = vtypes.Categorical
# heuristic: sample at most 10k rows and treat long strings
# (average length > 50 chars) as Text rather than Categorical
sample = df[variable].sample(min(10000, len(df[variable])))
# catch cases where object dtype cannot be interpreted as a string
try:
avg_length = sample.str.len().mean()
if avg_length > 50:
inferred_type = vtypes.Text
except AttributeError:
pass
elif df[variable].dtype == "bool":
inferred_type = vtypes.Boolean
elif pdtypes.is_categorical_dtype(df[variable].dtype):
inferred_type = vtypes.Categorical
elif pdtypes.is_numeric_dtype(df[variable].dtype):
inferred_type = vtypes.Numeric
elif col_is_datetime(df[variable]):
inferred_type = vtypes.Datetime
elif len(df[variable]):
# non-empty column of some other dtype: sample distinct values to
# estimate uniqueness (the use of percent_unique continues past this view)
sample = df[variable] \
.sample(min(10000, df[variable].nunique(dropna=False)))
unique = sample.unique()
# NOTE(review): despite the name, this is sample size divided by the
# unique count (values-per-unique), not a percentage — confirm
# downstream thresholds expect that ratio.
percent_unique = sample.size / len(unique)
"""
# NOTE(review): the triple-quote above closes a docstring whose opening
# is outside this view; below is a fragment of a DFS feature builder
# plus the start of a post-filter helper.
all_features = {}
for e in self.es.entities:
# NOTE(review): membership test compares the Entity object itself
# against ignore_entities — confirm that set holds entities, not ids.
if e not in self.ignore_entities:
all_features[e.id] = {}
self.where_clauses = defaultdict(set)
# Depth-first traversal from the target entity populates all_features.
self._run_dfs(self.es[self.target_entity_id], [],
all_features, max_depth=self.max_depth)
new_features = list(all_features[self.target_entity_id].values())
# Default to the three broad variable-type families when none given;
# the string 'all' disables type filtering entirely.
if variable_types is None:
variable_types = [Numeric,
Discrete,
Boolean]
elif variable_types == 'all':
variable_types = None
else:
msg = "variable_types must be a list, or 'all'"
assert isinstance(variable_types, list), msg
# Keep only features whose variable type matches one of the requested ones.
if variable_types is not None:
new_features = [f for f in new_features
if any(issubclass(f.variable_type, vt) for vt in variable_types)]
def filt(f):
# remove identity features of the ID field of the target entity
if (isinstance(f, IdentityFeature) and
f.entity.id == self.target_entity_id and
f.variable.id == self.es[self.target_entity_id].index):
return False
class And(TransformPrimitive):
    """Element-wise logical AND of two lists.

    Description:
        For each position, the result is `True` exactly when the
        value in X and the corresponding value in Y are both `True`.

    Examples:
        >>> _and = And()
        >>> _and([False, True, False], [True, True, False]).tolist()
        [False, True, False]
    """
    name = "and"
    input_types = [Boolean, Boolean]
    return_type = Boolean
    # AND(x, y) == AND(y, x), so symmetric pairs can be deduplicated.
    commutative = True

    def get_function(self):
        # numpy already implements the element-wise AND we need
        return np.logical_and

    def generate_name(self, base_feature_names):
        first = base_feature_names[0]
        second = base_feature_names[1]
        return "AND(%s, %s)" % (first, second)
class Or(TransformPrimitive):
"""Element-wise logical OR of two lists.
Description:
Given a list of booleans X and a list of booleans Y,
determine whether each value in X is `True`, or
if new_type == vtypes.Numeric:
orig_nonnull = df[column_id].dropna().shape[0]
df[column_id] = pd.to_numeric(df[column_id], errors='coerce')
# This will convert strings to nans
# If column contained all strings, then we should
# just raise an error, because that shouldn't have
# been converted to numeric
nonnull = df[column_id].dropna().shape[0]
if nonnull == 0 and orig_nonnull != 0:
raise TypeError("Attempted to convert all string column {} to numeric".format(column_id))
elif issubclass(new_type, vtypes.Datetime):
format = kwargs.get("format", None)
# TODO: if float convert to int?
df[column_id] = pd.to_datetime(df[column_id], format=format,
infer_datetime_format=True)
elif new_type == vtypes.Boolean:
map_dict = {kwargs.get("true_val", True): True,
kwargs.get("false_val", False): False,
True: True,
False: False}
# TODO: what happens to nans?
df[column_id] = df[column_id].map(map_dict).astype(np.bool)
elif not issubclass(new_type, vtypes.Discrete):
raise Exception("Cannot convert column %s to %s" %
(column_id, new_type))
return df
def fit_transform(self, ftens, fl=None, categorical_feature_names=None, labels=None):
"""Fit this encoder on the feature tensor ``ftens`` and transform it.

NOTE(review): method is truncated in this view — the body continues
past the trailing ``if not self.regression:`` line.
"""
# Decide which columns are categorical: an explicit list wins; else
# derive from the feature list `fl` (Discrete but not Boolean); else
# fall back to any object-dtype column of the tensor.
if categorical_feature_names is not None:
self.categorical_feature_names = categorical_feature_names
elif fl is not None:
self.categorical_feature_names = [f.get_name() for f in fl
if issubclass(f.variable_type,
Discrete)
and not
f.variable_type == Boolean]
else:
self.categorical_feature_names = [c for c in ftens.columns
if ftens[c].dtype == object]
# Can't handle multiindex: keep only the first index level.
if len(ftens.index.names) > 1:
index_name = ftens.index.names[0]
ftens = ftens.reset_index(index_name, drop=False).set_index(index_name)
self.categorical_vocab = self._gen_categorical_mapping(ftens)
# Every non-categorical column is treated as numeric for scaling/imputing.
self.numeric_columns = [f for f in ftens.columns
if f not in self.categorical_feature_names]
ftens = self.fit_transform_scaler_imputer(ftens)
if not self.regression:
class NumTrue(AggregationPrimitive):
    """Counts the number of `True` values.

    Description:
        Return how many entries in a list of booleans are `True`,
        skipping missing values.

    Examples:
        >>> num_true = NumTrue()
        >>> num_true([True, False, True, True, None])
        3
    """
    name = "num_true"
    input_types = [Boolean]
    return_type = Numeric
    default_value = 0
    # No stacking restrictions in either direction.
    stack_on = []
    stack_on_exclude = []

    def get_function(self):
        # summing booleans counts the True entries
        return np.sum
class PercentTrue(AggregationPrimitive):
"""Determines the percent of `True` values.
Description:
Given a list of booleans, return the percent
of values which are `True` as a decimal.
`NaN` values are treated as `False`,
"""Element-wise multiplication of two lists.
Description:
Given a list of values X and a list of values
Y, determine the product of each value in X
with its corresponding value in Y.
Examples:
>>> multiply_numeric = MultiplyNumeric()
>>> multiply_numeric([2, 1, 2], [1, 2, 2]).tolist()
[2, 2, 4]
"""
# NOTE(review): the class header (presumably
# `class MultiplyNumeric(TransformPrimitive):`) is outside this view.
name = "multiply_numeric"
# Accepts Numeric*Numeric and either ordering of Numeric with Boolean.
input_types = [
[Numeric, Numeric],
[Numeric, Boolean],
[Boolean, Numeric],
]
return_type = Numeric
# a * b == b * a, so symmetric operand pairs can be deduplicated upstream.
commutative = True
def get_function(self):
# numpy's element-wise multiply implements the primitive
return np.multiply
def generate_name(self, base_feature_names):
# render as an infix expression, e.g. "f1 * f2"
return "%s * %s" % (base_feature_names[0], base_feature_names[1])
class MultiplyNumericScalar(TransformPrimitive):
"""Multiply each element in the list by a scalar.
Description:
def make_entity_set(orders_table, order_products_table):
"""Assemble the Instacart EntitySet from the raw orders tables.

Builds 'order_products' and 'orders' entities, links them on
order_id, normalizes a 'users' entity out of orders, and adds
last-time indexes for cutoff-time handling.

NOTE(review): no `return es` appears within this view — confirm the
function continues past here (or that callers don't use its result).
"""
es = ft.EntitySet("instacart")
# Line items: one row per product within an order.
es.entity_from_dataframe(
entity_id="order_products",
dataframe=order_products_table,
index="order_product_id",
variable_types={
"aisle_id": ft.variable_types.Categorical,
"reordered": ft.variable_types.Boolean
},
time_index="order_time")
# Orders: one row per order, timestamped at order_time.
es.entity_from_dataframe(
entity_id="orders",
dataframe=orders_table,
index="order_id",
time_index="order_time")
# Parent-child link: each order_products row belongs to one order.
es.add_relationship(
ft.Relationship(es["orders"]["order_id"],
es["order_products"]["order_id"]))
# Derive a users entity from the user_id column of orders.
es.normalize_entity(
base_entity_id="orders", new_entity_id="users", index="user_id")
es.add_last_time_indexes()