# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Returns:
pd.DataFrame : Pandas DataFrame of calculated feature values.
Indexed by instance_ids. Columns in same order as features
passed in.
"""
assert len(instance_ids) > 0, "0 instance ids provided"
if progress_callback is None:
# do nothing for the progress call back if not provided
def progress_callback(*args):
pass
feature_trie = self.feature_set.feature_trie
df_trie = Trie(path_constructor=RelationshipPath)
full_entity_df_trie = Trie(path_constructor=RelationshipPath)
target_entity = self.entityset[self.feature_set.target_eid]
self._calculate_features_for_entity(entity_id=self.feature_set.target_eid,
feature_trie=feature_trie,
df_trie=df_trie,
full_entity_df_trie=full_entity_df_trie,
precalculated_trie=self.precalculated_features,
filter_variable=target_entity.index,
filter_values=instance_ids,
progress_callback=progress_callback)
# The dataframe for the target entity should be stored at the root of
# df_trie.
df = df_trie.value
if df.empty:
def gather_approximate_features(feature_set):
    """
    Find features which can be approximated. Returned as a trie where the values
    are sets of feature names.

    Args:
        feature_set (FeatureSet): Features to search the dependencies of for
            features to approximate.

    Returns:
        Trie[RelationshipPath, set[str]]
    """
    approximate_feature_trie = Trie(default=set, path_constructor=RelationshipPath)

    for feature in feature_set.target_features:
        # Features which use (or whose dependents use) the full entity cannot
        # be approximated, so they are excluded from the result.
        if feature_set.uses_full_entity(feature, check_dependents=True):
            continue

        if isinstance(feature, DirectFeature):
            path = feature.relationship_path
            base_feature = feature.base_features[0]

            # Walk down chains of stacked DirectFeatures, accumulating the
            # full relationship path to the underlying base feature.
            while isinstance(base_feature, DirectFeature):
                path = path + base_feature.relationship_path
                base_feature = base_feature.base_features[0]

            # Only direct features of aggregations are candidates for
            # approximation; record the aggregation's name at its path.
            if isinstance(base_feature, AggregationFeature):
                node_feature_set = approximate_feature_trie.get_node(path).value
                node_feature_set.add(base_feature.unique_name())

    # BUG FIX: the trie was built but never returned, despite the documented
    # return value — callers would have received None.
    return approximate_feature_trie
precalculated_features (Trie[RelationshipPath -> pd.DataFrame]):
Maps RelationshipPaths to dataframes of precalculated_features
"""
self.entityset = entityset
self.feature_set = feature_set
self.training_window = training_window
if time_last is None:
time_last = datetime.now()
self.time_last = time_last
if precalculated_features is None:
precalculated_features = Trie(path_constructor=RelationshipPath)
self.precalculated_features = precalculated_features
# total number of features (including dependencies) to be calculated
self.num_features = sum(len(features1) + len(features2) for _, (_, features1, features2) in self.feature_set.feature_trie)
def _build_feature_trie(self):
    """Construct the trie describing all features to calculate.

    Each node's value defaults to a ``(False, set(), set())`` tuple.
    Every target feature (together with its dependencies) is registered
    into the trie via ``self._add_feature_to_trie``, which consults
    ``self.approximate_feature_trie`` for dependencies that can be
    skipped because they are already approximated.

    Returns:
        Trie[RelationshipPath, (bool, set[str], set[str])]
    """
    trie = Trie(default=lambda: (False, set(), set()),
                path_constructor=RelationshipPath)

    for feature in self.target_features:
        self._add_feature_to_trie(trie,
                                  feature,
                                  self.approximate_feature_trie)

    return trie
window (Timedelta or str): frequency to group instances with similar
cutoff times by for features with costly calculations. For example,
if bucket is 24 hours, all instances with cutoff times on the same
day will use the same calculation for expensive features.
entityset (:class:`.EntitySet`): An already initialized entityset.
feature_set (:class:`.FeatureSet`): The features to be calculated.
training_window (`Timedelta`, optional):
Window defining how much older than the cutoff time data
can be to be included when calculating the feature. If None, all older data is used.
save_progress (str, optional): path to save intermediate computational results
'''
approx_fms_trie = Trie(path_constructor=RelationshipPath)
target_time_colname = 'target_time'
cutoff_time[target_time_colname] = cutoff_time['time']
approx_cutoffs = bin_cutoff_times(cutoff_time.copy(), window)
cutoff_df_time_var = 'time'
cutoff_df_instance_var = 'instance_id'
# should this order be by dependencies so that calculate_feature_matrix
# doesn't skip approximating something?
for relationship_path, approx_feature_names in feature_set.approximate_feature_trie:
if not approx_feature_names:
continue
cutoffs_with_approx_e_ids, new_approx_entity_index_var = \
_add_approx_entity_index_var(entityset, feature_set.target_eid,
approx_cutoffs.copy(), relationship_path)
progress_callback (callable): function to be called with incremental progress updates
Returns:
pd.DataFrame : Pandas DataFrame of calculated feature values.
Indexed by instance_ids. Columns in same order as features
passed in.
"""
assert len(instance_ids) > 0, "0 instance ids provided"
if progress_callback is None:
# do nothing for the progress call back if not provided
def progress_callback(*args):
pass
feature_trie = self.feature_set.feature_trie
df_trie = Trie(path_constructor=RelationshipPath)
full_entity_df_trie = Trie(path_constructor=RelationshipPath)
target_entity = self.entityset[self.feature_set.target_eid]
self._calculate_features_for_entity(entity_id=self.feature_set.target_eid,
feature_trie=feature_trie,
df_trie=df_trie,
full_entity_df_trie=full_entity_df_trie,
precalculated_trie=self.precalculated_features,
filter_variable=target_entity.index,
filter_values=instance_ids,
progress_callback=progress_callback)
# The dataframe for the target entity should be stored at the root of
# df_trie.
df = df_trie.value
def __init__(self, features, approximate_feature_trie=None):
"""
Initialize the feature set from the target entity's features.

Args:
features (list[Feature]): Features of the target entity.
approximate_feature_trie (Trie[RelationshipPath, set[str]], optional): Dependency
features to ignore because they have already been approximated. For example, if
one of the target features is a direct feature of a feature A and A is included in
approximate_feature_trie then neither A nor its dependencies will appear in
FeatureSet.feature_trie.
"""
# All given features belong to the same target entity; take its id from the first.
self.target_eid = features[0].entity.id
self.target_features = features
self.target_feature_names = {f.unique_name() for f in features}
if not approximate_feature_trie:
# NOTE(review): the docstring above and gather_approximate_features both treat
# trie values as sets of feature names (default=set elsewhere), so default=list
# looks inconsistent here — confirm whether this should be default=set.
approximate_feature_trie = Trie(default=list,
path_constructor=RelationshipPath)
self.approximate_feature_trie = approximate_feature_trie
# Maps the unique name of each feature to the actual feature. This is necessary
# because features do not support equality and so cannot be used as
# dictionary keys. The equality operator on features produces a new
# feature (which will always be truthy).
self.features_by_name = {f.unique_name(): f for f in features}
# For each dependency (found transitively via get_dependencies(deep=True)),
# record which target features depend on it, keyed by unique name.
feature_dependents = defaultdict(set)
for f in features:
deps = f.get_dependencies(deep=True)
for dep in deps:
feature_dependents[dep.unique_name()].add(f.unique_name())
# Dependencies are also registered in the name -> feature map.
self.features_by_name[dep.unique_name()] = dep
subdeps = dep.get_dependencies(deep=True)