Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def run_single_partition(iterator,
all_columns: list,
es_id: str,
entities: list,
relationships: list):
    """Build a featuretools EntitySet from one Spark partition's rows.

    NOTE(review): this fragment is truncated — the `relationships`
    parameter is never used in the visible span, and no value is
    returned here; presumably the full function adds the relationships
    and runs DFS further down. Confirm against the complete source.

    Args:
        iterator: Spark partition iterator yielding row tuples.
        all_columns: column names matching the tuples in `iterator`.
        es_id: id for the EntitySet being built.
        entities: entity descriptors (each exposes .columns, .entity_id,
            .index, .variable_types, .time_index, .secondary_time_index).
        relationships: relationship descriptors (unused in visible code).
    """
# Materialize the iterator: it is consumed once for the emptiness check
# and again (as a list) to build the DataFrame.
list_iter = list(iterator)
# Skip empty partitions entirely — no EntitySet is built for them.
if len(list_iter) > 0:
data = pd.DataFrame(list_iter, columns=all_columns)
es = ft.EntitySet(id=es_id)
for entity in entities:
columns = entity.columns
# TODO drop_duplicates here is TOO expensive. How to avoid using it?
df = data[columns].drop_duplicates()
entity_id = entity.entity_id
# Rename the slice's columns back to their per-entity names before
# registering the frame (recover_col_name undoes an earlier prefixing
# scheme — assumed; confirm against EntitySpark).
df.columns = [EntitySpark.recover_col_name(entity_id, col) for col in columns]
es.entity_from_dataframe(entity_id=entity.entity_id,
dataframe=df,
index=EntitySpark.recover_col_name(entity_id, entity.index),
variable_types=entity.variable_types,
time_index=EntitySpark.recover_col_name(entity_id, entity.time_index),
secondary_time_index=EntitySpark.recover_col_name(entity_id,
entity.secondary_time_index))
# NOTE(review): fragment of a mock-data generator — `n_transactions`,
# `sessions_df`, `products_df`, `customers_df`, `choice`, `random`,
# `return_single_table`, and `return_entityset` are all defined outside
# this span, and the final call at the end is cut off mid-statement.
# Build one synthetic transaction row per id, 1..n_transactions.
transactions_df = pd.DataFrame({"transaction_id": range(1, n_transactions + 1)})
# `choice` presumably is numpy.random.choice — sample sessions/products
# with replacement (TODO confirm the import).
transactions_df["session_id"] = choice(sessions_df["session_id"], n_transactions)
transactions_df = transactions_df.sort_values("session_id").reset_index(drop=True)
transactions_df["transaction_time"] = pd.date_range('1/1/2014', periods=n_transactions, freq='65s') # todo make these less regular
transactions_df["product_id"] = pd.Categorical(choice(products_df["product_id"], n_transactions))
# `random` here behaves like numpy.random (randint with a size arg);
# amounts are cents scaled to dollars in [5.00, 150.00).
transactions_df["amount"] = random.randint(500, 15000, n_transactions) / 100
# calculate and merge in session start
# based on the times we came up with for transactions
# First transaction per session (rows are sorted by session_id, and
# transaction_time is monotonically increasing) becomes session_start.
session_starts = transactions_df.drop_duplicates("session_id")[["session_id", "transaction_time"]].rename(columns={"transaction_time": "session_start"})
sessions_df = sessions_df.merge(session_starts)
if return_single_table:
# Denormalize everything into one wide table.
return transactions_df.merge(sessions_df).merge(customers_df).merge(products_df).reset_index(drop=True)
elif return_entityset:
# Otherwise register each table as a featuretools entity.
es = ft.EntitySet(id="transactions")
es = es.entity_from_dataframe(entity_id="transactions",
dataframe=transactions_df,
index="transaction_id",
time_index="transaction_time",
variable_types={"product_id": Categorical})
es = es.entity_from_dataframe(entity_id="products",
dataframe=products_df,
index="product_id")
es = es.entity_from_dataframe(entity_id="sessions",
dataframe=sessions_df,
index="session_id",
time_index="session_start")
# NOTE(review): the statement below is truncated by the excerpt.
es = es.entity_from_dataframe(entity_id="customers",
def _generate_features(self, input_df):
    """Prepare a featuretools EntitySet and custom time primitives.

    NOTE(review): truncated fragment — no return is visible, and
    `is_busy_hours` is never wrapped in make_trans_primitive here;
    presumably the full method defines more primitives and runs DFS.
    """
# Work on a copy so the caller's frame is not mutated.
df = input_df.copy()
# Synthesize a 1-based surrogate index column for the entity.
df["id"] = df.index + 1
es = ft.EntitySet(id="data")
es = es.entity_from_dataframe(entity_id="time_seq",
dataframe=df,
index="id",
time_index=self.dt_col)
def is_awake(column):
# 1 for hours 6..23 and midnight (hour 0), else 0.
hour = column.dt.hour
return (((hour >= 6) & (hour <= 23)) | (hour == 0)).astype(int)
def is_busy_hours(column):
# 1 for morning rush (7-9) or evening rush (16-19), else 0.
# NOTE(review): `&` binds tighter than `|`, so this parses as
# (7<=h<=9) | ((h>=16) & (h<=19)) — apparently the intent, but the
# mixed parenthesization is easy to misread.
hour = column.dt.hour
return (((hour >= 7) & (hour <= 9)) | (hour >= 16) & (hour <= 19)).astype(int)
# Wrap is_awake as a featuretools transform primitive over a
# datetime time index, returning a numeric (0/1) feature.
IsAwake = make_trans_primitive(function=is_awake,
input_types=[DatetimeTimeIndex],
return_type=Numeric)
def make_entity_set(orders_table, order_products_table):
    """Register the Instacart tables as featuretools entities.

    NOTE(review): likely truncated — no relationship between orders and
    order_products is added and nothing is returned in the visible span.

    Args:
        orders_table: DataFrame of orders, indexed by "order_id".
        order_products_table: DataFrame of order line items, indexed by
            "order_product_id".
    """
es = ft.EntitySet("instacart")
es.entity_from_dataframe(
entity_id="order_products",
dataframe=order_products_table,
index="order_product_id",
variable_types={
"aisle_id": ft.variable_types.Categorical,
"reordered": ft.variable_types.Boolean
},
time_index="order_time")
es.entity_from_dataframe(
entity_id="orders",
dataframe=orders_table,
index="order_id",
time_index="order_time")
def make_es(data):
    """Build a flight-data EntitySet from the raw trip log table.

    NOTE(review): truncated fragment — the entity_from_dataframe call at
    the end is cut off mid-argument-list by the excerpt.
    """
es = ft.EntitySet('Flight Data')
# Columns only known at arrival time; registered below as a secondary
# time index keyed on 'arr_time' so featuretools won't leak them
# before the flight lands.
arr_time_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
'national_airspace_delay', 'security_delay',
'late_aircraft_delay', 'canceled', 'diverted',
'taxi_in', 'taxi_out', 'air_time', 'dep_time']
# Override inferred types for columns pandas would mis-detect.
variable_types = {'flight_num': vtypes.Categorical,
'distance_group': vtypes.Ordinal,
'canceled': vtypes.Boolean,
'diverted': vtypes.Boolean}
es.entity_from_dataframe('trip_logs',
data,
index='trip_log_id',
make_index=True,
time_index='date_scheduled',
secondary_time_index={'arr_time': arr_time_columns},
def __init__(self, df, entity_col, time_col, variable_types, name):
    """Build a two-entity featuretools EntitySet from a flat dataframe.

    The raw rows become the main entity (with a generated "__id__"
    surrogate index), and the distinct values of ``entity_col`` become a
    parent entity linked to it by a one-to-many relationship.

    Args:
        df: pandas DataFrame of raw observations.
        entity_col: column identifying the parent entity of each row.
        time_col: column used as the main entity's time index.
        variable_types: mapping of column name -> featuretools variable
            type, forwarded to entity_from_dataframe.
        name: id for both the EntitySet and the main entity; must differ
            from ``entity_col`` so the two entity ids are distinct.

    Raises:
        ValueError: if ``name`` equals ``entity_col``.
    """
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, which would silently allow colliding entity ids.
    if name == entity_col:
        raise ValueError("name must differ from entity_col")
    self.entity_col = entity_col
    self.es = ft.EntitySet(id=name)
    # Main entity: one row per observation; featuretools generates the
    # "__id__" index because make_index=True.
    self.es = self.es.entity_from_dataframe(entity_id=name,
                                            dataframe=df,
                                            time_index=time_col,
                                            index="__id__",
                                            make_index=True,
                                            variable_types=variable_types
                                            )
    # Parent entity: one row per distinct entity value. Series.unique()
    # preserves first-seen order, unlike iterating a set, which made the
    # row order (and thus the EntitySet) nondeterministic across runs.
    entity_df = pd.DataFrame({entity_col: df[entity_col].unique()})
    self.es = self.es.entity_from_dataframe(entity_id=entity_col,
                                            dataframe=entity_df,
                                            index=entity_col
                                            )
    # One-to-many link: each raw row points at its parent entity value.
    new_relationship = ft.Relationship(self.es[entity_col][entity_col],
                                       self.es[name][entity_col])
    self.es = self.es.add_relationship(new_relationship)