How to use the featuretools.EntitySet class in featuretools

To help you get started, we’ve selected a few featuretools examples, based on popular ways featuretools.EntitySet is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pan5431333 / featuretools4s / featuretools4s / featuretools4s.py View on Github external
def run_single_partition(iterator,
                             all_columns: list,
                             es_id: str,
                             entities: list,
                             relationships: list):
        """Build a featuretools EntitySet from the rows of one partition.

        The rows yielded by ``iterator`` are loaded into a single wide
        DataFrame with ``all_columns`` as its column names. For every item in
        ``entities`` the item's columns are sliced out, de-duplicated, renamed
        through ``EntitySpark.recover_col_name`` and registered as an entity of
        the EntitySet. Nothing is built when the partition is empty.

        NOTE(review): this excerpt ends after the entity-registration loop;
        ``relationships`` is not used in the visible part and no return
        statement is visible.

        Args:
            iterator: Iterable of rows; presumably the records of a Spark
                partition -- TODO confirm against the caller.
            all_columns: Column names for the DataFrame built from the rows.
            es_id: Id given to the created ``ft.EntitySet``.
            entities: Objects exposing ``columns``, ``entity_id``, ``index``,
                ``variable_types``, ``time_index`` and ``secondary_time_index``.
            relationships: Not referenced in the visible part of the function.
        """
        # Materialize the iterator so emptiness can be checked before use.
        list_iter = list(iterator)

        if len(list_iter) > 0:
            data = pd.DataFrame(list_iter, columns=all_columns)

            es = ft.EntitySet(id=es_id)
            for entity in entities:
                columns = entity.columns

                # TODO drop_duplicates here is TOO expensive. How to avoid using it?
                df = data[columns].drop_duplicates()
                entity_id = entity.entity_id
                # Rename the sliced columns; presumably this maps the wide-table
                # names back to the entity's own column names -- verify against
                # EntitySpark.recover_col_name.
                df.columns = [EntitySpark.recover_col_name(entity_id, col) for col in columns]

                # index / time_index / secondary_time_index go through the same
                # renaming helper as the columns above.
                es.entity_from_dataframe(entity_id=entity.entity_id,
                                         dataframe=df,
                                         index=EntitySpark.recover_col_name(entity_id, entity.index),
                                         variable_types=entity.variable_types,
                                         time_index=EntitySpark.recover_col_name(entity_id, entity.time_index),
                                         secondary_time_index=EntitySpark.recover_col_name(entity_id,
                                                                                           entity.secondary_time_index))
github FeatureLabs / featuretools / featuretools / demo / mock_customer.py View on Github external
transactions_df = pd.DataFrame({"transaction_id": range(1, n_transactions + 1)})
    transactions_df["session_id"] = choice(sessions_df["session_id"], n_transactions)
    transactions_df = transactions_df.sort_values("session_id").reset_index(drop=True)
    transactions_df["transaction_time"] = pd.date_range('1/1/2014', periods=n_transactions, freq='65s')  # todo make these less regular
    transactions_df["product_id"] = pd.Categorical(choice(products_df["product_id"], n_transactions))
    transactions_df["amount"] = random.randint(500, 15000, n_transactions) / 100

    # calculate and merge in session start
    # based on the times we came up with for transactions
    session_starts = transactions_df.drop_duplicates("session_id")[["session_id", "transaction_time"]].rename(columns={"transaction_time": "session_start"})
    sessions_df = sessions_df.merge(session_starts)

    if return_single_table:
        return transactions_df.merge(sessions_df).merge(customers_df).merge(products_df).reset_index(drop=True)
    elif return_entityset:
        es = ft.EntitySet(id="transactions")
        es = es.entity_from_dataframe(entity_id="transactions",
                                      dataframe=transactions_df,
                                      index="transaction_id",
                                      time_index="transaction_time",
                                      variable_types={"product_id": Categorical})

        es = es.entity_from_dataframe(entity_id="products",
                                      dataframe=products_df,
                                      index="product_id")

        es = es.entity_from_dataframe(entity_id="sessions",
                                      dataframe=sessions_df,
                                      index="session_id",
                                      time_index="session_start")

        es = es.entity_from_dataframe(entity_id="customers",
github intel-analytics / analytics-zoo / pyzoo / zoo / automl / feature / time_sequence.py View on Github external
def _generate_features(self, input_df):
        """Set up an EntitySet and custom time primitives for ``input_df``.

        Works on a copy of ``input_df``, adds an ``"id"`` column, registers the
        copy as the ``"time_seq"`` entity (time index ``self.dt_col``) and
        defines hour-of-day helper functions used to build transform
        primitives.

        NOTE(review): this excerpt stops after ``IsAwake`` is created; the
        remainder of the method (including any return value) is not visible.
        """
        # Copy so the caller's frame does not receive the extra "id" column.
        df = input_df.copy()
        # "id" is the index shifted by one; assumes a numeric index -- TODO confirm.
        df["id"] = df.index + 1

        es = ft.EntitySet(id="data")
        es = es.entity_from_dataframe(entity_id="time_seq",
                                      dataframe=df,
                                      index="id",
                                      time_index=self.dt_col)

        def is_awake(column):
            """Return 1 where the hour is 6..23 or 0, otherwise 0."""
            hour = column.dt.hour
            return (((hour >= 6) & (hour <= 23)) | (hour == 0)).astype(int)

        def is_busy_hours(column):
            """Return 1 where the hour is 7..9 or 16..19, otherwise 0."""
            hour = column.dt.hour
            # `&` binds tighter than `|`, so the unparenthesized right-hand side
            # still groups as (hour >= 16) & (hour <= 19).
            return (((hour >= 7) & (hour <= 9)) | (hour >= 16) & (hour <= 19)).astype(int)

        # Wrap is_awake as a transform primitive over the time-index column.
        IsAwake = make_trans_primitive(function=is_awake,
                                       input_types=[DatetimeTimeIndex],
                                       return_type=Numeric)
github HDI-Project / MLBlocks / examples / pipelines / multitable / multitable.py View on Github external
def make_entity_set(orders_table, order_products_table):
    """Create the "instacart" EntitySet from the two given tables.

    Registers ``order_products_table`` as the ``"order_products"`` entity and
    ``orders_table`` as the ``"orders"`` entity. Both use ``"order_time"`` as
    their time index.

    NOTE(review): this excerpt ends after the second entity is added; no
    relationship or return statement is visible here.

    Args:
        orders_table: DataFrame with at least ``order_id`` and ``order_time``.
        order_products_table: DataFrame with at least ``order_product_id``,
            ``order_time``, ``aisle_id`` and ``reordered``.
    """
    es = ft.EntitySet("instacart")

    # Explicit variable types override whatever featuretools would infer for
    # these two columns.
    es.entity_from_dataframe(
        entity_id="order_products",
        dataframe=order_products_table,
        index="order_product_id",
        variable_types={
            "aisle_id": ft.variable_types.Categorical,
            "reordered": ft.variable_types.Boolean
        },
        time_index="order_time")

    es.entity_from_dataframe(
        entity_id="orders",
        dataframe=orders_table,
        index="order_id",
        time_index="order_time")
github FeatureLabs / featuretools / featuretools / demo / flight.py View on Github external
def make_es(data):
    es = ft.EntitySet('Flight Data')
    arr_time_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
                        'national_airspace_delay', 'security_delay',
                        'late_aircraft_delay', 'canceled', 'diverted',
                        'taxi_in', 'taxi_out', 'air_time', 'dep_time']

    variable_types = {'flight_num': vtypes.Categorical,
                      'distance_group': vtypes.Ordinal,
                      'canceled': vtypes.Boolean,
                      'diverted': vtypes.Boolean}

    es.entity_from_dataframe('trip_logs',
                             data,
                             index='trip_log_id',
                             make_index=True,
                             time_index='date_scheduled',
                             secondary_time_index={'arr_time': arr_time_columns},
github HDI-Project / Trane / trane / utils / featuretools_wrapper.py View on Github external
def __init__(self, df, entity_col, time_col, variable_types, name):
        """Build a two-entity EntitySet from a single dataframe.

        The main entity (id ``name``) wraps ``df`` with a generated
        ``"__id__"`` index. A second entity (id ``entity_col``) holds the
        distinct values of ``df[entity_col]``. The two are linked through the
        ``entity_col`` column.

        Args:
            df: Source dataframe; must contain ``entity_col``.
            entity_col: Column whose distinct values form the second entity.
            time_col: Column used as the time index of the main entity.
            variable_types: Passed through to ``entity_from_dataframe`` for
                the main entity.
            name: Id of the EntitySet and of the main entity; must differ
                from ``entity_col``.
        """
        assert name != entity_col

        self.entity_col = entity_col

        # Main entity: the raw dataframe with a freshly generated index column.
        self.es = ft.EntitySet(id=name)
        main_entity_kwargs = dict(
            entity_id=name,
            dataframe=df,
            index="__id__",
            make_index=True,
            time_index=time_col,
            variable_types=variable_types,
        )
        self.es = self.es.entity_from_dataframe(**main_entity_kwargs)

        # Second entity: one row per distinct value of the entity column.
        distinct_values = set(df[entity_col])
        rows = [[value] for value in distinct_values]
        entity_df = pd.DataFrame(rows, columns=[entity_col])
        self.es = self.es.entity_from_dataframe(
            entity_id=entity_col,
            index=entity_col,
            dataframe=entity_df,
        )

        # Link the distinct-values entity to the main entity on entity_col.
        parent_variable = self.es[entity_col][entity_col]
        child_variable = self.es[name][entity_col]
        self.es = self.es.add_relationship(
            ft.Relationship(parent_variable, child_variable)
        )