import numpy as np
from numpy import testing as npt
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal
from lifetimes import BetaGeoFitter, ParetoNBDFitter, estimation, utils
from lifetimes.datasets import load_transaction_data
from lifetimes.generate_data import beta_geometric_nbd_model, beta_geometric_nbd_model_transactional_data
from lifetimes.utils import summary_data_from_transaction_data


def test_purchase_predictions_do_not_differ_much_if_looking_at_hourly_or_daily_frequencies():
    transaction_data = load_transaction_data(parse_dates=['date'])
    daily_summary = utils.summary_data_from_transaction_data(transaction_data, 'id', 'date', observation_period_end=max(transaction_data.date), freq='D')
    hourly_summary = utils.summary_data_from_transaction_data(transaction_data, 'id', 'date', observation_period_end=max(transaction_data.date), freq='h')
    thirty_days = 30
    hours_in_day = 24
    mbfg = estimation.ModifiedBetaGeoFitter()

    np.random.seed(0)
    mbfg.fit(daily_summary['frequency'], daily_summary['recency'], daily_summary['T'])
    thirty_day_prediction_from_daily_data = mbfg.expected_number_of_purchases_up_to_time(thirty_days)

    np.random.seed(0)
    mbfg.fit(hourly_summary['frequency'], hourly_summary['recency'], hourly_summary['T'])
    thirty_day_prediction_from_hourly_data = mbfg.expected_number_of_purchases_up_to_time(thirty_days * hours_in_day)

    npt.assert_almost_equal(thirty_day_prediction_from_daily_data, thirty_day_prediction_from_hourly_data)
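

# A minimal sketch (not in the original suite) of the invariant exercised above:
# a fitted model predicts in the time unit it was fit in, so the same calendar
# horizon must be rescaled when switching units. `days_to_model_units` is a
# hypothetical helper.
def days_to_model_units(days, freq):
    """Convert a horizon in days to model units; freq is 'D' or 'h' as above."""
    return days * 24 if freq == 'h' else days


def test_days_to_model_units_sketch():
    assert days_to_model_units(30, 'D') == 30
    assert days_to_model_units(30, 'h') == 720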


def test_expected_cumulative_transactions_date_index(cdnow_transactions):
    """
    Test expected_cumulative_transactions with a date index (set_index_date=True)
    and a BetaGeoFitter: check that freq_multiplier = 1 works and compare the
    last 4 records against tested data.

    dates = ['1997-01-11', '1997-01-12', '1997-01-13', '1997-01-14']
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]
    """
    datetime_col = 'date'
    customer_id_col = 'id_sample'
    t = 14
    datetime_format = '%Y%m%d'
    freq = 'D'
    observation_period_end = '19970930'
    freq_multiplier = 1

    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions, customer_id_col, datetime_col,
        datetime_format=datetime_format, freq=freq,
        freq_multiplier=freq_multiplier,
        observation_period_end=observation_period_end)
    transactions_summary = transactions_summary.reset_index()

    model = BetaGeoFitter()
    model.fit(transactions_summary['frequency'],
              transactions_summary['recency'],
              transactions_summary['T'])

    df_cum = utils.expected_cumulative_transactions(
        model, cdnow_transactions, datetime_col, customer_id_col, t,
        datetime_format, freq, set_index_date=True,
        freq_multiplier=freq_multiplier)
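
    # Compare the last 4 records against the values quoted in the docstring.
    # The 'actual'/'predicted' column names follow the documented output of
    # utils.expected_cumulative_transactions.
    dates = ['1997-01-11', '1997-01-12', '1997-01-13', '1997-01-14']
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]

    date_index = df_cum.index[-4:]
    actual = df_cum['actual'].iloc[-4:].values
    predicted = df_cum['predicted'].iloc[-4:].values.round(2)

    assert all(date_index.strftime('%Y-%m-%d') == dates)
    npt.assert_allclose(actual, actual_trans)
    npt.assert_allclose(predicted, expected_trans, atol=1e-2)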


@pytest.fixture()
def df_cum_transactions(cdnow_transactions):
    datetime_col = 'date'
    customer_id_col = 'id_sample'
    t = 25 * 7
    datetime_format = '%Y%m%d'
    freq = 'D'
    observation_period_end = '19970930'
    freq_multiplier = 7

    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions, customer_id_col, datetime_col,
        datetime_format=datetime_format, freq=freq, freq_multiplier=freq_multiplier,
        observation_period_end=observation_period_end)
    transactions_summary = transactions_summary.reset_index()

    model = ParetoNBDFitter()
    model.fit(transactions_summary['frequency'],
              transactions_summary['recency'],
              transactions_summary['T'])

    df_cum = utils.expected_cumulative_transactions(
        model, cdnow_transactions, datetime_col, customer_id_col, t,
        datetime_format, freq, set_index_date=False, freq_multiplier=freq_multiplier)
    return df_cum
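

# A minimal usage sketch (hypothetical, not a test from the original suite):
# any test that names `df_cum_transactions` as a parameter receives the frame
# built above; 'actual' and 'predicted' are the documented output columns of
# utils.expected_cumulative_transactions.
def test_df_cum_transactions_has_expected_columns(df_cum_transactions):
    assert {'actual', 'predicted'}.issubset(df_cum_transactions.columns)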


def test_summary_data_from_transaction_data_squashes_period_purchases_to_one_purchase():
    transactions = pd.DataFrame([[1, '2015-01-01'], [1, '2015-01-01']], columns=['id', 't'])
    actual = utils.summary_data_from_transaction_data(transactions, 'id', 't', freq='W')
    # Two purchases in the same week collapse to one purchase period, and
    # frequency counts only repeat periods: 1 period - 1 = 0.
    assert actual.loc[1]['frequency'] == 0.0


def test_summary_data_from_transaction_data_with_specific_non_daily_frequency(large_transaction_level_data):
    today = '20150207'
    actual = utils.summary_data_from_transaction_data(large_transaction_level_data, 'id', 'date', observation_period_end=today, freq='W')
    expected = pd.DataFrame([[1, 1., 5., 5.],
                             [2, 0., 0., 5.],
                             [3, 1., 1., 5.],
                             [4, 1., 3., 3.],
                             [5, 0., 0., 3.],
                             [6, 0., 0., 0.]], columns=['id', 'frequency', 'recency', 'T']).set_index('id')
    assert_frame_equal(actual, expected)
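

# A hand-computed sketch (hypothetical test) of the weekly units used above:
# with freq='W', frequency counts repeat purchase-weeks, recency is the number
# of weekly periods between first and last purchase, and T runs from the first
# purchase to the observation period end.
def test_weekly_units_by_hand():
    transactions = pd.DataFrame({'id': [1, 1], 'date': ['2015-01-01', '2015-02-05']})
    summary = utils.summary_data_from_transaction_data(
        transactions, 'id', 'date', observation_period_end='2015-02-07', freq='W')
    assert summary.loc[1]['frequency'] == 1.0  # one repeat purchase-week
    assert summary.loc[1]['recency'] == 5.0    # 35 days = 5 weekly periods
    assert summary.loc[1]['T'] == 5.0          # first purchase to period end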


def test_summary_data_from_transaction_data_will_choose_the_correct_first_order_to_drop_in_monetary_transactions():
    # this is the correct behaviour. See https://github.com/CamDavidsonPilon/lifetimes/issues/85
    # and test_summary_statistics_are_identical_to_hardies_paper_confirming_correct_aggregations
    cust = pd.Series([2, 2, 2])
    dates_ordered = pd.to_datetime(pd.Series([
        '2014-03-14 00:00:00',
        '2014-04-09 00:00:00',
        '2014-05-21 00:00:00']))
    sales = pd.Series([10, 20, 25])
    transaction_data = pd.DataFrame({'date': dates_ordered, 'id': cust, 'sales': sales})
    summary_ordered_data = utils.summary_data_from_transaction_data(transaction_data, 'id', 'date', 'sales')

    dates_unordered = pd.to_datetime(pd.Series([
        '2014-04-09 00:00:00',
        '2014-03-14 00:00:00',
        '2014-05-21 00:00:00']))
    sales = pd.Series([20, 10, 25])
    transaction_data = pd.DataFrame({'date': dates_unordered, 'id': cust, 'sales': sales})
    summary_unordered_data = utils.summary_data_from_transaction_data(transaction_data, 'id', 'date', 'sales')

    assert_frame_equal(summary_ordered_data, summary_unordered_data)
    # monetary_value averages the repeat purchases only: (20 + 25) / 2 = 22.5;
    # the chronologically first order (10) is dropped regardless of row order.
    assert summary_ordered_data['monetary_value'].loc[2] == 22.5


def test_summary_statistics_are_identical_to_hardies_paper_confirming_correct_aggregations():
    # see http://brucehardie.com/papers/rfm_clv_2005-02-16.pdf
    # RFM and CLV: Using Iso-value Curves for Customer Base Analysis
    df = pd.read_csv('lifetimes/datasets/CDNOW_sample.txt', sep=r'\s+', header=None,
                     names=['_id', 'id', 'date', 'cds_bought', 'spent'])
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df_train = df[df['date'] < '1997-10-01']
    summary = utils.summary_data_from_transaction_data(df_train, 'id', 'date', 'spent')
    results = summary[summary['frequency'] > 0]['monetary_value'].describe()

    assert np.round(results.loc['mean']) == 35
    assert np.round(results.loc['std']) == 30
    assert np.round(results.loc['min']) == 3
    assert np.round(results.loc['50%']) == 27
    assert np.round(results.loc['max']) == 300
    assert np.round(results.loc['count']) == 946


def test_summary_data_from_transaction_data_with_monetary_values(large_transaction_level_data_with_monetary_value):
    today = '20150207'
    actual = utils.summary_data_from_transaction_data(large_transaction_level_data_with_monetary_value, 'id', 'date', monetary_value_col='monetary_value', observation_period_end=today)
    expected = pd.DataFrame([[1, 1., 36., 37., 2],
                             [2, 0., 0., 37., 0],
                             [3, 2., 4., 37., 3],
                             [4, 2., 20., 22., 3],
                             [5, 2., 2., 22., 4.5],
                             [6, 0., 0., 5., 0]], columns=['id', 'frequency', 'recency', 'T', 'monetary_value']).set_index('id')
    assert_frame_equal(actual, expected)
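

# The original parametrization for the test below is not shown; the values here
# are illustrative assumptions (r, alpha, a, b are the familiar CDNOW BG/NBD
# estimates) so the snippet is self-contained.
@pytest.mark.parametrize(
    'T, r, alpha, a, b, observation_period_end, freq, size',
    [(100, 0.243, 4.414, 0.793, 2.426, '2019-1-1', 'D', 500)])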
def test_beta_geometric_nbd_model_transactional_data(T, r, alpha, a, b, observation_period_end, freq, size):
    np.random.seed(188898)
    transaction_data = beta_geometric_nbd_model_transactional_data(
        T=T, r=r, alpha=alpha, a=a, b=b,
        observation_period_end=observation_period_end, freq=freq, size=size)
    actual = summary_data_from_transaction_data(transactions=transaction_data,
                                                customer_id_col='customer_id', datetime_col='date',
                                                observation_period_end=observation_period_end,
                                                freq=freq)

    np.random.seed(188898)
    expected = beta_geometric_nbd_model(T=T, r=r, alpha=alpha, a=a, b=b, size=size)[['frequency', 'recency', 'T']]
    expected['recency'] = expected['recency'].apply(np.ceil)

    expected = expected.reset_index(drop=True)
    actual = actual.reset_index(drop=True)
    assert expected.equals(actual)


def test_summary_data_from_transaction_data_returns_correct_results(transaction_level_data):
    today = '2015-02-07'
    actual = utils.summary_data_from_transaction_data(transaction_level_data, 'id', 'date', observation_period_end=today)
    expected = pd.DataFrame([[1, 1., 5., 6.],
                             [2, 0., 0., 37.],
                             [3, 2., 4., 37.]], columns=['id', 'frequency', 'recency', 'T']).set_index('id')
    assert_frame_equal(actual, expected)