This is a transactional data set containing all the transactions that occurred between 01/12/2010 and 09/12/2011 for a UK-based, registered, non-store online retailer. The company mainly sells unique all-occasion gifts, and many of its customers are wholesalers.
Data Description
Feature | Description |
---|---|
InvoiceNo | Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation. |
StockCode | Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product. |
Description | Product (item) name. Nominal. |
Quantity | The quantities of each product (item) per transaction. Numeric. |
InvoiceDate | Invoice date and time. Numeric, the day and time when each transaction was generated. |
UnitPrice | Unit price. Numeric, product price per unit in sterling. |
CustomerID | Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer. |
Country | Country name. Nominal, the name of the country where each customer resides. |
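The notebook below loads a pre-processed parquet file with lower-case column names and a derived total_sales column (quantity × unit_price). The exact preprocessing is not shown here; the following is only a minimal sketch of how such a file could be produced, with the raw file location and the row filtering treated as assumptions.
# Hedged preprocessing sketch -- raw file path and filtering are assumptions, not the notebook's actual pipeline.
import pandas as pd
raw = pd.read_excel("data/raw/Online Retail.xlsx")   # hypothetical location of the raw UCI file
raw = raw.dropna(subset=["CustomerID"])              # keep rows with a known customer (assumed filter)
raw = raw.rename(columns={
    "CustomerID": "customer_id",
    "InvoiceNo": "invoice_no",
    "InvoiceDate": "invoice_date",
    "Quantity": "quantity",
    "UnitPrice": "unit_price",
})
raw["total_sales"] = raw["quantity"] * raw["unit_price"]   # line-item revenue
cols = ["customer_id", "invoice_no", "invoice_date", "quantity", "unit_price", "total_sales"]
raw[cols].to_parquet("data/processed/online_retail.parquet.gzip", compression="gzip")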
import numpy as np
import pandas as pd
import os,sys,time
import re
from datetime import datetime
time_start_notebook = time.time()
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# settings
SEED = 100
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
# Statistical LTV
import lifetimes
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import calibration_and_holdout_data, summary_data_from_transaction_data
# ML Approach to LTV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots
import xgboost
# Evaluation
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
%matplotlib inline
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
numpy          : 1.21.1
lifetimes      : 0.11.3
xgboost        : 1.4.2
sklearn        : 0.24.2
pandas         : 1.2.5
autopep8       : 1.5.7
tensorflow_docs: 0.0.0ab165fda010d80d82548b3797c1eaaf5803a00e8-
sys            : 3.9.5 (default, May 18 2021, 12:31:01) [Clang 10.0.0]
tensorflow     : 2.5.0
re             : 2.2.1
json           : 2.0.9
seaborn        : 0.11.1
bhishan        : 0.6
matplotlib     : 3.3.4
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bp/")
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bp/bhishan")
from bhishan import bp
ifile = "data/processed/online_retail.parquet.gzip"
df = pd.read_parquet(ifile)
print(df.shape)
df.head(2).append(df.tail(2))
(354321, 6)
 | customer_id | invoice_no | invoice_date | quantity | unit_price | total_sales |
---|---|---|---|---|---|---|
0 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 2.55 | 15.30 |
1 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 | 20.34 |
541892 | 13113.0 | 581586 | 2011-12-09 12:49:00 | 24 | 8.95 | 214.80 |
541893 | 13113.0 | 581586 | 2011-12-09 12:49:00 | 10 | 7.08 | 70.80 |
# derive calendar features from the invoice timestamp
df['date'] = pd.to_datetime(df.invoice_date.dt.date)
df['time'] = df.invoice_date.dt.time
df['hour'] = df['time'].apply(lambda x: x.hour)
df['weekend'] = df['date'].apply(lambda x: x.weekday() in [5, 6])  # Saturday or Sunday
df['dayofweek'] = df['date'].apply(lambda x: x.dayofweek)          # Monday=0 ... Sunday=6
df.groupby('date')['quantity'].sum().plot()
[Figure: total quantity sold per day over the full date range]
df_lookup = df[['customer_id', 'invoice_no', 'date']]
df_lookup = df_lookup.drop_duplicates()
df_lookup = df_lookup.set_index('invoice_no')
df_lookup.head()
invoice_no | customer_id | date |
---|---|---|
536365 | 17850.0 | 2010-12-01 |
536366 | 17850.0 | 2010-12-01 |
536367 | 13047.0 | 2010-12-01 |
536368 | 13047.0 | 2010-12-01 |
536369 | 13047.0 | 2010-12-01 |
df_invoice_sales = df.groupby('invoice_no')[['total_sales']].sum()
df_invoice_sales.head()
invoice_no | total_sales |
---|---|
536365 | 139.12 |
536366 | 22.20 |
536367 | 278.73 |
536368 | 70.05 |
536369 | 17.85 |
df_tran = df_lookup.join(df_invoice_sales)
df_tran.head()
invoice_no | customer_id | date | total_sales |
---|---|---|---|
536365 | 17850.0 | 2010-12-01 | 139.12 |
536366 | 17850.0 | 2010-12-01 | 22.20 |
536367 | 13047.0 | 2010-12-01 | 278.73 |
536368 | 13047.0 | 2010-12-01 | 70.05 |
536369 | 13047.0 | 2010-12-01 | 17.85 |
from lifetimes.utils import calibration_and_holdout_data
df_rfm = calibration_and_holdout_data(
df_tran,'customer_id','date',
calibration_period_end='2011-09-10',
monetary_value_col='total_sales'
)
# keep only customers with positive monetary value (required by the Gamma-Gamma model)
df_rfm = df_rfm.loc[df_rfm['monetary_value_cal'] > 0, :]
df_rfm.head(2)
customer_id | frequency_cal | recency_cal | T_cal | monetary_value_cal | frequency_holdout | monetary_value_holdout | duration_holdout |
---|---|---|---|---|---|---|---|
12747.0 | 7.0 | 260.0 | 279.0 | 344.405714 | 3.0 | 475.536667 | 90.0 |
12748.0 | 73.0 | 282.0 | 283.0 | 193.189041 | 39.0 | 254.701039 | 90.0 |
from lifetimes import BetaGeoFitter
bgf = BetaGeoFitter(penalizer_coef=0.1)
bgf.fit(df_rfm['frequency_cal'],
df_rfm['recency_cal'],
df_rfm['T_cal'])
bgf.summary
parameter | coef | se(coef) | lower 95% bound | upper 95% bound |
---|---|---|---|---|
r | 1.001046 | 0.028732 | 0.944731 | 1.057361 |
alpha | 53.749212 | 2.160589 | 49.514458 | 57.983966 |
a | 0.005329 | 0.003397 | -0.001329 | 0.011986 |
b | 0.075781 | 0.043000 | -0.008500 | 0.160062 |
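Before trusting the fitted parameters, it can help to compare predicted and actual purchase counts on the holdout period. This check is not part of the original notebook; a minimal sketch using the plotting helper that ships with lifetimes:
# Diagnostic plot: predicted vs. actual purchases in the holdout period, grouped by calibration frequency.
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
plot_calibration_purchases_vs_holdout_purchases(bgf, df_rfm)
plt.show()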
# To fit the Gamma-Gamma model, we first need to make sure that
# the monetary value and frequency are not correlated,
# as this is one of the basic assumptions of the model.
df_rfm[['monetary_value_cal', 'frequency_cal']].corr()
 | monetary_value_cal | frequency_cal |
---|---|---|
monetary_value_cal | 1.000000 | 0.078159 |
frequency_cal | 0.078159 | 1.000000 |
ggf = GammaGammaFitter(penalizer_coef = 0)
ggf.fit(df_rfm['frequency_cal'],
df_rfm['monetary_value_cal']);
ggf.summary
parameter | coef | se(coef) | lower 95% bound | upper 95% bound |
---|---|---|---|---|
p | 2.301668 | 0.150103 | 2.007466 | 2.595871 |
q | 3.716938 | 0.194718 | 3.335291 | 4.098585 |
v | 453.716003 | 49.860609 | 355.989209 | 551.442798 |
Prediction is done in three steps: the BG/NBD model predicts the expected number of purchases over the next t days, the Gamma-Gamma model predicts the expected average order value, and the two are multiplied to give expected sales per customer.
# Predict the expected number of purchases
t = 89  # length of the prediction window in days
pred_num_sales = bgf.predict(t,
df_rfm['frequency_cal'],
df_rfm['recency_cal'],
df_rfm['T_cal'])
pred_num_sales = pred_num_sales.fillna(0)
# Predict the average order value
pred_monetary = ggf.conditional_expected_average_profit(
df_rfm['frequency_cal'],
df_rfm['monetary_value_cal'])
# Putting it all together
pred_sales = pred_num_sales * pred_monetary
pred_sales.head(2)
customer_id
12747.0     747.715246
12748.0    3834.958257
dtype: float64
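lifetimes also wraps these steps in a single call that additionally discounts future revenue. A minimal sketch, assuming a 3-month horizon and an illustrative 1% monthly discount rate (neither value is used elsewhere in this notebook):
# Discounted CLV from the fitted BG/NBD and Gamma-Gamma models.
clv = ggf.customer_lifetime_value(
    bgf,                              # fitted transaction model
    df_rfm['frequency_cal'],
    df_rfm['recency_cal'],
    df_rfm['T_cal'],
    df_rfm['monetary_value_cal'],
    time=3,                           # months to project (assumption)
    discount_rate=0.01,               # monthly discount rate (assumption)
    freq='D',                         # the summary data is expressed in days
)
clv.head(2)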
from sklearn import metrics as skmetrics
df_rfm.head(2)
customer_id | frequency_cal | recency_cal | T_cal | monetary_value_cal | frequency_holdout | monetary_value_holdout | duration_holdout |
---|---|---|---|---|---|---|---|
12747.0 | 7.0 | 260.0 | 279.0 | 344.405714 | 3.0 | 475.536667 | 90.0 |
12748.0 | 73.0 | 282.0 | 283.0 | 193.189041 | 39.0 | 254.701039 | 90.0 |
actual_sales = df_rfm['monetary_value_holdout'] * df_rfm['frequency_holdout']
actual_sales.head(2)
customer_id
12747.0    1426.610000
12748.0    9933.340519
dtype: float64
def evaluate(actual_sales, pred_sales):
    mae = skmetrics.mean_absolute_error(actual_sales, pred_sales)
    mse = skmetrics.mean_squared_error(actual_sales, pred_sales)
    r2 = skmetrics.r2_score(actual_sales, pred_sales)
    print(f"MAE : {mae:,.2f}")
    print(f"MSE : {mse:,.0f}")
    print(f"R2-score: {r2:,.2f}")
    # residual plot: actual minus predicted for each customer
    plt.scatter(x=range(len(actual_sales)), y=actual_sales - pred_sales)
    plt.show()
evaluate(actual_sales, pred_sales)
MAE : 578.14
MSE : 4,072,183
R2-score: 0.45
df.head(2)
 | customer_id | invoice_no | invoice_date | quantity | unit_price | total_sales | date | time | hour | weekend | dayofweek |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 2.55 | 15.30 | 2010-12-01 | 08:26:00 | 8 | False | 2 |
1 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 | 20.34 | 2010-12-01 | 08:26:00 | 8 | False | 2 |
df_rfm.head(2)
customer_id | frequency_cal | recency_cal | T_cal | monetary_value_cal | frequency_holdout | monetary_value_holdout | duration_holdout |
---|---|---|---|---|---|---|---|
12747.0 | 7.0 | 260.0 | 279.0 | 344.405714 | 3.0 | 475.536667 | 90.0 |
12748.0 | 73.0 | 282.0 | 283.0 | 193.189041 | 39.0 | 254.701039 | 90.0 |
def get_features(df, df_rfm, feature_start, feature_end,
                 target_start, target_end):
    # period lengths
    features_df = df.loc[(df.date >= feature_start) & (df.date <= feature_end), :]
    print(f'Using df from {(pd.to_datetime(feature_end) - pd.to_datetime(feature_start)).days} days', end=' ')
    print(f'to predict {(pd.to_datetime(target_end) - pd.to_datetime(target_start)).days} days.')

    # transaction features per customer
    total_rev = features_df.groupby('customer_id')['total_sales'].sum().rename('total_revenue')
    recency = (features_df.groupby('customer_id')['date'].max() - features_df.groupby('customer_id')['date'].min()).apply(lambda x: x.days).rename('recency')
    # number of line items purchased (not distinct invoices)
    frequency = features_df.groupby('customer_id')['invoice_no'].count().rename('frequency')
    # customer "age": days from first purchase to the end of the feature window
    t = features_df.groupby('customer_id')['date'].min().apply(lambda x: (pd.to_datetime(feature_end) - x).days).rename('t')
    time_between = (t / frequency).rename('time_between')
    avg_basket_value = (total_rev / frequency).rename('avg_basket_value')
    avg_basket_size = (features_df.groupby('customer_id')['quantity'].sum() / frequency).rename('avg_basket_size')
    returns = features_df.loc[features_df['total_sales'] < 0, :].groupby('customer_id')['invoice_no'].count().rename('num_returns')
    hour = features_df.groupby('customer_id')['hour'].median().rename('purchase_hour_med')
    dow = features_df.groupby('customer_id')['dayofweek'].median().rename('purchase_dow_med')
    weekend = features_df.groupby('customer_id')['weekend'].mean().rename('purchase_weekend_prop')

    train_df = pd.DataFrame(index=df_rfm.index)
    train_df = train_df.join([total_rev, recency, frequency, t, time_between, avg_basket_value, avg_basket_size, returns, hour, dow, weekend])
    train_df = train_df.fillna(0)

    # target: revenue per customer in the target window
    target_df = df.loc[(df.date >= target_start) & (df.date <= target_end), :]
    target_rev = target_df.groupby(['customer_id'])['total_sales'].sum().rename('target_rev')
    train_df = train_df.join(target_rev).fillna(0)
    return train_df.iloc[:, :-1], train_df.iloc[:, -1]
# train-test split: features from a ~161-day window are used to predict revenue in the following ~89-day window
X_train, y_train = get_features(df, df_rfm, '2011-01-01', '2011-06-11', '2011-06-12', '2011-09-09')
X_test, y_test = get_features(df, df_rfm, '2011-04-02', '2011-09-10', '2011-09-11', '2011-12-09')
X_train.head()
Using df from 161 days to predict 89 days.
Using df from 161 days to predict 89 days.
customer_id | total_revenue | recency | frequency | t | time_between | avg_basket_value | avg_basket_size | num_returns | purchase_hour_med | purchase_dow_med | purchase_weekend_prop |
---|---|---|---|---|---|---|---|---|---|---|---|
12747.0 | 1385.13 | 125.0 | 40.0 | 142.0 | 3.550000 | 34.628250 | 12.375000 | 0.0 | 14.0 | 2.0 | 0.000000 |
12748.0 | 5790.12 | 156.0 | 752.0 | 157.0 | 0.208777 | 7.699628 | 4.414894 | 0.0 | 14.0 | 4.0 | 0.276596 |
12749.0 | 859.10 | 0.0 | 43.0 | 32.0 | 0.744186 | 19.979070 | 6.093023 | 0.0 | 15.0 | 1.0 | 0.000000 |
12823.0 | 994.50 | 42.0 | 3.0 | 115.0 | 38.333333 | 331.500000 | 43.333333 | 0.0 | 12.0 | 2.0 | 0.000000 |
12826.0 | 542.10 | 8.0 | 40.0 | 143.0 | 3.575000 | 13.552500 | 13.325000 | 0.0 | 12.0 | 3.0 | 0.000000 |
y_train.head()
customer_id
12747.0     678.00
12748.0    4089.50
12749.0    1896.13
12823.0     229.50
12826.0     121.52
Name: target_rev, dtype: float64
def build_model():
    model = keras.Sequential([
        # first layer
        layers.Dense(128, activation='relu', input_shape=[len(X_train.columns), ]),
        layers.Dropout(0.2),
        # hidden layer
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        # hidden layer
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.2),
        # output layer
        layers.Dense(1)
    ])
    optimizer = tf.keras.optimizers.Adam(0.001)
    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model
# callbacks
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
EPOCHS = 100
# build the model
model = build_model()
# fit the model
history = model.fit(X_train, y_train,
epochs=EPOCHS, validation_split = 0.2, verbose=0,
callbacks=[early_stop, tfdocs.modeling.EpochDots()])
# results
df_history = pd.DataFrame(history.history)
print(df_history.shape)
display(df_history.head())
df_history[['loss','val_loss']].plot.line();
# prediction
dnn_preds = model.predict(X_test).ravel()
# model evaluation
evaluate(y_test, dnn_preds)
Epoch: 0, loss:1973503.1250, mae:529.3196, mse:1973503.1250, val_loss:5649310.5000, val_mae:659.9467, val_mse:5649310.5000
.......................
(23, 6)
 | loss | mae | mse | val_loss | val_mae | val_mse |
---|---|---|---|---|---|---|
0 | 1973503.125 | 529.319580 | 1973503.125 | 5649310.5 | 659.946716 | 5649310.5 |
1 | 1582560.875 | 490.506653 | 1582560.875 | 5247932.0 | 644.530151 | 5247932.0 |
2 | 1408744.125 | 472.717102 | 1408744.125 | 4885743.0 | 634.136780 | 4885743.0 |
3 | 1535116.125 | 498.226593 | 1535116.125 | 6669162.5 | 696.565186 | 6669162.5 |
4 | 1429936.125 | 491.151611 | 1429936.125 | 5466815.0 | 658.880249 | 5466815.0 |
MAE : 769.21
MSE : 13,356,743
R2-score: 0.41
import xgboost as xgb
# xgb.XGBRegressor?
model = xgb.XGBRegressor(random_state=100,max_depth=3,n_estimators=1000)
model.fit(X_train,y_train)
y_preds = model.predict(X_test).ravel()
evaluate(y_test,y_preds)
MAE : 923.95
MSE : 12,439,437
R2-score: 0.45
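As a quick follow-up, not part of the original analysis, the fitted XGBoost model's importance scores can show which of the engineered features it relies on most:
# Rank the engineered features by the tree model's importance scores.
importances = pd.Series(model.feature_importances_, index=X_train.columns)
importances.sort_values().plot.barh()
plt.show()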
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 17 secs