This is a transactional data set containing all the transactions that occurred between 01/12/2010 and 09/12/2011 for a UK-based, registered, non-store online retailer. The company mainly sells unique all-occasion gifts, and many of its customers are wholesalers.
Data Description
Feature | Description |
---|---|
InvoiceNo | Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation. |
StockCode | Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product. |
Description | Product (item) name. Nominal. |
Quantity | The quantities of each product (item) per transaction. Numeric. |
InvoiceDate | Invoice date and time. Numeric, the day and time when each transaction was generated. |
UnitPrice | Unit price. Numeric, product price per unit in sterling. |
CustomerID | Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer. |
Country | Country name. Nominal, the name of the country where each customer resides. |
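The notebook below loads a pre-processed parquet file with lower-case column names and a derived total_sales column (quantity × unit_price). The exact preprocessing is not shown here; the following is only a minimal sketch of how such a file could be produced, with the raw file location and the row filtering treated as assumptions.
# Hedged preprocessing sketch -- raw file path and filtering are assumptions, not the notebook's actual pipeline.
import pandas as pd
raw = pd.read_excel("data/raw/Online Retail.xlsx")   # hypothetical location of the raw UCI file
raw = raw.dropna(subset=["CustomerID"])              # keep rows with a known customer (assumed filter)
raw = raw.rename(columns={
    "CustomerID": "customer_id",
    "InvoiceNo": "invoice_no",
    "InvoiceDate": "invoice_date",
    "Quantity": "quantity",
    "UnitPrice": "unit_price",
})
raw["total_sales"] = raw["quantity"] * raw["unit_price"]   # line-item revenue
cols = ["customer_id", "invoice_no", "invoice_date", "quantity", "unit_price", "total_sales"]
raw[cols].to_parquet("data/processed/online_retail.parquet.gzip", compression="gzip")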
import numpy as np
import pandas as pd
import os,sys,time
import re
from datetime import datetime
time_start_notebook = time.time()
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# settings
SEED = 100
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
# Statistical LTV
import lifetimes
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import calibration_and_holdout_data, summary_data_from_transaction_data
# ML Approach to LTV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots
import xgboost
# Evaluation
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
%matplotlib inline
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
numpy          : 1.21.1
lifetimes      : 0.11.3
xgboost        : 1.4.2
sklearn        : 0.24.2
pandas         : 1.2.5
autopep8       : 1.5.7
tensorflow_docs: 0.0.0ab165fda010d80d82548b3797c1eaaf5803a00e8-
sys            : 3.9.5 (default, May 18 2021, 12:31:01) [Clang 10.0.0]
tensorflow     : 2.5.0
re             : 2.2.1
json           : 2.0.9
seaborn        : 0.11.1
bhishan        : 0.6
matplotlib     : 3.3.4
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bp/")
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bp/bhishan")
from bhishan import bp
ifile = "data/processed/online_retail.parquet.gzip"
df = pd.read_parquet(ifile)
print(df.shape)
df.head(2).append(df.tail(2))
(354321, 6)
 | customer_id | invoice_no | invoice_date | quantity | unit_price | total_sales |
---|---|---|---|---|---|---|
0 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 2.55 | 15.30 |
1 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 | 20.34 |
541892 | 13113.0 | 581586 | 2011-12-09 12:49:00 | 24 | 8.95 | 214.80 |
541893 | 13113.0 | 581586 | 2011-12-09 12:49:00 | 10 | 7.08 | 70.80 |
# derive calendar features from the invoice timestamp
df['date'] = pd.to_datetime(df.invoice_date.dt.date)
df['time'] = df.invoice_date.dt.time
df['hour'] = df['time'].apply(lambda x: x.hour)
df['weekend'] = df['date'].apply(lambda x: x.weekday() in [5, 6])  # Saturday or Sunday
df['dayofweek'] = df['date'].apply(lambda x: x.dayofweek)          # Monday=0 ... Sunday=6
df.groupby('date')['quantity'].sum().plot()
[Figure: total quantity sold per day over the full date range]
df_lookup = df[['customer_id', 'invoice_no', 'date']]
df_lookup = df_lookup.drop_duplicates()
df_lookup = df_lookup.set_index('invoice_no')
df_lookup.head()
invoice_no | customer_id | date |
---|---|---|
536365 | 17850.0 | 2010-12-01 |
536366 | 17850.0 | 2010-12-01 |
536367 | 13047.0 | 2010-12-01 |
536368 | 13047.0 | 2010-12-01 |
536369 | 13047.0 | 2010-12-01 |
df_invoice_sales = df.groupby('invoice_no')[['total_sales']].sum()
df_invoice_sales.head()
invoice_no | total_sales |
---|---|
536365 | 139.12 |
536366 | 22.20 |
536367 | 278.73 |
536368 | 70.05 |
536369 | 17.85 |
df_tran = df_lookup.join(df_invoice_sales)
df_tran.head()
invoice_no | customer_id | date | total_sales |
---|---|---|---|
536365 | 17850.0 | 2010-12-01 | 139.12 |
536366 | 17850.0 | 2010-12-01 | 22.20 |
536367 | 13047.0 | 2010-12-01 | 278.73 |
536368 | 13047.0 | 2010-12-01 | 70.05 |
536369 | 13047.0 | 2010-12-01 | 17.85 |
from lifetimes.utils import calibration_and_holdout_data
df_rfm = calibration_and_holdout_data(
df_tran,'customer_id','date',
calibration_period_end='2011-09-10',
monetary_value_col='total_sales'
)
# keep only customers with positive monetary value (required by the Gamma-Gamma model)
df_rfm = df_rfm.loc[df_rfm['monetary_value_cal'] > 0, :]
df_rfm.head(2)
customer_id | frequency_cal | recency_cal | T_cal | monetary_value_cal | frequency_holdout | monetary_value_holdout | duration_holdout |
---|---|---|---|---|---|---|---|
12747.0 | 7.0 | 260.0 | 279.0 | 344.405714 | 3.0 | 475.536667 | 90.0 |
12748.0 | 73.0 | 282.0 | 283.0 | 193.189041 | 39.0 | 254.701039 | 90.0 |
from lifetimes import BetaGeoFitter
bgf = BetaGeoFitter(penalizer_coef=0.1)
bgf.fit(df_rfm['frequency_cal'],
df_rfm['recency_cal'],
df_rfm['T_cal'])
bgf.summary
parameter | coef | se(coef) | lower 95% bound | upper 95% bound |
---|---|---|---|---|
r | 1.001046 | 0.028732 | 0.944731 | 1.057361 |
alpha | 53.749212 | 2.160589 | 49.514458 | 57.983966 |
a | 0.005329 | 0.003397 | -0.001329 | 0.011986 |
b | 0.075781 | 0.043000 | -0.008500 | 0.160062 |
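Before trusting the fitted parameters, it can help to compare predicted and actual purchase counts on the holdout period. This check is not part of the original notebook; a minimal sketch using the plotting helper that ships with lifetimes:
# Diagnostic plot: predicted vs. actual purchases in the holdout period, grouped by calibration frequency.
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
plot_calibration_purchases_vs_holdout_purchases(bgf, df_rfm)
plt.show()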
# To fit the Gamma-Gamma model, we first need to make sure that
# the monetary value and frequency are not correlated,
# as this is one of the basic assumptions of the model.
df_rfm[['monetary_value_cal', 'frequency_cal']].corr()
 | monetary_value_cal | frequency_cal |
---|---|---|
monetary_value_cal | 1.000000 | 0.078159 |
frequency_cal | 0.078159 | 1.000000 |
ggf = GammaGammaFitter(penalizer_coef = 0)
ggf.fit(df_rfm['frequency_cal'],
df_rfm['monetary_value_cal']);
ggf.summary
parameter | coef | se(coef) | lower 95% bound | upper 95% bound |
---|---|---|---|---|
p | 2.301668 | 0.150103 | 2.007466 | 2.595871 |
q | 3.716938 | 0.194718 | 3.335291 | 4.098585 |
v | 453.716003 | 49.860609 | 355.989209 | 551.442798 |
Prediction is done in three steps: the BG/NBD model predicts the expected number of purchases over the next t days, the Gamma-Gamma model predicts the expected average order value, and the two are multiplied to give expected sales per customer.
# Predict the expected number of purchases
t = 89  # length of the prediction window in days
pred_num_sales = bgf.predict(t,
df_rfm['frequency_cal'],
df_rfm['recency_cal'],
df_rfm['T_cal'])
pred_num_sales = pred_num_sales.fillna(0)
# Predict the average order value
pred_monetary = ggf.conditional_expected_average_profit(
df_rfm['frequency_cal'],
df_rfm['monetary_value_cal'])
# Putting it all together
pred_sales = pred_num_sales * pred_monetary
pred_sales.head(2)
customer_id
12747.0     747.715246
12748.0    3834.958257
dtype: float64
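lifetimes also wraps these steps in a single call that additionally discounts future revenue. A minimal sketch, assuming a 3-month horizon and an illustrative 1% monthly discount rate (neither value is used elsewhere in this notebook):
# Discounted CLV from the fitted BG/NBD and Gamma-Gamma models.
clv = ggf.customer_lifetime_value(
    bgf,                              # fitted transaction model
    df_rfm['frequency_cal'],
    df_rfm['recency_cal'],
    df_rfm['T_cal'],
    df_rfm['monetary_value_cal'],
    time=3,                           # months to project (assumption)
    discount_rate=0.01,               # monthly discount rate (assumption)
    freq='D',                         # the summary data is expressed in days
)
clv.head(2)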
from sklearn import metrics as skmetrics
df_rfm.head(2)
customer_id | frequency_cal | recency_cal | T_cal | monetary_value_cal | frequency_holdout | monetary_value_holdout | duration_holdout |
---|---|---|---|---|---|---|---|
12747.0 | 7.0 | 260.0 | 279.0 | 344.405714 | 3.0 | 475.536667 | 90.0 |
12748.0 | 73.0 | 282.0 | 283.0 | 193.189041 | 39.0 | 254.701039 | 90.0 |
actual_sales = df_rfm['monetary_value_holdout'] * df_rfm['frequency_holdout']
actual_sales.head(2)
customer_id
12747.0    1426.610000
12748.0    9933.340519
dtype: float64
def evaluate(actual_sales, pred_sales):
    mae = skmetrics.mean_absolute_error(actual_sales, pred_sales)
    mse = skmetrics.mean_squared_error(actual_sales, pred_sales)
    r2 = skmetrics.r2_score(actual_sales, pred_sales)
    print(f"MAE : {mae:,.2f}")
    print(f"MSE : {mse:,.0f}")
    print(f"R2-score: {r2:,.2f}")
    # residual plot: actual minus predicted for each customer
    plt.scatter(x=range(len(actual_sales)), y=actual_sales - pred_sales)
    plt.show()
evaluate(actual_sales, pred_sales)
MAE : 578.14
MSE : 4,072,183
R2-score: 0.45
df.head(2)
 | customer_id | invoice_no | invoice_date | quantity | unit_price | total_sales | date | time | hour | weekend | dayofweek |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 2.55 | 15.30 | 2010-12-01 | 08:26:00 | 8 | False | 2 |
1 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 | 20.34 | 2010-12-01 | 08:26:00 | 8 | False | 2 |
df_rfm.head(2)
customer_id | frequency_cal | recency_cal | T_cal | monetary_value_cal | frequency_holdout | monetary_value_holdout | duration_holdout |
---|---|---|---|---|---|---|---|
12747.0 | 7.0 | 260.0 | 279.0 | 344.405714 | 3.0 | 475.536667 | 90.0 |
12748.0 | 73.0 | 282.0 | 283.0 | 193.189041 | 39.0 | 254.701039 | 90.0 |
def get_features(df, df_rfm, feature_start, feature_end,
                 target_start, target_end):
    # period lengths
    features_df = df.loc[(df.date >= feature_start) & (df.date <= feature_end), :]
    print(f'Using df from {(pd.to_datetime(feature_end) - pd.to_datetime(feature_start)).days} days', end=' ')
    print(f'to predict {(pd.to_datetime(target_end) - pd.to_datetime(target_start)).days} days.')

    # transaction features per customer
    total_rev = features_df.groupby('customer_id')['total_sales'].sum().rename('total_revenue')
    recency = (features_df.groupby('customer_id')['date'].max() - features_df.groupby('customer_id')['date'].min()).apply(lambda x: x.days).rename('recency')
    # number of line items purchased (not distinct invoices)
    frequency = features_df.groupby('customer_id')['invoice_no'].count().rename('frequency')
    # customer "age": days from first purchase to the end of the feature window
    t = features_df.groupby('customer_id')['date'].min().apply(lambda x: (pd.to_datetime(feature_end) - x).days).rename('t')
    time_between = (t / frequency).rename('time_between')
    avg_basket_value = (total_rev / frequency).rename('avg_basket_value')
    avg_basket_size = (features_df.groupby('customer_id')['quantity'].sum() / frequency).rename('avg_basket_size')
    returns = features_df.loc[features_df['total_sales'] < 0, :].groupby('customer_id')['invoice_no'].count().rename('num_returns')
    hour = features_df.groupby('customer_id')['hour'].median().rename('purchase_hour_med')
    dow = features_df.groupby('customer_id')['dayofweek'].median().rename('purchase_dow_med')
    weekend = features_df.groupby('customer_id')['weekend'].mean().rename('purchase_weekend_prop')

    train_df = pd.DataFrame(index=df_rfm.index)
    train_df = train_df.join([total_rev, recency, frequency, t, time_between, avg_basket_value, avg_basket_size, returns, hour, dow, weekend])
    train_df = train_df.fillna(0)

    # target: revenue per customer in the target window
    target_df = df.loc[(df.date >= target_start) & (df.date <= target_end), :]
    target_rev = target_df.groupby(['customer_id'])['total_sales'].sum().rename('target_rev')
    train_df = train_df.join(target_rev).fillna(0)
    return train_df.iloc[:, :-1], train_df.iloc[:, -1]
# train-test split: features from a ~161-day window are used to predict revenue in the following ~89-day window
X_train, y_train = get_features(df, df_rfm, '2011-01-01', '2011-06-11', '2011-06-12', '2011-09-09')
X_test, y_test = get_features(df, df_rfm, '2011-04-02', '2011-09-10', '2011-09-11', '2011-12-09')
X_train.head()
Using df from 161 days to predict 89 days.
Using df from 161 days to predict 89 days.
customer_id | total_revenue | recency | frequency | t | time_between | avg_basket_value | avg_basket_size | num_returns | purchase_hour_med | purchase_dow_med | purchase_weekend_prop |
---|---|---|---|---|---|---|---|---|---|---|---|
12747.0 | 1385.13 | 125.0 | 40.0 | 142.0 | 3.550000 | 34.628250 | 12.375000 | 0.0 | 14.0 | 2.0 | 0.000000 |
12748.0 | 5790.12 | 156.0 | 752.0 | 157.0 | 0.208777 | 7.699628 | 4.414894 | 0.0 | 14.0 | 4.0 | 0.276596 |
12749.0 | 859.10 | 0.0 | 43.0 | 32.0 | 0.744186 | 19.979070 | 6.093023 | 0.0 | 15.0 | 1.0 | 0.000000 |
12823.0 | 994.50 | 42.0 | 3.0 | 115.0 | 38.333333 | 331.500000 | 43.333333 | 0.0 | 12.0 | 2.0 | 0.000000 |
12826.0 | 542.10 | 8.0 | 40.0 | 143.0 | 3.575000 | 13.552500 | 13.325000 | 0.0 | 12.0 | 3.0 | 0.000000 |
y_train.head()
customer_id
12747.0     678.00
12748.0    4089.50
12749.0    1896.13
12823.0     229.50
12826.0     121.52
Name: target_rev, dtype: float64
def build_model():
    model = keras.Sequential([
        # first layer
        layers.Dense(128, activation='relu', input_shape=[len(X_train.columns), ]),
        layers.Dropout(0.2),
        # hidden layer
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        # hidden layer
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.2),
        # output layer
        layers.Dense(1)
    ])
    optimizer = tf.keras.optimizers.Adam(0.001)
    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model
# callbacks
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
EPOCHS = 100
# build the model
model = build_model()
# fit the model
history = model.fit(X_train, y_train,
epochs=EPOCHS, validation_split = 0.2, verbose=0,
callbacks=[early_stop, tfdocs.modeling.EpochDots()])
# results
df_history = pd.DataFrame(history.history)
print(df_history.shape)
display(df_history.head())
df_history[['loss','val_loss']].plot.line();
# prediction
dnn_preds = model.predict(X_test).ravel()
# model evaluation
evaluate(y_test, dnn_preds)
Epoch: 0, loss:1973503.1250, mae:529.3196, mse:1973503.1250, val_loss:5649310.5000, val_mae:659.9467, val_mse:5649310.5000
.......................
(23, 6)
 | loss | mae | mse | val_loss | val_mae | val_mse |
---|---|---|---|---|---|---|
0 | 1973503.125 | 529.319580 | 1973503.125 | 5649310.5 | 659.946716 | 5649310.5 |
1 | 1582560.875 | 490.506653 | 1582560.875 | 5247932.0 | 644.530151 | 5247932.0 |
2 | 1408744.125 | 472.717102 | 1408744.125 | 4885743.0 | 634.136780 | 4885743.0 |
3 | 1535116.125 | 498.226593 | 1535116.125 | 6669162.5 | 696.565186 | 6669162.5 |
4 | 1429936.125 | 491.151611 | 1429936.125 | 5466815.0 | 658.880249 | 5466815.0 |
MAE : 769.21
MSE : 13,356,743
R2-score: 0.41
import xgboost as xgb
# xgb.XGBRegressor?
model = xgb.XGBRegressor(random_state=100,max_depth=3,n_estimators=1000)
model.fit(X_train,y_train)
y_preds = model.predict(X_test).ravel()
evaluate(y_test,y_preds)
MAE : 923.95
MSE : 12,439,437
R2-score: 0.45
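As a quick follow-up, not part of the original analysis, the fitted XGBoost model's importance scores can show which of the engineered features it relies on most:
# Rank the engineered features by the tree model's importance scores.
importances = pd.Series(model.feature_importances_, index=X_train.columns)
importances.sort_values().plot.barh()
plt.show()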
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 17 secs