This is a transactional data set containing all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based, registered, non-store online retailer. The company mainly sells unique all-occasion gifts. Many of its customers are wholesalers.
Feature | Description |
---|---|
InvoiceNo | Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation. |
StockCode | Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product. |
Description | Product (item) name. Nominal. |
Quantity | The quantities of each product (item) per transaction. Numeric. |
InvoiceDate | Invoice date and time. Numeric, the day and time when each transaction was generated. |
UnitPrice | Unit price. Numeric, Product price per unit in sterling. |
CustomerID | Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer. |
Country | Country name. Nominal, the name of the country where each customer resides. |
import numpy as np
import pandas as pd
import os,sys,time
import re
time_start_notebook = time.time()
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# settings
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
# special
import lifetimes
%matplotlib inline
%load_ext watermark
%watermark -iv
autopep8 : 1.5.7
re : 2.2.1
json : 2.0.9
matplotlib: 3.3.4
lifetimes : 0.11.3
seaborn : 0.11.1
pandas : 1.2.5
numpy : 1.21.1
sys : 3.9.5 (default, May 18 2021, 12:31:01) [Clang 10.0.0 ]
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bp/")
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bp/bhishan")
from bhishan import bp
ifile = "data/processed/online_retail.parquet.gzip"
df = pd.read_parquet(ifile)
print(df.shape)
df.head(2).append(df.tail(2))
(354321, 6)
customer_id | invoice_no | invoice_date | quantity | unit_price | total_sales | |
---|---|---|---|---|---|---|
0 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 2.55 | 15.30 |
1 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 | 20.34 |
541892 | 13113.0 | 581586 | 2011-12-09 12:49:00 | 24 | 8.95 | 214.80 |
541893 | 13113.0 | 581586 | 2011-12-09 12:49:00 | 10 | 7.08 | 70.80 |
The general formula for calculating CLV is:
CLV = ((Average Sales × Purchase Frequency) / Churn) × Profit Margin
where,
Average Sales = Total sales / Total number of orders
Purchase Frequency = Total number of orders / Total unique customers
Retention Rate = Number of customers with more than one order / Total unique customers
Churn = 1 - Retention Rate
Profit Margin = Based on business context (take 5% if not given)
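To make the formula concrete, here is a minimal sketch with made-up numbers (all figures below are hypothetical, not taken from this dataset):
# hypothetical illustration of the aggregate CLV formula
total_sales = 100_000       # total revenue over the period
n_orders = 2_000            # total number of orders
n_customers = 500           # total unique customers
n_repeat_customers = 300    # customers with more than one order

avg_sales = total_sales / n_orders                   # 50.0
purchase_freq = n_orders / n_customers               # 4.0
retention_rate = n_repeat_customers / n_customers    # 0.6
churn = 1 - retention_rate                           # 0.4
profit_margin = 0.05
CLV = avg_sales * purchase_freq / churn * profit_margin
print(f"CLV = ${CLV:,.2f}")  # CLV = $25.00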
df_cust = df.groupby('customer_id').agg(
    age = ('invoice_date', lambda x: (x.max()-x.min()).days), # days between first and last purchase
    frequency = ('invoice_no', len), # counts line items; pd.Series.nunique would count distinct invoices
    total_sales = ('total_sales', 'sum')
)
df_cust.head()
age | frequency | total_sales | |
---|---|---|---|
customer_id | |||
12346.0 | 0 | 1 | 77183.60 |
12747.0 | 366 | 103 | 4196.01 |
12748.0 | 372 | 4595 | 33719.73 |
12749.0 | 209 | 199 | 4090.88 |
12820.0 | 323 | 59 | 942.34 |
avg_sales = df_cust['total_sales'].mean()
purchase_freq = df_cust['frequency'].mean()
retention_rate = df_cust.query("frequency>1").shape[0] / df_cust.shape[0]
churn = 1 - retention_rate
profit_margin = 0.05 # choose yourself
CLV = avg_sales * purchase_freq / churn * profit_margin
print(f"Customer Lifetime Value CLV = ${CLV:,.0f}")
Customer Lifetime Value CLV = $471,851
"""
Observation:
This is aggregate model, it gives unrealistic CLV.
""";
df_cust['total_sales'].describe()
count      3920.000000
mean       1864.385601
std        7482.817477
min           3.750000
25%         300.280000
50%         652.280000
75%        1576.585000
max      259657.300000
Name: total_sales, dtype: float64
"""
Observation:
1. The CLV using Aggregate method gives value 797k,
But, 75% of customers have total sales less than 1.5k.
We need to use other methods than Aggregate model to estimate the
customer lifetime value.
In the next section, we will use cohort method.
""";
df.head(2)
customer_id | invoice_no | invoice_date | quantity | unit_price | total_sales | |
---|---|---|---|---|---|---|
0 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 2.55 | 15.30 |
1 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 | 20.34 |
df_cust = df.groupby('customer_id').agg(
    start_month = ('invoice_date', lambda x: x.min().month), # month of first purchase (cohort)
    frequency = ('invoice_no', len),
    total_sales = ('total_sales', 'sum')
)
df_cust.head()
start_month | frequency | total_sales | |
---|---|---|---|
customer_id | |||
12346.0 | 1 | 1 | 77183.60 |
12747.0 | 12 | 103 | 4196.01 |
12748.0 | 12 | 4595 | 33719.73 |
12749.0 | 5 | 199 | 4090.88 |
12820.0 | 1 | 59 | 942.34 |
months = ['Jan', 'Feb', 'March', 'Apr', 'May',
          'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
clv_monthly = []
for i in range(1, 13):
    dfc = df_cust.query("start_month == @i")
    avg_sales = dfc['total_sales'].mean()
    pur_freq = dfc['frequency'].mean()
    ret_rate = dfc.query("frequency > 1").shape[0] / dfc.shape[0]
    churn = 1 - ret_rate
    CLV = avg_sales * pur_freq / churn * profit_margin
    clv_monthly.append(CLV)
df_clv_monthly = pd.DataFrame({'Month': months,
                               'CLV': clv_monthly})
df_clv_monthly.style.background_gradient()
Month | CLV | |
---|---|---|
0 | Jan | 807795.771327 |
1 | Feb | 272699.905100 |
2 | March | 432619.978714 |
3 | Apr | 239316.006912 |
4 | May | 133384.514952 |
5 | Jun | 286107.889868 |
6 | Jul | 53007.355670 |
7 | Aug | 134038.232234 |
8 | Sep | 165834.460286 |
9 | Oct | 86161.618740 |
10 | Nov | 42638.784234 |
11 | Dec | 2686916.911152 |
df_clv_monthly.plot.bar(x='Month',y='CLV')
<AxesSubplot:xlabel='Month'>
"""
Observation:
1. We can see CLV is very in December,
this might be becasue of Christmas.
""";
Install the lifetimes module: conda install -y -n fin -c conda-forge lifetimes
BG/NBD stands for Beta Geometric/Negative Binomial Distribution.
The BG/NBD model is one of the most widely used CLV models, alongside the Pareto/NBD model.
These methods try to predict the future transactions of each customer.
To estimate the monetary value of those transactions we use the Gamma-Gamma model.
Assumptions of the BG/NBD model:
1. While active, the number of transactions made by a customer follows a Poisson process with transaction rate lambda.
2. Heterogeneity in lambda across customers follows a gamma distribution.
3. After any transaction, a customer becomes inactive with probability p.
4. Heterogeneity in p across customers follows a beta distribution.
5. The transaction rate lambda and the dropout probability p vary independently across customers.
RFM Analysis:
frequency - the number of repeat purchases (purchases beyond the first one)
recency - the time between a customer's first and last transactions
T - the time between a customer's first purchase and the end of the observation period
monetary_value - the mean value of a given customer's repeat purchases (the first purchase is excluded)
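As a minimal sketch of how lifetimes derives these columns (the toy transaction log below is hypothetical):
# toy transaction log (hypothetical data) to illustrate the RFM summary
toy = pd.DataFrame({
    'customer_id': [1, 1, 1, 2],
    'invoice_date': pd.to_datetime(['2011-01-01', '2011-02-01',
                                    '2011-03-01', '2011-06-01']),
    'total_sales': [10.0, 20.0, 30.0, 40.0]})
lifetimes.utils.summary_data_from_transaction_data(
    toy, 'customer_id', 'invoice_date', 'total_sales',
    observation_period_end='2011-06-30')
# customer 1: frequency=2 (repeat purchases), recency=59 days,
#             T=180 days, monetary_value=25.0 (mean of the two
#             repeat purchases; the first purchase is excluded)
# customer 2: frequency=0 (one-time buyer), monetary_value=0.0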
import lifetimes
bp.show_methods(lifetimes)
0 | 1 | 2 | |
---|---|---|---|
0 | BaseFitter | ModifiedBetaGeoFitter | generate_data |
1 | BetaGeoBetaBinomFitter | ParetoNBDFitter | utils |
2 | BetaGeoFitter | fitters | version |
3 | GammaGammaFitter |
bp.show_methods(lifetimes.utils)
0 | 1 | 2 | |
---|---|---|---|
0 | ConvergenceError | dill | expected_cumulative_transactions |
1 | calculate_alive_path | division | summary_data_from_transaction_data |
2 | calibration_and_holdout_data |
print(df.shape)
df.head(2)
(354321, 6)
customer_id | invoice_no | invoice_date | quantity | unit_price | total_sales | |
---|---|---|---|---|---|---|
0 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 2.55 | 15.30 |
1 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 | 20.34 |
# make sure there are no NaNs in customer id
df['customer_id'].isna().sum()
0
df.customer_id.dtype
dtype('O')
df.query("customer_id == 'nan'").shape
(0, 6)
df_rfm = lifetimes.utils.summary_data_from_transaction_data(
df,'customer_id','invoice_date','total_sales')
df_rfm = df_rfm.reset_index()
print(df_rfm.shape)
df_rfm.head(2)
(3920, 5)
customer_id | frequency | recency | T | monetary_value | |
---|---|---|---|---|---|
0 | 12346.0 | 0.0 | 0.0 | 325.0 | 0.000 |
1 | 12747.0 | 10.0 | 367.0 | 369.0 | 383.745 |
"""
Observation:
Here the value of 0 in frequency and recency means that,
these are one time buyers.
""";
df_rfm['frequency'].hist(bins=50)
<AxesSubplot:>
df_rfm.describe()
frequency | recency | T | monetary_value | |
---|---|---|---|---|
count | 3920.000000 | 3920.000000 | 3920.000000 | 3920.000000 |
mean | 2.850000 | 131.343622 | 223.085714 | 293.099532 |
std | 5.713358 | 132.443948 | 118.037855 | 2734.951972 |
min | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
25% | 0.000000 | 0.000000 | 112.000000 | 0.000000 |
50% | 1.000000 | 94.000000 | 249.000000 | 173.625000 |
75% | 3.000000 | 252.000000 | 327.000000 | 347.946042 |
max | 112.000000 | 373.000000 | 373.000000 | 168469.600000 |
# lifetimes.BetaGeoFitter?
bgf = lifetimes.BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(df_rfm['frequency'],df_rfm['recency'],df_rfm['T']);
bgf.summary
coef | se(coef) | lower 95% bound | upper 95% bound | |
---|---|---|---|---|
r | 0.833433 | 0.028549 | 0.777477 | 0.889389 |
alpha | 69.637792 | 2.788365 | 64.172597 | 75.102988 |
a | 0.005493 | 0.013933 | -0.021816 | 0.032802 |
b | 7.955385 | 23.159843 | -37.437907 | 53.348677 |
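The fitted parameters are population-level summaries: purchase rates are Gamma(r, alpha) distributed, so the mean purchase rate is r/alpha per day, and dropout probabilities are Beta(a, b) distributed with mean a/(a+b). A rough reading of the fit above, using the summary table the library exposes:
# interpret the fitted BG/NBD parameters via the means of the mixing distributions
coef = bgf.summary['coef']
mean_purchase_rate = coef['r'] / coef['alpha']          # ~0.012 transactions per day while alive
mean_dropout_prob = coef['a'] / (coef['a'] + coef['b']) # ~0.0007 P(dropout) after each purchase
print(f"mean purchase rate: {mean_purchase_rate:.4f} per day")
print(f"mean dropout prob : {mean_dropout_prob:.4f} after each purchase")
Note that the wide confidence intervals on a and b above (both include negative values) suggest the dropout parameters are only weakly identified in this data.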
bp.show_methods(bgf,2)
0 | 1 | |
---|---|---|
0 | conditional_expected_number_of_purchases_up_to_time | params_ |
1 | conditional_probability_alive | penalizer_coef |
2 | conditional_probability_alive_matrix | predict |
3 | confidence_intervals_ | probability_of_n_purchases_up_to_time |
4 | data | save_model |
5 | expected_number_of_purchases_up_to_time | standard_errors_ |
6 | fit | summary |
7 | generate_new_data | variance_matrix_ |
8 | load_model |
import lifetimes.plotting
bp.show_methods(lifetimes.plotting,2)
0 | 1 | |
---|---|---|
0 | calculate_alive_path | plot_frequency_recency_matrix |
1 | coalesce | plot_history_alive |
2 | expected_cumulative_transactions | plot_incremental_transactions |
3 | forceAspect | plot_period_transactions |
4 | plot_calibration_purchases_vs_holdout_purchases | plot_probability_alive_matrix |
5 | plot_cumulative_transactions | plot_transaction_rate_heterogeneity |
6 | plot_dropout_rate_heterogeneity | stats |
7 | plot_expected_repeat_purchases |
df_rfm['prob_alive'] = bgf.conditional_probability_alive(
df_rfm['frequency'],
df_rfm['recency'],
df_rfm['T']
)
df_rfm.head()
customer_id | frequency | recency | T | monetary_value | prob_alive | |
---|---|---|---|---|---|---|
0 | 12346.0 | 0.0 | 0.0 | 325.0 | 0.000000 | 1.000000 |
1 | 12747.0 | 10.0 | 367.0 | 369.0 | 383.745000 | 0.999660 |
2 | 12748.0 | 112.0 | 373.0 | 373.0 | 301.024821 | 0.999954 |
3 | 12749.0 | 3.0 | 210.0 | 213.0 | 1077.260000 | 0.999426 |
4 | 12820.0 | 3.0 | 323.0 | 326.0 | 257.293333 | 0.999432 |
"""
Note:
The probabilty of being alive is calculated based on
the recency and frequency of a customer.
So, If a customer has bought multiple times (frequency)
and the time between first & last transaction is high (recency),
then their probability of being alive is high.
""";
from lifetimes.plotting import plot_probability_alive_matrix
fig = plt.figure(figsize=(12,8))
plot_probability_alive_matrix(bgf);
t = 30 # next n days
df_rfm['exp_num_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
t,
df_rfm['frequency'],
df_rfm['recency'],
df_rfm['T'])
df1 = df_rfm.sort_values(by='exp_num_purchases', ascending=False).head()
df1
customer_id | frequency | recency | T | monetary_value | prob_alive | exp_num_purchases | |
---|---|---|---|---|---|---|---|
2 | 12748.0 | 112.0 | 373.0 | 373.0 | 301.024821 | 0.999954 | 7.645667 |
3593 | 17841.0 | 111.0 | 372.0 | 373.0 | 364.452162 | 0.999940 | 7.577803 |
1771 | 15311.0 | 89.0 | 373.0 | 373.0 | 677.729438 | 0.999943 | 6.087118 |
1267 | 14606.0 | 88.0 | 372.0 | 373.0 | 135.890114 | 0.999929 | 6.019278 |
110 | 12971.0 | 70.0 | 369.0 | 372.0 | 159.211286 | 0.999884 | 4.810275 |
# look at topmost customer
ser = df1.iloc[0]
ser
customer_id             12748.0
frequency                 112.0
recency                   373.0
T                         373.0
monetary_value       301.024821
prob_alive             0.999954
exp_num_purchases      7.645667
Name: 2, dtype: object
print(f"""In {ser['recency']:.0f} days,
he purchased {ser['frequency']:.0f} times.
Average per day is: {ser['frequency']:.0f}/{ser['recency']:.0f} = {ser['frequency']/ser['recency']:.4f} times.
In {t} days : {t} * {ser['frequency']/ser['recency']:.4f} = {t * ser['frequency']/ser['recency']:.4f}
Our prediction is: {ser['exp_num_purchases']:.4f}
Difference = {t * ser['frequency']/ser['recency']:.4f} - {ser['exp_num_purchases']:.4f} = {(t * ser['frequency']/ser['recency']) - (ser['exp_num_purchases']):.4f}
""")
In 373 days, he purchased 112 times. Average per day is: 112/373 = 0.3003 times. In 30 days : 30 * 0.3003 = 9.0080 Our prediction is: 7.6457 Difference = 9.0080 - 7.6457 = 1.3624
Assumptions of the Gamma-Gamma model:
1. The monetary value of a customer's transactions varies randomly around their average transaction value.
2. Average transaction values vary across customers but are stable over time for any individual customer.
3. The distribution of average transaction values across customers is independent of the transaction process (frequency).
NOTE: We consider only customers who made repeat purchases with the business, i.e., frequency > 0. If frequency is 0, the customer is a one-time buyer and is considered already dead.
df_rfm.head(2)
customer_id | frequency | recency | T | monetary_value | prob_alive | exp_num_purchases | |
---|---|---|---|---|---|---|---|
0 | 12346.0 | 0.0 | 0.0 | 325.0 | 0.000 | 1.00000 | 0.063354 |
1 | 12747.0 | 10.0 | 367.0 | 369.0 | 383.745 | 0.99966 | 0.740595 |
df_rfm.query("frequency==0").shape
(1398, 7)
df_rfm2 = df_rfm.query("frequency >0")
df_rfm2[['frequency', 'monetary_value']].corr()
frequency | monetary_value | |
---|---|---|
frequency | 1.00000 | 0.00954 |
monetary_value | 0.00954 | 1.00000 |
"""
Observation:
The correlation seems very weak. Hence, we can conclude that,
the assumption is satisfied and we can fit the model to our data.
""";
ggf = lifetimes.GammaGammaFitter(penalizer_coef=0.001)
ggf.fit(df_rfm2['frequency'],
df_rfm2['monetary_value'])
ggf.summary
coef | se(coef) | lower 95% bound | upper 95% bound | |
---|---|---|---|---|
p | 11.133452 | 0.282670 | 10.579419 | 11.687486 |
q | 0.860041 | 0.021344 | 0.818207 | 0.901875 |
v | 11.330566 | 0.296364 | 10.749692 | 11.911440 |
df_rfm2.query("monetary_value == 0").shape
(0, 7)
# Gamma-Gamma assumes strictly positive monetary values; filter defensively
df_rfm2 = df_rfm2.query("monetary_value != 0")
bp.show_methods(ggf)
0 | 1 | 2 | |
---|---|---|---|
0 | conditional_expected_average_profit | fit | save_model |
1 | confidence_intervals_ | load_model | standard_errors_ |
2 | customer_lifetime_value | params_ | summary |
3 | data | penalizer_coef | variance_matrix_ |
# Here we get expected average sales, NOT expected average profit (we will multiply by profit_margin later).
df_rfm2['exp_avg_sales'] = ggf.conditional_expected_average_profit(
df_rfm2['frequency'],
df_rfm2['monetary_value'])
df_rfm2.head()
customer_id | frequency | recency | T | monetary_value | prob_alive | exp_num_purchases | exp_avg_sales | |
---|---|---|---|---|---|---|---|---|
1 | 12747.0 | 10.0 | 367.0 | 369.0 | 383.745000 | 0.999660 | 0.740595 | 385.362498 |
2 | 12748.0 | 112.0 | 373.0 | 373.0 | 301.024821 | 0.999954 | 7.645667 | 301.159790 |
3 | 12749.0 | 3.0 | 210.0 | 213.0 | 1077.260000 | 0.999426 | 0.406607 | 1085.585845 |
4 | 12820.0 | 3.0 | 323.0 | 326.0 | 257.293333 | 0.999432 | 0.290486 | 262.168769 |
6 | 12822.0 | 1.0 | 17.0 | 87.0 | 257.980000 | 0.997959 | 0.350374 | 272.739188 |
print(f"Expected Average Sales: {df_rfm2['exp_avg_sales'].mean()}")
print(f"Actual Average Sales: {df_rfm2['monetary_value'].mean()}")
Expected Average Sales: 464.5104833021637
Actual Average Sales: 455.5710405734213
# Predicting Customer Lifetime Value for the next 30 days
df_rfm2['predicted_clv'] = ggf.customer_lifetime_value(
    bgf,
    df_rfm2['frequency'],
    df_rfm2['recency'],
    df_rfm2['T'],
    df_rfm2['monetary_value'],
    time=1,             # lifetime in months
    freq='D',           # frequency in which the data is present (T is in days)
    discount_rate=0.01  # monthly discount rate (about 12.7% per year, since 1.01**12 ≈ 1.127)
)
df_rfm2.head()
customer_id | frequency | recency | T | monetary_value | prob_alive | exp_num_purchases | exp_avg_sales | predicted_clv | |
---|---|---|---|---|---|---|---|---|---|
1 | 12747.0 | 10.0 | 367.0 | 369.0 | 383.745000 | 0.999660 | 0.740595 | 385.362498 | 282.571679 |
2 | 12748.0 | 112.0 | 373.0 | 373.0 | 301.024821 | 0.999954 | 7.645667 | 301.159790 | 2279.769847 |
3 | 12749.0 | 3.0 | 210.0 | 213.0 | 1077.260000 | 0.999426 | 0.406607 | 1085.585845 | 437.036033 |
4 | 12820.0 | 3.0 | 323.0 | 326.0 | 257.293333 | 0.999432 | 0.290486 | 262.168769 | 75.402351 |
6 | 12822.0 | 1.0 | 17.0 | 87.0 | 257.980000 | 0.997959 | 0.350374 | 272.739188 | 94.614601 |
df_rfm2['manual_predicted_clv'] = df_rfm2['exp_num_purchases'] * df_rfm2['exp_avg_sales']
df_rfm2.head()
customer_id | frequency | recency | T | monetary_value | prob_alive | exp_num_purchases | exp_avg_sales | predicted_clv | manual_predicted_clv | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 12747.0 | 10.0 | 367.0 | 369.0 | 383.745000 | 0.999660 | 0.740595 | 385.362498 | 282.571679 | 285.397396 |
2 | 12748.0 | 112.0 | 373.0 | 373.0 | 301.024821 | 0.999954 | 7.645667 | 301.159790 | 2279.769847 | 2302.567546 |
3 | 12749.0 | 3.0 | 210.0 | 213.0 | 1077.260000 | 0.999426 | 0.406607 | 1085.585845 | 437.036033 | 441.406393 |
4 | 12820.0 | 3.0 | 323.0 | 326.0 | 257.293333 | 0.999432 | 0.290486 | 262.168769 | 75.402351 | 76.156375 |
6 | 12822.0 | 1.0 | 17.0 | 87.0 | 257.980000 | 0.997959 | 0.350374 | 272.739188 | 94.614601 | 95.560747 |
# how different are the sales predictions?
(df_rfm2['predicted_clv']-
df_rfm2['manual_predicted_clv']).describe()
count    2522.000000
mean       -2.255118
std         9.024262
min      -338.071944
25%        -2.114399
50%        -1.102105
75%        -0.544666
max        -0.027260
dtype: float64
# how different are the sales predictions?
(df_rfm2['predicted_clv']-
df_rfm2['manual_predicted_clv']).hist()
<AxesSubplot:>
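The systematic gap comes from discounting: with time=1 month and discount_rate=0.01, lifetimes should discount the month's expected sales once, so predicted_clv ought to equal the manual product divided by 1.01. A quick check under that assumption:
# if the only difference is one month of discounting at 1%,
# this residual should be ~0 for every customer
resid = df_rfm2['manual_predicted_clv'] / 1.01 - df_rfm2['predicted_clv']
print(resid.abs().max())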
"""
Note:
Here, clv is for the sales NOT for the profit.
To get clv for profit we need to mulitply by profit_margin.
""";
# CLV in terms of profit (profit margin is 5%)
profit_margin = 0.05
df_rfm2['CLV'] = df_rfm2['predicted_clv'] * profit_margin
df_rfm2.head()
customer_id | frequency | recency | T | monetary_value | prob_alive | exp_num_purchases | exp_avg_sales | predicted_clv | manual_predicted_clv | CLV | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 12747.0 | 10.0 | 367.0 | 369.0 | 383.745000 | 0.999660 | 0.740595 | 385.362498 | 282.571679 | 285.397396 | 14.128584 |
2 | 12748.0 | 112.0 | 373.0 | 373.0 | 301.024821 | 0.999954 | 7.645667 | 301.159790 | 2279.769847 | 2302.567546 | 113.988492 |
3 | 12749.0 | 3.0 | 210.0 | 213.0 | 1077.260000 | 0.999426 | 0.406607 | 1085.585845 | 437.036033 | 441.406393 | 21.851802 |
4 | 12820.0 | 3.0 | 323.0 | 326.0 | 257.293333 | 0.999432 | 0.290486 | 262.168769 | 75.402351 | 76.156375 | 3.770118 |
6 | 12822.0 | 1.0 | 17.0 | 87.0 | 257.980000 | 0.997959 | 0.350374 | 272.739188 | 94.614601 | 95.560747 | 4.730730 |
time_taken = time.time() - time_start_notebook
h, rem = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(rem, 60)))
Time taken to run whole notebook: 0 hr 0 min 6 secs