import time
time_start_notebook = time.time()

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})

SEED = 100
np.random.seed(SEED) # we need this in each cell that calls random

plt.style.use('ggplot')
%matplotlib inline


# scipy
import scipy
from scipy import stats # stats.ttest_ind(x, y, equal_var=False).pvalue
from scipy.stats import fisher_exact


# statsmodels
import statsmodels.api as sm
import statsmodels.stats.api as sms
# sms.proportion.proportions_ztest(counts, nobs, alternative = 'two-sided')

from statsmodels.stats.power import tt_ind_solve_power
# effect_size = sms.proportion_effectsize(prop_control, prop_test) 
# sample_size = tt_ind_solve_power(effect_size=effect_size, nobs1=None, 
#                alpha=0.05, power=0.8, ratio=1, alternative='two-sided')


from mlxtend.evaluate import permutation_test
# p_value = permutation_test(treatment, control,
#                            method='approximate',
#                            num_rounds=10000,
#                            seed=SEED)


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
import bp


def plot_feature_test_control(df,cat,show=True,figsize=(8,8),rot=90):
    """
    Plot control-vs-test counts, count percentages and conversion rates
    for every level of one feature, and save the figure under images/.

    Parameters
    ----------
    df : DataFrame containing the columns `cat`, 'test' and 'conversion'.
        The column renaming below assigns exactly four names, so 'test'
        must have exactly two levels; the lower one (0) is treated as control.
    cat : name of the (categorical) column to group by.
    show : if True, display the figure with plt.show() after saving it.
    figsize : (width, height) of the three-row figure.
    rot : rotation of the x tick labels, in degrees.

    Returns
    -------
    None. Side effect: writes images/{cat}_test_vs_control.png (dpi=300),
    creating the images/ directory first if it does not exist.
    """
    import os  # local import: only needed to create the output folder

    fig, ax = plt.subplots(3, 1, figsize=figsize, sharex=False)

    # per (cat, test) group: row count and mean of 'conversion'; unstacking
    # 'test' gives the columns (count, 0), (count, 1), (mean, 0), (mean, 1)
    dfx = (df.groupby([cat, 'test'])
             .agg({'conversion': ['count', 'mean']})
             .unstack(level='test'))

    # flatten the column MultiIndex; test == 0 is control
    dfx.columns = ['control', 'test',
                   'rate_control', 'rate_test']

    # absolute differences (test minus control)
    dfx['test_control_diff'] = dfx['test'] - dfx['control']
    dfx['rate_test_control_diff'] = dfx['rate_test'] - dfx['rate_control']

    # share of each level within its own arm, in percent
    dfx['control_pct'] = dfx['control'] / dfx['control'].sum() * 100
    dfx['test_pct'] = dfx['test'] / dfx['test'].sum() * 100
    dfx['test_control_diff_pct'] = dfx['test_pct'] - dfx['control_pct']

    # one panel per quantity: (columns to draw, y-axis label)
    bar_colors = ['red', 'green', 'gray']
    panels = [
        (['control', 'test', 'test_control_diff'], 'Count'),
        (['control_pct', 'test_pct', 'test_control_diff_pct'], 'Count Percent'),
        (['rate_control', 'rate_test', 'rate_test_control_diff'], 'Conversion Rate'),
    ]
    for axis, (columns, ylabel) in zip(ax, panels):
        dfx.plot.bar(y=columns, ax=axis, rot=rot, color=bar_colors)
        axis.set_ylabel(ylabel)
        axis.set_xlabel('')

    # title
    fig.suptitle(f'Count and Conversion Rate for {cat}', fontsize=14, weight='bold')
    fig.tight_layout()

    # save the large image; savefig does not create missing directories
    os.makedirs('images', exist_ok=True)
    fig.savefig(f'images/{cat}_test_vs_control.png', dpi=300)

    if show:
        plt.show()

    # release the figure so repeated calls in a loop do not accumulate memory
    plt.close(fig)

    return None


def get_ttest(x,y):
    """Return the p-value of the two-sample t-test of x against y (equal_var=False, i.e. Welch's test)."""
    welch_result = stats.ttest_ind(x, y, equal_var=False)
    return welch_result.pvalue


h = "/Volumes/Media/github/Datasets/business_projects/Spanish_Translation_AB_Test"
!ls $h

test_results.csv test_table.csv   user_table.csv


!head -n 2 $h/test_table.csv

"user_id","date","source","device","browser_language","ads_channel","browser","conversion","test"
315281,2015-12-03,"Direct","Web","ES",NA,"IE",1,0


# Load the experiment table; 'date' is parsed to datetime while reading.
dft = pd.read_csv(f'{h}/test_table.csv', parse_dates=['date'])

print(dft.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same head+tail preview
pd.concat([dft.head(), dft.tail()])

(453321, 9)


# Load the user table.
dfu = pd.read_csv(f'{h}/user_table.csv')

print(dfu.shape)
# info() prints its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output
dfu.info()
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same head+tail preview
pd.concat([dfu.head(), dfu.tail()])

(452867, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452867 entries, 0 to 452866
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  452867 non-null  int64 
 1   sex      452867 non-null  object
 2   age      452867 non-null  int64 
 3   country  452867 non-null  object
dtypes: int64(2), object(2)
memory usage: 13.8+ MB
None


dft.head(2)


dft.dtypes

user_id                      int64
date                datetime64[ns]
source                      object
device                      object
browser_language            object
ads_channel                 object
browser                     object
conversion                   int64
test                         int64
dtype: object


dft.isnull().sum()

user_id                  0
date                     0
source                   0
device                   0
browser_language         0
ads_channel         271444
browser                  0
conversion               0
test                     0
dtype: int64


dft['ads_channel'].value_counts(dropna=False,normalize=True)

NaN         0.598790
Facebook    0.150942
Google      0.150401
Yahoo       0.060520
Bing        0.030197
Other       0.009150
Name: ads_channel, dtype: float64


# there are lots of nans in ads channel, all other features have no nulls.


dfu.isna().sum()

user_id    0
sex        0
age        0
country    0
dtype: int64


len(dft['user_id']) - dft['user_id'].nunique()

0


len(dfu['user_id']) - dfu['user_id'].nunique()

0


# the user id is unique we can combine them.
dft.head(1)


dfu.head(1)


df = dft.merge(dfu,on='user_id',how='inner')
# note: if we do left join there are some users in test data that are not in
# user data and we get NAN values for age.


df.shape

(452867, 12)


df.head(2)


# exclude Spain: it has only a control group and no test group, so test vs control cannot be compared there
df = df[df.country !='Spain']
df.shape

(401085, 12)


df.head(2)


df[df.duplicated()] # no duplicate entries


df['user_id'].nunique() == df.shape[0] # unique user id

True


# date range of test and control
df['date'] = pd.to_datetime(df['date'])
df.groupby('test')['date'].agg(['min','max'])


df['date'].unique()

array(['2015-12-04T00:00:00.000000000', '2015-12-03T00:00:00.000000000',
       '2015-11-30T00:00:00.000000000', '2015-12-02T00:00:00.000000000',
       '2015-12-01T00:00:00.000000000'], dtype='datetime64[ns]')


df['date'].max() - df['date'].min() # we have data of just 4+1 days.

Timedelta('4 days 00:00:00')


df['date'].describe(datetime_is_numeric=True)

count                           401085
mean     2015-12-02 08:56:25.472904704
min                2015-11-30 00:00:00
25%                2015-12-01 00:00:00
50%                2015-12-03 00:00:00
75%                2015-12-04 00:00:00
max                2015-12-04 00:00:00
Name: date, dtype: object


df.head(2)


df[['conversion','test','age']].corr(method='spearman').style.background_gradient() # look at conversion column


for col in df.columns.drop(['user_id','date']):
    print(col)
    print(df[col].unique())
    print()

source
['Ads' 'Direct' 'SEO']

device
['Web' 'Mobile']

browser_language
['ES' 'Other' 'EN']

ads_channel
['Google' 'Facebook' nan 'Bing' 'Yahoo' 'Other']

browser
['IE' 'Android_App' 'FireFox' 'Chrome' 'Iphone_App' 'Safari' 'Opera']

conversion
[0 1]

test
[1 0]

sex
['M' 'F']

age
[21 22 19 35 31 33 37 29 28 27 24 20 45 23 26 18 34 25 32 39 48 40 30 38
 42 43 44 41 36 47 51 49 50 46 53 56 55 52 59 70 54 60 57 62 63 58 61 65
 64 66]

country
['Mexico' 'Venezuela' 'Bolivia' 'Colombia' 'Uruguay' 'El Salvador'
 'Nicaragua' 'Peru' 'Costa Rica' 'Chile' 'Argentina' 'Ecuador' 'Guatemala'
 'Honduras' 'Paraguay' 'Panama']


df.head(2)


sns.FacetGrid(df, col='test').map(sns.countplot,'conversion',order=[0,1])

<seaborn.axisgrid.FacetGrid at 0x7fcb3128c0d0>


df.groupby(['conversion','test'])['user_id'].count().unstack('conversion')\
.plot.bar(color=['tomato','g'])

<matplotlib.axes._subplots.AxesSubplot at 0x7fcb31cafdd0>


df.head(2)


bp.plot_cat_cat(df,'test','conversion')

==================================================
Feature: **test**
Overall Count: 
    1: 53.8%
    0: 46.2%

Total  **conversion_1** distribution:
    1: 51.14%
    0: 48.86%

Per test  **conversion_1** distribution:
    0: 4.83%
    1: 4.34%


# Number of users (control and test together) per country, largest first.
# The trailing semicolon suppresses the Axes repr in the notebook output.
(df.groupby('country')['conversion'].count().sort_values(ascending=False)
   .plot.bar(color=sns.color_palette('husl',20),
             title='sample size (test+control) for each country'));


df.groupby('country')['conversion'].sum().sort_values(ascending=False)\
.plot.bar(color=sns.color_palette('husl',20),
         title='total converted cases per country');


df.groupby('country')['conversion'].mean().sort_values(ascending=False)\
.plot.bar(color=sns.color_palette('husl',20),
          title='mean conversion rate per country'
         );


"""
# highest average conversion rate: spain
# very low average: argentina and uruguay
# other latin american country are almost similar.


Note that there might be bias in sampling for spain vs other countries
and we may need to do t-test to confirm that.

""";


# df.groupby(['country','test']).sum().reset_index()
# spain has no test group, it has only control group


plot_feature_test_control(df,'country',figsize=(12,12))


bp.plot_cat_cat(df,'country','conversion',figsize=(32,24),fontsize=24,show=False)

==================================================
Feature: **country**
Overall Count: 
    Mexico: 32.03%
    Colombia: 13.48%
    Argentina: 11.65%
    Peru: 8.39%
    Venezuela: 7.99%
    Chile: 4.92%
    Ecuador: 3.96%
    Guatemala: 3.77%
    Bolivia: 2.77%
    Honduras: 2.14%
    El Salvador: 2.04%
    Paraguay: 1.83%
    Nicaragua: 1.68%
    Costa Rica: 1.32%
    Uruguay: 1.03%
    Panama: 0.99%

Total  **conversion_1** distribution:
    Mexico: 35.31%
    Colombia: 15.15%
    Peru: 9.24%
    Venezuela: 8.69%
    Chile: 5.36%
    Ecuador: 4.26%
    Guatemala: 4.1%
    Argentina: 3.57%
    Bolivia: 2.95%
    Honduras: 2.3%
    El Salvador: 2.27%
    Nicaragua: 1.96%
    Paraguay: 1.96%
    Costa Rica: 1.55%
    Panama: 1.04%
    Uruguay: 0.29%

Per country  **conversion_1** distribution:
    Costa Rica: 5.35%
    Nicaragua: 5.34%
    Colombia: 5.13%
    El Salvador: 5.08%
    Peru: 5.03%
    Mexico: 5.03%
    Guatemala: 4.97%
    Chile: 4.97%
    Venezuela: 4.97%
    Honduras: 4.93%
    Ecuador: 4.91%
    Paraguay: 4.89%
    Bolivia: 4.86%
    Panama: 4.81%
    Argentina: 1.4%
    Uruguay: 1.28%


"""
1. Spain does not have test group
2. dissimilar test and control group: argentina and uruguay
3. low conversion rate: argentina and uruguay
4. other latin american countries have almost same conversion rate.


control group higher conversion rate (red): 
    argentina, bolivia, colombia, ecuador, el salvador, guatemala, honduras, venezuela 

test group higher conversion rate (green): 
    chile, costa rica, mexico, nicaragua, panama, paraguay, peru, uruguay

conversion rate in control and test group are almost same.

Sample bias: Looking at argentina and uruguay we can see heavy sample imbalance,
             this might mean our sample selection may be biased and we may need
             to exclude these two country in the analysis part.
             
             a) We can not validate the results due to huge sample bias in uruguay and argentina.

""";


df.head(2)


pd.crosstab(df['country'],df['conversion'])


# Per (country, test) arm: number of users and number of conversions.
by_country_arm = df[['country','test','conversion']].groupby(['country','test'], as_index=False)
sample_sizes = by_country_arm.count()   # rows per arm
conversions = by_country_arm.sum()      # converted users per arm

# Both frames carry a 'conversion' column, so the merge suffixes them with _x / _y;
# rename them to what they mean and add the conversion ratio.
df1 = (
    sample_sizes
    .merge(conversions, on=['country','test'])
    .rename(columns={'conversion_x':'sample_size','conversion_y':'conversion'})
)
df1 = df1.assign(conversion_ratio=df1['conversion'] / df1['sample_size'])

df1


df1.groupby('country')['sample_size'].apply(np.ptp).sort_values(ascending=False)

# Argentina and Uruguay has very large difference between test and control size
# we should remove this from analysis.

country
Argentina      28021
Uruguay         3304
Venezuela        244
Ecuador          177
Honduras         154
Guatemala        119
Colombia         116
Nicaragua        115
Peru              72
Mexico            66
Paraguay          47
El Salvador       41
Chile             31
Bolivia           24
Panama            19
Costa Rica        11
Name: sample_size, dtype: int64


df['country'].sort_values().unique()

array(['Argentina', 'Bolivia', 'Chile', 'Colombia', 'Costa Rica',
       'Ecuador', 'El Salvador', 'Guatemala', 'Honduras', 'Mexico',
       'Nicaragua', 'Panama', 'Paraguay', 'Peru', 'Uruguay', 'Venezuela'],
      dtype=object)


# For Argentina and Uruguay the test and control sample sizes differ greatly, so drop them.
# Spain was already filtered out above (it is not part of the test/control comparison); it is listed again only as a safeguard.
df = df[~df['country'].isin(['Argentina','Uruguay','Spain'])]

df['country'].sort_values().unique()

array(['Bolivia', 'Chile', 'Colombia', 'Costa Rica', 'Ecuador',
       'El Salvador', 'Guatemala', 'Honduras', 'Mexico', 'Nicaragua',
       'Panama', 'Paraguay', 'Peru', 'Venezuela'], dtype=object)


df.head(2)


df['date'].describe(datetime_is_numeric=True)

count                           350218
mean     2015-12-02 08:56:32.830751232
min                2015-11-30 00:00:00
25%                2015-12-01 00:00:00
50%                2015-12-03 00:00:00
75%                2015-12-04 00:00:00
max                2015-12-04 00:00:00
Name: date, dtype: object


# Daily mean conversion for the test arm (green) and the control arm (red) on one axis.
fig, ax = plt.subplots(1, 1, figsize=(12, 8))

for arm, line_color in [(1, 'g'), (0, 'r')]:
    daily_rate = df.loc[df['test'] == arm, ['date', 'conversion']].groupby('date').mean()
    daily_rate.plot(ax=ax, color=line_color)

# the lines are drawn test first, then control, so label them in that order
ax.legend(['test', 'control'])
ax.set_title('Conversion Rates by Date')
ax.set_ylabel("Conversion")
plt.show()


"""
We have only 5 days of data: Nov 30, Dec 1, Dec 2, Dec 3, and Dec 4 of 2015.

On the first and last day there is a large difference in conversion rate between
the control and test groups.

On Dec 2 and Dec 4 the test group has the higher conversion rate
(on Dec 3 it is slightly lower). This might mean the A/B test
is working and the test group might have a higher conversion rate as time goes by.

""";


df.head(2)


%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;


for cat in ['source','device','browser_language','ads_channel','browser','sex']:
    plot_feature_test_control(df,cat,figsize=(12,6),rot=0)


plot_feature_test_control(df,'age',figsize=(18,8))
"""
lower plot: 
1. for a given age, mostly control group have more conversion rate.
2. the age 61 seems to be outlier;

""";


df[df.age==61]


df.age.value_counts().plot.bar(figsize=(18,4))
plt.axhline(y=500)

# plt.ylim(0,5000)
# above the age of 49 there are less than 500 samples.

<matplotlib.lines.Line2D at 0x7fcb34e23fd0>


# What is the probability of an individual converting regardless of the  test?
df['conversion'].mean()

0.05028011124499598


# Prob of conversion for each groups: treatment group and control group
df.groupby('test')['conversion'].describe()

# after dropping Argentina and Uruguay the two groups convert at almost the same rate.
"""
On average about 5.01% of the control group and 5.04% of the test group convert
(8,803 / 175,540 and 8,806 / 174,678 respectively).

""";


df.head(2)


from scipy.stats import fisher_exact


def get_fishers_exact_test(df):
    """
    Run Fisher's exact test on the test-by-conversion contingency table of df
    and print the p-value together with a verdict at the 0.05 threshold.

    df must contain the columns 'test' and 'conversion'. Returns None.
    """
    contingency = pd.crosstab(df['test'], df['conversion']).to_numpy()

    # fisher_exact returns (odds ratio, p-value); only the p-value is reported
    _, pvalue = fisher_exact(contingency)

    print(f'Fisher exact test p-value: {pvalue}')
    verdict = ('Two groups are statistically different'
               if pvalue < 0.05
               else 'Two groups are NOT statistically different')
    print(verdict)
        
get_fishers_exact_test(df)

Fisher exact test p-value: 0.7220634320130839
Two groups are NOT statistically different


dfm = df.query("country == 'Mexico'")
get_fishers_exact_test(dfm)

Fisher exact test p-value: 0.16819684442263277
Two groups are NOT statistically different


from scipy import stats

def _country_fisher_pvalue(dfx):
    """Fisher's exact test p-value for one country's test-by-conversion table."""
    table = pd.crosstab(dfx['test'], dfx['conversion']).to_numpy()
    # index 0 of the result is the statistic, index 1 is the p-value
    return stats.fisher_exact(table)[1]


# one p-value per country, with a flag for significance at the 0.05 threshold
df_fisher_exact = (
    df.groupby('country')
      .apply(_country_fisher_pvalue)
      .to_frame('Fishers_exact_pvalue')
)
df_fisher_exact['significant'] = df_fisher_exact['Fishers_exact_pvalue'] < 0.05

df_fisher_exact


"""
Interesting result: before Argentina and Uruguay were dropped, the pooled
conversion rates looked different (4.83% control vs 4.34% test). On the
filtered data used here, Fisher's exact test on the 2x2 contingency table
(control/test vs. not converted/converted) is NOT significant (p = 0.72).

Looking at individual countries, the test is also not significant for
any of them. The observed differences are NOT statistically significant
at the 5% significance level and might simply be due to chance.

""";


df.head(2)


# Group sizes, number of converted users and conversion proportions per arm
# (test == 0 is control, test == 1 is test).
control_rows = df[df['test'] == 0]
test_rows = df[df['test'] == 1]

num_control = len(control_rows)
num_test    = len(test_rows)

num_converted_control = len(control_rows[control_rows['conversion'] == 1])
num_converted_test = len(test_rows[test_rows['conversion'] == 1])

prop_control = num_converted_control/num_control # proportions
prop_test = num_converted_test/num_test

print(f'Number of control            : {num_control:,}')
print(f'Number of test               : {num_test:,}')

print(f'Number of Converted control  : {num_converted_control:,}')
print(f'Number of Converted test     : {num_converted_test:,}')

# four decimals: with two, both proportions round to 0.05 and the gap
# that the following tests examine is invisible
print(f'Converted control proportion : {prop_control:.4f}')
print(f'Converted test proportion    : {prop_test:.4f}')

Number of control            : 175,540
Number of test               : 174,678
Number of Converted control  : 8,803
Number of Converted test     : 8,806
Converted control proportion : 0.05
Converted test proportion    : 0.05


# Overall t-test (equal_var=False) of conversion: control arm vs test arm.
ser_control = df.loc[df['test'] == 0, 'conversion'] # control series
ser_test    = df.loc[df['test'] == 1, 'conversion'] # test series

# quick look at a few control rows
display(ser_control.sample(5,random_state=SEED))

ttest_result = stats.ttest_ind(ser_control, ser_test, equal_var=False)
pvalue = ttest_result.pvalue

print(f'T-test p-value: {pvalue}')
print('Two groups are statistically different'
      if pvalue < 0.05
      else 'Two groups are NOT statistically different')

242005    0
362081    1
448747    0
221438    0
334631    0
Name: conversion, dtype: int64

T-test p-value: 0.7200849282884506
Two groups are NOT statistically different


from scipy import stats


# Test and control groups have different numbers of samples.
# For each country run the t-test (equal_var=False) twice: once on all rows and
# once on an equal-sized random sample of (up to) N_SAMPLES rows per arm.
N_SAMPLES = 400
SEED = 100

pvalue_columns = ['country', 'test_size', 'control_size',
                  'test_rate_full','control_rate_full','p_value_full',
                  'test_rate_sample','control_rate_sample','p_value_sample',
                 ]

# collect one record per country and build the frame once at the end;
# growing a DataFrame row by row is slow and leaves every column as object dtype
pvalue_rows = []

for country in df['country'].unique():
    in_country = df['country'] == country

    # conversion flags of each arm in this country
    test_country    = df.loc[in_country & (df['test'] == 1),'conversion']
    control_country = df.loc[in_country & (df['test'] == 0),'conversion']

    # never ask for more rows than the smaller arm has, otherwise .sample raises
    n_sample = min(N_SAMPLES, len(test_country), len(control_country))
    test_country_sample    = test_country.sample(n_sample,random_state=SEED)
    control_country_sample = control_country.sample(n_sample,random_state=SEED)

    pvalue_rows.append({
        'country': country,
        'test_size': len(test_country),
        'control_size': len(control_country),
        # conversion rate is the mean of the 0/1 conversion flag
        'test_rate_full': test_country.mean(),
        'control_rate_full': control_country.mean(),
        'p_value_full': stats.ttest_ind(test_country, control_country,
                                        equal_var=False).pvalue,
        'test_rate_sample': test_country_sample.mean(),
        'control_rate_sample': control_country_sample.mean(),
        'p_value_sample': stats.ttest_ind(test_country_sample, control_country_sample,
                                          equal_var=False).pvalue,
    })

df_pvalues = pd.DataFrame(pvalue_rows, columns=pvalue_columns)

df_pvalues.sort_values('p_value_sample')


def get_ttest(x,y):
    """p-value of scipy's independent two-sample t-test on x and y, without assuming equal variances."""
    _, p_value = stats.ttest_ind(x, y, equal_var=False)
    return p_value


def _country_ttest_pvalue(dfx):
    """get_ttest p-value of control (test == 0) vs test (test == 1) conversions for one country."""
    control_conv = dfx.loc[dfx['test'] == 0, 'conversion']
    test_conv = dfx.loc[dfx['test'] == 1, 'conversion']
    return get_ttest(control_conv, test_conv)


# size and mean conversion of every (country, test) arm, one row per country
arm_stats = df.groupby(['country','test'])['conversion'].agg(['size','mean'])
df_ttest = arm_stats.unstack('test')

# flatten the columns; test == 0 is control
df_ttest.columns = ['control_size','test_size','control_rate','test_rate']

# per-country p-values are aligned to df_ttest through the shared country index
df_ttest['pvalue'] = df.groupby('country').apply(_country_ttest_pvalue)

df_ttest['is_test_rate_higher'] = df_ttest['test_rate'].gt(df_ttest['control_rate'])
df_ttest['is_Ttest_significant'] = df_ttest['pvalue'].lt(0.05)

df_ttest.sort_values('pvalue')


"""
Some countries show a higher conversion rate in the test group than in the control group.

+ve change: ['Chile', 'Costa Rica', 'Mexico', 'Nicaragua', 'Panama', 'Paraguay',
       'Peru']
       
       
-ve change: ['Bolivia', 'Colombia', 'Ecuador', 'El Salvador',
       'Guatemala', 'Honduras', 'Venezuela']
  
       
Result of t-test
-----------------
H0: mean of two groups (converted or not) is same
H1: is not same

At the 5% significance level the p-values for all the countries are larger than
0.05, so we cannot reject the null hypothesis: there is no statistically
significant difference between the means of the two groups.


""";

positive_change_countries = df_ttest.query("is_test_rate_higher==True").index
positive_change_countries

Index(['Chile', 'Costa Rica', 'Mexico', 'Nicaragua', 'Panama', 'Paraguay',
       'Peru'],
      dtype='object', name='country')


negative_change_countries = df_ttest.index.drop(positive_change_countries)
negative_change_countries

Index(['Bolivia', 'Colombia', 'Ecuador', 'El Salvador', 'Guatemala',
       'Honduras', 'Venezuela'],
      dtype='object', name='country')


import statsmodels.stats.api as sms
from statsmodels.stats.power import tt_ind_solve_power

# Effect size between the two observed conversion proportions.
effect_size = sms.proportion_effectsize(prop_control, prop_test)

# ratio = 1 : both groups should have the same size
# power = 0.8 : commonly accepted power for an analysis; it corresponds to a
#               20% chance of a Type II error (false negative)
power_solver = sms.NormalIndPower()
sample_size = power_solver.solve_power(effect_size=effect_size,
                                       alpha=0.05,
                                       power=0.8,
                                       ratio=1)
print(f'sample size required per group: {sample_size:,.0f}')

sample size required per group: 10,702,929


effect_size = sms.proportion_effectsize(prop_control, prop_test) 

sample_size = tt_ind_solve_power(effect_size=effect_size, nobs1=None, alpha=0.05,
                       power=0.8, ratio=1, alternative='two-sided')

print(f'sample size required per group: {sample_size:,.0f}')

sample size required per group: 10,702,930


df['conversion'].value_counts()

0    332609
1     17609
Name: conversion, dtype: int64


"""
control group: 175,540 << 10.7M
test group:    174,678 << 10.7M  (We can not get statistical power of 0.8)

Our dataset is far too small: to detect a difference as small as the observed one
we would need about 10.7 million users per group to have a statistical power of 0.8.

Statistical power of 0.8 means, we have 20% chance of Type II error.

Type II error is False Negative. p-value is pessimistically large and we 
do not reject the Null hypothesis when there is significant effect.

We should note that our data has only 5 days of conversion data,
the large required sample size means we need to continue A/B testing for 
few months until we get the required amount of data.

Then, after few more months we should repeat the experiment to make
sure our analysis was correct and then implement the changes in real life.

""";


treatment = df.loc[df.test == 1, 'conversion']
control = df.loc[df.test == 0, 'conversion']


%%time

from mlxtend.evaluate import permutation_test

p_value = permutation_test(treatment, control,
                           method='approximate',
                           num_rounds=100,
                           seed=SEED)
print(p_value)

0.7722772277227723
CPU times: user 836 ms, sys: 7.21 ms, total: 843 ms
Wall time: 986 ms


pd.crosstab(df['test'],df['conversion'],margins=True)


df[df.conversion==1]['test'].value_counts()

1    8806
0    8803
Name: test, dtype: int64


df[df.conversion==1]['test'].value_counts().sort_index()

0    8803
1    8806
Name: test, dtype: int64


counts = pd.crosstab(df['test'],df['conversion'])[1].to_numpy() # gives count of test 0 and 1 ie control and test
counts

array([8803, 8806])


df['test'].value_counts()

0    175540
1    174678
Name: test, dtype: int64


df['test'].value_counts().sort_index()

0    175540
1    174678
Name: test, dtype: int64


nobs = df['test'].value_counts().sort_index().to_numpy()
nobs

array([175540, 174678])


pd.crosstab(df['test'],df['conversion'],margins=True)


# Two-sample z-test for the difference between the control and test conversion proportions.
# counts = np.array([num_conversion_control, num_conversion_test])
# nobs = np.array([num_control, num_test])
# NOTE(review): this rebinds `sms`. Earlier cells import `statsmodels.stats.api as sms`;
# after this line `sms` is the `statsmodels.stats` package instead, so re-running the
# earlier power-analysis cell (sms.proportion_effectsize / sms.NormalIndPower) may fail
# until its own import is executed again -- confirm before reordering cells.
import statsmodels.stats as sms


# sort_index() puts test == 0 (control) first and test == 1 second in both arrays
counts = df[df.conversion==1]['test'].value_counts().sort_index().to_numpy()
nobs   = df['test'].value_counts().sort_index().to_numpy()

zscore, pvalue = sms.proportion.proportions_ztest(counts, nobs, alternative = 'two-sided')
print('zscore = {:.3f}, pvalue = {:.3f}'.format(zscore, pvalue))

zscore = -0.358, pvalue = 0.720


df_proportions_ztest = df.groupby('country').apply(lambda dfx:
    sms.proportion.proportions_ztest(
        dfx[dfx.conversion==1]['test'].value_counts().sort_index().to_numpy(),
        dfx['test'].value_counts().sort_index().to_numpy(),
        alternative='two-sided'
                           )[1] # 0 is statistic and 1 is pvalue
                           ).to_frame()

df_proportions_ztest.columns = ['proportions_ztest_pvalue']
df_proportions_ztest['significant'] = df_proportions_ztest['proportions_ztest_pvalue'] < 0.05

df_proportions_ztest


# sanity check for a country
country = 'Peru'
counts = df.query("country == @country and conversion == 1")['test'].value_counts().sort_index().to_numpy()
nobs   = df.query("country == @country ")['test'].value_counts().sort_index().to_numpy()

zscore, pvalue = sms.proportion.proportions_ztest(counts, nobs, alternative = 'two-sided')
print('zscore = {:.3f}, pvalue = {:.6f}'.format(zscore, pvalue))

zscore = -0.290, pvalue = 0.771942


import statsmodels.api as sm
import statsmodels.formula.api as smf


# The Logit model below is given its design matrix explicitly, so add a constant
# column to act as the intercept.
# `df` is the result of boolean filtering; assign() returns a new frame instead of
# writing into that slice, which avoids pandas' SettingWithCopy warning.
df = df.assign(intercept=1)
df.head()


model = sm.Logit(df['conversion'], df[['intercept','test']])

results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.199339
         Iterations 7


# Include more variables
"""
Adding more features have its own advantages and disadvantages:

Advantages:
we might get the better fit


Disadvantages:
model complexity increases, chances of overfitting increases.
features may be correlated in themselves, which harms the fitting.

""";


# add the effect of countries

# One-hot encode 'country'; drop_first=True drops the first level so it acts as
# the baseline and the dummies are not collinear with the intercept column.
# dtype=int: newer pandas returns bool dummy columns by default, and a frame mixing
# int and bool columns reaches statsmodels as an object array that Logit rejects.
df1 = pd.get_dummies(df[['conversion', 'intercept', 'test', 'country']],
                     columns=['country'],
                     drop_first=True,
                     dtype=int)

df1.head(2)


model = sm.Logit(df1['conversion'], df1.drop('conversion',axis=1))

results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.199330
         Iterations 7


"""
** The test column is NOT significant (p-value = 0.72 > 0.05),
   and none of the country dummies is significant either.
   
** Test group has NO effect on conversion, when we consider the effects of
countries. Which we had already seen in country-wise t-test and fisher_exact_test.

""";


"""
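1. The experiment ran for a very short time, just 5 days (Nov 30 - Dec 4, 2015).
This may not be sufficient to make decisions.

2. On the overall data (Argentina and Uruguay excluded), the difference in
conversion is NOT significant on the t-test, Fisher's exact test, permutation
test and proportion z-test (p-values between 0.72 and 0.77).

Looking at individual countries, the difference is not significant for any country either.
The unbalanced test/control split in Argentina and Uruguay suggests that country
is a confounding feature.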


Limitations:
------------------
** If we have large number of days of observation, we could see the effects
of weekends and holidays on test group clicks.


""";


# Report the wall-clock time since the first cell set time_start_notebook.
# Dedicated names are used on purpose: the original reused `h`, which
# overwrote the dataset path variable defined near the top of the notebook.
time_taken = time.time() - time_start_notebook
elapsed_hours, remaining_secs = divmod(time_taken, 60*60)
elapsed_mins, elapsed_secs = divmod(remaining_secs, 60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(elapsed_hours, elapsed_mins, elapsed_secs))

Time taken to run whole notebook: 0 hr 1 min 3 secs

	conversion	test	age
conversion	1.000000	-0.011656	0.000138
test	-0.011656	1.000000	-0.001620
age	0.000138	-0.001620	1.000000

conversion	0	1
country
Argentina	46079	654
Bolivia	10583	541
Chile	18756	981
Colombia	51285	2775
Costa Rica	5025	284
Ecuador	15115	780
El Salvador	7760	415
Guatemala	14374	751
Honduras	8146	422
Mexico	122016	6468
Nicaragua	6364	359
Panama	3761	190
Paraguay	6988	359
Peru	31974	1692
Uruguay	4081	53
Venezuela	30462	1592

	country	test	sample_size	conversion	conversion_ratio
0	Argentina	0	9356	141	0.015071
1	Argentina	1	37377	513	0.013725
2	Bolivia	0	5550	274	0.049369
3	Bolivia	1	5574	267	0.047901
4	Chile	0	9853	474	0.048107
5	Chile	1	9884	507	0.051295
6	Colombia	0	27088	1411	0.052089
7	Colombia	1	26972	1364	0.050571
8	Costa Rica	0	2660	139	0.052256
9	Costa Rica	1	2649	145	0.054738
10	Ecuador	0	8036	395	0.049154
11	Ecuador	1	7859	385	0.048988
12	El Salvador	0	4108	220	0.053554
13	El Salvador	1	4067	195	0.047947
14	Guatemala	0	7622	386	0.050643
15	Guatemala	1	7503	365	0.048647
16	Honduras	0	4361	222	0.050906
17	Honduras	1	4207	200	0.047540
18	Mexico	0	64209	3178	0.049495
19	Mexico	1	64275	3290	0.051186
20	Nicaragua	0	3419	180	0.052647
21	Nicaragua	1	3304	179	0.054177
22	Panama	0	1966	92	0.046796
23	Panama	1	1985	98	0.049370
24	Paraguay	0	3650	177	0.048493
25	Paraguay	1	3697	182	0.049229
26	Peru	0	16869	842	0.049914
27	Peru	1	16797	850	0.050604
28	Uruguay	0	415	5	0.012048
29	Uruguay	1	3719	48	0.012907
30	Venezuela	0	16149	813	0.050344
31	Venezuela	1	15905	779	0.048978

	country	test_size	control_size	test_rate_full	control_rate_full	p_value_full	test_rate_sample	control_rate_sample	p_value_sample
8	Chile	9884	9853	0.051295	0.048107	0.302848	0.0625	0.0350	0.071102
12	Paraguay	3697	3650	0.049229	0.048493	0.883697	0.0600	0.0375	0.139878
4	El Salvador	4067	4108	0.047947	0.053554	0.248127	0.0375	0.0550	0.239193
0	Mexico	64275	64209	0.051186	0.049495	0.165544	0.0500	0.0350	0.293590
5	Nicaragua	3304	3419	0.054177	0.052647	0.780400	0.0525	0.0700	0.302623
1	Venezuela	15905	16149	0.048978	0.050344	0.573702	0.0475	0.0625	0.352748
6	Peru	16797	16869	0.050604	0.049914	0.771953	0.0375	0.0500	0.388075
3	Colombia	26972	27088	0.050571	0.052089	0.423719	0.0425	0.0350	0.583170
13	Panama	1985	1966	0.049370	0.046796	0.705327	0.0500	0.0425	0.614077
9	Ecuador	7859	8036	0.048988	0.049154	0.961512	0.0625	0.0550	0.652445
2	Bolivia	5574	5550	0.047901	0.049369	0.718885	0.0425	0.0475	0.733424
10	Guatemala	7503	7622	0.048647	0.050643	0.572107	0.0425	0.0450	0.862977
11	Honduras	4207	4361	0.047540	0.050906	0.471463	0.0525	0.0500	0.872813
7	Costa Rica	2649	2660	0.054738	0.052256	0.687876	0.0350	0.0350	1.000000

	control_size	test_size	control_rate	test_rate	pvalue	is_test_rate_higher	is_Ttest_significant
country
Mexico	64209	64275	0.049495	0.051186	0.165544	True	False
El Salvador	4108	4067	0.053554	0.047947	0.248127	False	False
Chile	9853	9884	0.048107	0.051295	0.302848	True	False
Colombia	27088	26972	0.052089	0.050571	0.423719	False	False
Honduras	4361	4207	0.050906	0.047540	0.471463	False	False
Guatemala	7622	7503	0.050643	0.048647	0.572107	False	False
Venezuela	16149	15905	0.050344	0.048978	0.573702	False	False
Costa Rica	2660	2649	0.052256	0.054738	0.687876	True	False
Panama	1966	1985	0.046796	0.049370	0.705327	True	False
Bolivia	5550	5574	0.049369	0.047901	0.718885	False	False
Peru	16869	16797	0.049914	0.050604	0.771953	True	False
Nicaragua	3419	3304	0.052647	0.054177	0.780400	True	False
Paraguay	3650	3697	0.048493	0.049229	0.883697	True	False
Ecuador	8036	7859	0.049154	0.048988	0.961512	False	False

Table of Contents

Problem¶

Imports¶

Useful Scripts¶

Load the data¶

Data Processing¶

Missing Values¶

Combine the datasets¶

Sanity checks¶

EDA¶

Correlations¶

Unique Values¶

Conversion distribution among test and control group¶

Country¶

Test vs control for each country¶

Conversion Rates For Date¶

EDA of other categorical features¶

EDA for continuous variables¶

Statistics¶

Overall Statistics¶

Fishers exact test for 2 by 2 contingency table¶

T-test for each countries¶

Sample size check for power analysis¶

Permutation Test¶

Proportion z-test¶

Regression Approach¶

Conclusions¶

Total run time¶

	user_id	date	source	device	browser_language	ads_channel	browser	conversion	test
0	315281	2015-12-03	Direct	Web	ES	NaN	IE	1	0
1	497851	2015-12-04	Ads	Web	ES	Google	IE	0	1
2	848402	2015-12-04	Ads	Web	ES	Facebook	Chrome	0	0
3	290051	2015-12-03	Ads	Mobile	Other	Facebook	Android_App	0	1
4	548435	2015-11-30	Ads	Web	ES	Google	FireFox	0	1
453316	425010	2015-12-04	SEO	Web	ES	NaN	FireFox	0	0
453317	826793	2015-12-01	SEO	Mobile	ES	NaN	Android_App	0	1
453318	514870	2015-12-02	Ads	Mobile	ES	Bing	Android_App	0	0
453319	785224	2015-12-04	SEO	Mobile	ES	NaN	Android_App	0	1
453320	241662	2015-12-04	Ads	Web	ES	Facebook	Chrome	0	1

	user_id	sex	age	country
0	765821	M	20	Mexico
1	343561	F	27	Nicaragua
2	118744	M	23	Colombia
3	987753	F	27	Venezuela
4	554597	F	20	Spain
452862	756215	F	27	Venezuela
452863	36888	M	18	Argentina
452864	800559	M	28	Bolivia
452865	176584	M	19	Chile
452866	314649	M	24	Mexico

	user_id	date	source	device	browser_language	ads_channel	browser	test	sex	age	country
65282	194408	2015-12-02	Ads	Mobile	ES	Google	Iphone_App	0	M	61	Mexico
91592	559580	2015-12-04	SEO	Web	EN	NaN	IE	1	M	61	Ecuador
276375	859504	2015-12-01	Ads	Web	ES	Google	Chrome	0	M	61	Chile
325023	959676	2015-11-30	SEO	Web	ES	NaN	Safari	0	M	61	Mexico
419907	183731	2015-11-30	Ads	Web	ES	Bing	Chrome	1	M	61	Mexico

	Fishers_exact_pvalue	significant
country
Bolivia	0.724576	False
Chile	0.310034	False
Colombia	0.424323	False
Costa Rica	0.714544	False
Ecuador	0.970720	False
El Salvador	0.267651	False
Guatemala	0.574713	False
Honduras	0.484754	False
Mexico	0.168197	False
Nicaragua	0.786442	False
Panama	0.710785	False
Paraguay	0.913838	False
Peru	0.783792	False
Venezuela	0.589273	False

	proportions_ztest_pvalue	significant
country
Bolivia	0.718847	False
Chile	0.302847	False
Colombia	0.423724	False
Costa Rica	0.687796	False
Ecuador	0.961509	False
El Salvador	0.248198	False
Guatemala	0.572135	False
Honduras	0.471663	False
Mexico	0.165545	False
Nicaragua	0.780312	False
Panama	0.705275	False
Paraguay	0.883682	False
Peru	0.771942	False
Venezuela	0.573726	False

Dep. Variable:	conversion	No. Observations:	350218
Model:	Logit	Df Residuals:	350216
Method:	MLE	Df Model:	1
Date:	Tue, 11 May 2021	Pseudo R-squ.:	9.197e-07
Time:	22:27:45	Log-Likelihood:	-69812.
converged:	True	LL-Null:	-69812.
Covariance Type:	nonrobust	LLR p-value:	0.7201

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	-2.9413	0.011	-268.959	0.000	-2.963	-2.920
test	0.0055	0.015	0.358	0.720	-0.025	0.036

	conversion	intercept	test	country_Chile	country_Colombia	country_Costa Rica	country_Ecuador	country_El Salvador	country_Guatemala	country_Honduras	country_Mexico	country_Nicaragua	country_Panama	country_Paraguay	country_Peru	country_Venezuela
1	0	1	1	0	0	0	0	0	0	0	1	0	0	0	0	0
3	0	1	1	0	0	0	0	0	0	0	1	0	0	0	0	0

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	-2.9764	0.045	-66.499	0.000	-3.064	-2.889
test	0.0056	0.015	0.359	0.720	-0.025	0.036
country_Chile	0.0229	0.055	0.417	0.677	-0.085	0.131
country_Colombia	0.0568	0.048	1.180	0.238	-0.038	0.151
country_Costa Rica	0.1004	0.075	1.334	0.182	-0.047	0.248
country_Ecuador	0.0095	0.057	0.165	0.869	-0.103	0.122
country_El Salvador	0.0451	0.067	0.674	0.500	-0.086	0.176
country_Guatemala	0.0218	0.058	0.378	0.706	-0.091	0.135
country_Honduras	0.0134	0.067	0.201	0.841	-0.117	0.144
country_Mexico	0.0363	0.046	0.791	0.429	-0.054	0.126
country_Nicaragua	0.0985	0.070	1.410	0.159	-0.038	0.236
country_Panama	-0.0118	0.086	-0.137	0.891	-0.181	0.158
country_Paraguay	0.0049	0.070	0.071	0.944	-0.132	0.142
country_Peru	0.0346	0.051	0.683	0.495	-0.065	0.134
country_Venezuela	0.0221	0.051	0.434	0.665	-0.078	0.122