import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()

import plotly_express as px
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)

SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv

json           2.0.9
plotly_express 0.4.1
pandas         1.1.0
autopep8       1.5.2
numpy          1.18.4
plotly         4.9.0
seaborn        0.11.0


# local imports
module_path = "/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/"
sys.path.append(module_path)

from pandas_api import BPAccessor
from plot_utils import add_text_barplot
from plot_plotly import plotly_cat_binn_zero_one
from plot_plotly import plotly_pieplots


def show_methods(obj, ncols=4,contains=None):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df


path_raw = '../data/raw/telco_customer_churn.csv'
df = pd.read_csv(path_raw)

print(df.shape)
df.head(2).append(df.tail(2))

(7043, 21)


target_name = 'Churn'


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB


df.bp.describe()


"""
Look at the feature with datatype object,
the feature TotalCharges should be float but it is object.

""";


cols_drop = ['customerID']
df = df.drop(cols_drop,axis=1)


df = df.rename(columns=lambda x: x[0].capitalize()+x[1:].strip())
df.head()


cols_obj = df.select_dtypes('object').columns.tolist()
print(cols_obj)

['Gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']


# exclude total charges
cols_obj2 = [i for i in cols_obj if i not in ['TotalCharges']]
df[cols_obj2].apply(pd.Series.nunique).sort_values()

Gender              2
Partner             2
Dependents          2
PhoneService        2
PaperlessBilling    2
Churn               2
MultipleLines       3
InternetService     3
OnlineSecurity      3
OnlineBackup        3
DeviceProtection    3
TechSupport         3
StreamingTV         3
StreamingMovies     3
Contract            3
PaymentMethod       4
dtype: int64


df[cols_obj2].apply(lambda x: list(pd.Series.unique(x)))

Gender                                                                                    [Female, Male]
Partner                                                                                        [Yes, No]
Dependents                                                                                     [No, Yes]
PhoneService                                                                                   [No, Yes]
MultipleLines                                                                [No phone service, No, Yes]
InternetService                                                                   [DSL, Fiber optic, No]
OnlineSecurity                                                            [No, Yes, No internet service]
OnlineBackup                                                              [Yes, No, No internet service]
DeviceProtection                                                          [No, Yes, No internet service]
TechSupport                                                               [No, Yes, No internet service]
StreamingTV                                                               [No, Yes, No internet service]
StreamingMovies                                                           [No, Yes, No internet service]
Contract                                                            [Month-to-month, One year, Two year]
PaperlessBilling                                                                               [Yes, No]
PaymentMethod       [Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)]
Churn                                                                                          [No, Yes]
dtype: object


df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')
df[df.TotalCharges.isna()].shape # bad rows

(11, 20)


df['TotalCharges'] = df['TotalCharges'].fillna(0)
# these people did not pay any charges.


target_name = 'Churn'


fig = px.histogram(df, x=target_name,
                   color=target_name,
                   width=600, height=400,
                  )
fig.show()


from plot_plotly import plotly_cat_binn_zero_one


plotly_cat_binn_zero_one(df,'SeniorCitizen',target_name,
                         zero='No',one='Yes',name='Churn')


from plot_plotly import plotly_pieplots


cols_bkg = ['Gender','SeniorCitizen','Partner','Dependents']
plotly_pieplots(df,cols_bkg,2,2,title='Customer Background')


cols_comm = ['Contract','PaperlessBilling','PaymentMethod']
plotly_pieplots(df,cols_comm,1,3,height=400,title='Commercial Features')


cols_service = ['PhoneService','MultipleLines','InternetService',
                'OnlineSecurity','OnlineBackup','DeviceProtection',
                'TechSupport','StreamingTV','StreamingMovies']
plotly_pieplots(df,cols_service,3,3,title='Service Features')

Data Description

Load the libraries

Useful Scripts

Load the Data

Exploratory Data Analysis (EDA)

Delete Unwanted Features

Change Column names

Unique Value Counts

Data Types

Target Distribution

Categorical Features

Customer Background

Commercial Features

Service Features

	customerID	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	0	Yes	No	1	No	No phone service	DSL	No	Yes	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	0	No	No	34	Yes	No	DSL	Yes	No	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
7041	8361-LTMKD	Male	1	Yes	No	4	Yes	Yes	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Mailed check	74.40	306.6	Yes
7042	3186-AJIEK	Male	0	No	No	66	Yes	No	Fiber optic	Yes	No	Yes	Yes	Yes	Yes	Two year	Yes	Bank transfer (automatic)	105.65	6844.5	No

	Feature	Type	N	Count	Unique	Zeros	ZerosPct	mean	std	min	max	25%	50%	75%
2	SeniorCitizen	int64	7043	7043	2	5901	83.79	0.16	0.37	0.00	1.00	0.00	0.00	0.00
5	tenure	int64	7043	7043	73	11	0.16	32.37	24.56	0.00	72.00	9.00	29.00	55.00
0	customerID	object	7043	7043	7043	0	0.00
1	gender	object	7043	7043	2	0	0.00
3	Partner	object	7043	7043	2	0	0.00
4	Dependents	object	7043	7043	2	0	0.00
6	PhoneService	object	7043	7043	2	0	0.00
7	MultipleLines	object	7043	7043	3	0	0.00
8	InternetService	object	7043	7043	3	0	0.00
9	OnlineSecurity	object	7043	7043	3	0	0.00
10	OnlineBackup	object	7043	7043	3	0	0.00
11	DeviceProtection	object	7043	7043	3	0	0.00
12	TechSupport	object	7043	7043	3	0	0.00
13	StreamingTV	object	7043	7043	3	0	0.00
14	StreamingMovies	object	7043	7043	3	0	0.00
15	Contract	object	7043	7043	3	0	0.00
16	PaperlessBilling	object	7043	7043	2	0	0.00
17	PaymentMethod	object	7043	7043	4	0	0.00
18	MonthlyCharges	float64	7043	7043	1585	0	0.00	64.76	30.09	18.25	118.75	35.50	70.35	89.85
19	TotalCharges	object	7043	7043	6531	0	0.00
20	Churn	object	7043	7043	2	0	0.00