import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()

SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv

json     2.0.9
numpy    1.18.4
seaborn  0.11.0
autopep8 1.5.2
pandas   1.1.0


# local imports
module_path = "/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/"
sys.path.append(module_path)

from pandas_api import BPAccessor
from plot_utils import add_text_barplot


def show_methods(obj, ncols=4,contains=None):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df


path_raw = '../data/raw/telco_customer_churn.csv'
df = pd.read_csv(path_raw)

print(df.shape)
df.head(2).append(df.tail(2))

(7043, 21)


target_name = 'Churn'


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB


df.bp.describe()


"""
Look at the feature with datatype object,
the feature TotalCharges should be float but it is object.

""";


cols_drop = ['customerID']
df = df.drop(cols_drop,axis=1)


df = df.rename(columns=lambda x: x[0].capitalize()+x[1:].strip())
df.head()


cols_obj = df.select_dtypes('object').columns.tolist()
print(cols_obj)

['Gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']


# exclude total charges
cols_obj2 = [i for i in cols_obj if i not in ['TotalCharges']]

df[cols_obj2].apply(pd.Series.nunique).sort_values()

Gender              2
Partner             2
Dependents          2
PhoneService        2
PaperlessBilling    2
Churn               2
MultipleLines       3
InternetService     3
OnlineSecurity      3
OnlineBackup        3
DeviceProtection    3
TechSupport         3
StreamingTV         3
StreamingMovies     3
Contract            3
PaymentMethod       4
dtype: int64


df[cols_obj2].apply(lambda x: list(pd.Series.unique(x)))

Gender                                                                                    [Female, Male]
Partner                                                                                        [Yes, No]
Dependents                                                                                     [No, Yes]
PhoneService                                                                                   [No, Yes]
MultipleLines                                                                [No phone service, No, Yes]
InternetService                                                                   [DSL, Fiber optic, No]
OnlineSecurity                                                            [No, Yes, No internet service]
OnlineBackup                                                              [Yes, No, No internet service]
DeviceProtection                                                          [No, Yes, No internet service]
TechSupport                                                               [No, Yes, No internet service]
StreamingTV                                                               [No, Yes, No internet service]
StreamingMovies                                                           [No, Yes, No internet service]
Contract                                                            [Month-to-month, One year, Two year]
PaperlessBilling                                                                               [Yes, No]
PaymentMethod       [Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)]
Churn                                                                                          [No, Yes]
dtype: object


df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],
                                   errors='coerce')

df[df.TotalCharges.isna()].shape # bad rows

(11, 20)


df['TotalCharges'] = df['TotalCharges'].fillna(0)
# these people did not pay any charges.


target_name = 'Churn'


df.bp.plot_cat(target_name,colors=['green','brown'])


"""
Takeaway:
The dataset is highly imbalanced.
This is churn and no-churn classification, churn may be more important.
We can take recall or f1 score as the metric of evaluation rather than accuracy.

""";


cols_num = list(df.select_dtypes('number').columns)
cols_num = [i for i in cols_num if i not in ['SeniorCitizen']]
cols_num

['Tenure', 'MonthlyCharges', 'TotalCharges']


df.bp.plot_num_cat(cols_num[0],target_name)


# low tenure employees are more likely to churn.


df.bp.plot_num_cat(cols_num[1],target_name)


# if monthly charge is high ==> more churn.


df.bp.plot_num_cat(cols_num[2],target_name)


# if total charges is small ==> more churn.


def plot_binned(df,col):
    plt.figure(figsize=(20,5))
    plt.title(f'{col} (Binned) Churn Comparison',fontsize=14)
    sns.countplot(data=df, x=col,
        hue='Churn',
        palette=['green','tomato'],
        order=sorted(df[col].value_counts().index))


col = 'Tenure'
bins = [12*i for i in range(6)] + [np.inf]
labels = list(range(1,7))
labels = [f"{i} year" for i in labels]
df[col+'_cat'] = pd.cut(df[col],bins=bins,labels=labels,
                          include_lowest=True,right=False)

plot_binned(df,'Tenure_cat')


col = 'MonthlyCharges'
bins = [-np.inf] + [20*i for i in range(1,6)] + [np.inf]
labels = [0] +bins[1:-1]
df[col+'_cat'] = pd.cut(df[col],bins=bins,labels=labels,
                          include_lowest=True,right=False)
plot_binned(df,'MonthlyCharges_cat')


cols_cat = [i for i in cols_obj if i not in [target_name,'TotalCharges']]
cols_cat += ['SeniorCitizen']
print(len(cols_cat), cols_cat)

df[cols_cat].apply(pd.Series.nunique) # Make sure you dont have too large numbers

16 ['Gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']

Gender              2
Partner             2
Dependents          2
PhoneService        2
MultipleLines       3
InternetService     3
OnlineSecurity      3
OnlineBackup        3
DeviceProtection    3
TechSupport         3
StreamingTV         3
StreamingMovies     3
Contract            3
PaperlessBilling    2
PaymentMethod       4
SeniorCitizen       2
dtype: int64


col = 'Gender'
df.bp.plot_cat_cat(col,target_name)

==================================================
Feature: **Gender**
Overall Count: 
    Male: 50.48%
    Female: 49.52%

Total  **Churn_Yes** distribution:
    Female: 50.24%
    Male: 49.76%

Per Gender  **Churn_Yes** distribution:
    Female: 26.92%
    Male: 26.16%


"""
Let's understand the chart.

Look at bottom chart:
  Out of 100% customers, 50.48% are male.
  Out of 100% customers, 49.52% are female.
  The company is pretty inclusive.

Look at middle right plot:
  Out of 100% people who churned, 49.76% are male.
  Out of 100% people who churned, 50.24% are female.
  The difference is not significant.

Look at top right plot:
  Out of 100% Males,   26.16% churned.
  Out of 100% Females, 26.92% churned.
  The difference is not significant.

""";


from plot_ds import plot_cat_binn


for col in cols_cat:
    rot = 30 if col in ['PaymentMethod'] else 0
    plot_cat_binn(df, col,target_name,is_1_good=False,rot=rot,save=False)

# look at higher values of left red bar plots


# pd.crosstab(df.Churn, df.SeniorCitizen,margins = True)


sns.countplot(x= 'SeniorCitizen', hue='Churn', data=df)

<matplotlib.axes._subplots.AxesSubplot at 0x7ff4bdca10d0>


from ds_ds import report_cat_binn


# for cat in cols_cat:
#     report_cat_binn(df,cat,'Churn', one='Yes',name='Churn')


"""
Summary of Churn Analysis


Feature: Gender
==================================================
Not much difference.

Feature: Partner
==================================================
Churn_Partner_No  : 1200  (64.21% of 1869  total Churn and 
                           32.96% of 3641  group Partner_No)
If no parter==> MORE churn.

Feature: Dependents
==================================================
Churn_Dependents_No  : 1543  (82.56% of 1869  total Churn)
It there are no dependents==> MORE churn.

Feature: PhoneService
==================================================
Churn_PhoneService_No  : 1699  (90.90% of 1869  total Churn
If no phone service==> MORE churn
Note: There are only 9.68% who does not have phone service.

Feature: MultipleLines
==================================================
Churn_MultipleLines_Yes              : 170   ( 9.10% of 1869  total Churn)
If they have multiple lines==> LESS churn.

Feature: InternetService
==================================================
Churn_InternetService_DSL         : 1297  (69.40% of 1869  total Churn)
If service is DL==> MORE churn.

==================================================
Churn_OnlineSecurity_No : 1461  (78.17% of 1869  total Churn)
If there is no online service==> MORE churn.

==================================================
Churn_OnlineBackup_No  : 1233  (65.97% of 1869  total Churn)
If no online backup ==> MORE churn.

Feature: DeviceProtection
==================================================
Churn_DeviceProtection_No : 1211  (64.79% of 1869  total Churn)
If no device protection ==> MORE churn.

Feature: TechSupport
==================================================
Churn_TechSupport_No : 1446  (77.37% of 1869  total Churn)
If no tech support ==> MORE churn.

Feature: StreamingTV
==================================================
Churn_StreamingTV_Yes : 113   ( 6.05% of 1869  total Churn)
If streaming TV ==> LESS churn.

Feature: StreamingMovies
==================================================
Churn_StreamingMovies_Yes : 113   ( 6.05% of 1869  total Churn)
If streaming movies ==> LESS churn.

Feature: Contract
==================================================
Churn_Contract_Month-to-month : 1655  (88.55% of 1869  total Churn)
If contact month-to-month ==> MORE churn.

Feature: PaperlessBilling
==================================================
Churn_PaperlessBilling_No  : 1400  (74.91% of 1869  total Churn)
If no paperless billing ==> MORE churn.

Feature: PaymentMethod
==================================================
Churn_PaymentMethod_Bank transfer (automatic) : 1071  (57.30% of 1869  total Churn)
If payment method is bank transfer ==> MORE churn.

Feature: SeniorCitizen
==================================================

SeniorCitizen_0 : 5901  (83.79% of total 7043 )
SeniorCitizen_1 : 1142  (16.21% of total 7043 )

Churn_SeniorCitizen_0 : 1393  (74.53% of 1869  total Churn and 
                               23.61% of 5901  group SeniorCitizen_0)
Churn_SeniorCitizen_1 : 476   (25.47% of 1869  total Churn and 
                               41.68% of 1142  group SeniorCitizen_1)
NOTE: There are very few senior citizens (only 16.21%)

""";


sns.pairplot(df[cols_num+["Churn"]], hue="Churn", height=5.5,diag_kind="kde")

<seaborn.axisgrid.PairGrid at 0x7ff484d5de10>


df_corr = df.corr()
df_corr.style.background_gradient()

	SeniorCitizen	Tenure	MonthlyCharges	TotalCharges
SeniorCitizen	1.000000	0.016567	0.220173	0.103006
Tenure	0.016567	1.000000	0.247900	0.826178
MonthlyCharges	0.220173	0.247900	1.000000	0.651174
TotalCharges	0.103006	0.826178	0.651174	1.000000

Data Description

Load the libraries

Useful Scripts

Load the Data

Exploratory Data Analysis (EDA)

Delete Unwanted Features

Change Column names

Unique Value Counts

Data Types

Target Distribution

Numerical Features

Binning Numerical Features

Categorical Features

Pairplots¶

Correlations¶

	customerID	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	0	Yes	No	1	No	No phone service	DSL	No	Yes	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	0	No	No	34	Yes	No	DSL	Yes	No	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
7041	8361-LTMKD	Male	1	Yes	No	4	Yes	Yes	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Mailed check	74.40	306.6	Yes
7042	3186-AJIEK	Male	0	No	No	66	Yes	No	Fiber optic	Yes	No	Yes	Yes	Yes	Yes	Two year	Yes	Bank transfer (automatic)	105.65	6844.5	No

	Feature	Type	N	Count	Unique	Zeros	ZerosPct	mean	std	min	max	25%	50%	75%
2	SeniorCitizen	int64	7043	7043	2	5901	83.79	0.16	0.37	0.00	1.00	0.00	0.00	0.00
5	tenure	int64	7043	7043	73	11	0.16	32.37	24.56	0.00	72.00	9.00	29.00	55.00
0	customerID	object	7043	7043	7043	0	0.00
1	gender	object	7043	7043	2	0	0.00
3	Partner	object	7043	7043	2	0	0.00
4	Dependents	object	7043	7043	2	0	0.00
6	PhoneService	object	7043	7043	2	0	0.00
7	MultipleLines	object	7043	7043	3	0	0.00
8	InternetService	object	7043	7043	3	0	0.00
9	OnlineSecurity	object	7043	7043	3	0	0.00
10	OnlineBackup	object	7043	7043	3	0	0.00
11	DeviceProtection	object	7043	7043	3	0	0.00
12	TechSupport	object	7043	7043	3	0	0.00
13	StreamingTV	object	7043	7043	3	0	0.00
14	StreamingMovies	object	7043	7043	3	0	0.00
15	Contract	object	7043	7043	3	0	0.00
16	PaperlessBilling	object	7043	7043	2	0	0.00
17	PaymentMethod	object	7043	7043	4	0	0.00
18	MonthlyCharges	float64	7043	7043	1585	0	0.00	64.76	30.09	18.25	118.75	35.50	70.35	89.85
19	TotalCharges	object	7043	7043	6531	0	0.00
20	Churn	object	7043	7043	2	0	0.00