The data is taken from kaggle competition Telco Customer Churn. The main aim of the project is to predict whether a customer will leave (churn) the company or not based on given attributes. This is a binary classification problem and one of the most important usage of machine learning in business world.
Content
Each row represents a customer, each column contains customer’s attributes described on the column Metadata.
The data set includes information about:
References
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import plotly_express as px
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
json 2.0.9 plotly_express 0.4.1 pandas 1.1.0 autopep8 1.5.2 numpy 1.18.4 plotly 4.9.0 seaborn 0.11.0
# local imports
module_path = "/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/"
sys.path.append(module_path)
from pandas_api import BPAccessor
from plot_utils import add_text_barplot
from plot_plotly import plotly_cat_binn_zero_one
from plot_plotly import plotly_pieplots
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
path_raw = '../data/raw/telco_customer_churn.csv'
df = pd.read_csv(path_raw)
print(df.shape)
df.head(2).append(df.tail(2))
(7043, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | No | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
target_name = 'Churn'
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
df.bp.describe()
Feature | Type | N | Count | Unique | Missing | MissingPct | Zeros | ZerosPct | mean | std | min | max | 25% | 50% | 75% | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | SeniorCitizen | int64 | 7043 | 7043 | 2 | 0 | 0.00 | 5901 | 83.79 | 0.16 | 0.37 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 |
5 | tenure | int64 | 7043 | 7043 | 73 | 0 | 0.00 | 11 | 0.16 | 32.37 | 24.56 | 0.00 | 72.00 | 9.00 | 29.00 | 55.00 |
0 | customerID | object | 7043 | 7043 | 7043 | 0 | 0.00 | 0 | 0.00 | |||||||
1 | gender | object | 7043 | 7043 | 2 | 0 | 0.00 | 0 | 0.00 | |||||||
3 | Partner | object | 7043 | 7043 | 2 | 0 | 0.00 | 0 | 0.00 | |||||||
4 | Dependents | object | 7043 | 7043 | 2 | 0 | 0.00 | 0 | 0.00 | |||||||
6 | PhoneService | object | 7043 | 7043 | 2 | 0 | 0.00 | 0 | 0.00 | |||||||
7 | MultipleLines | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
8 | InternetService | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
9 | OnlineSecurity | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
10 | OnlineBackup | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
11 | DeviceProtection | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
12 | TechSupport | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
13 | StreamingTV | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
14 | StreamingMovies | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
15 | Contract | object | 7043 | 7043 | 3 | 0 | 0.00 | 0 | 0.00 | |||||||
16 | PaperlessBilling | object | 7043 | 7043 | 2 | 0 | 0.00 | 0 | 0.00 | |||||||
17 | PaymentMethod | object | 7043 | 7043 | 4 | 0 | 0.00 | 0 | 0.00 | |||||||
18 | MonthlyCharges | float64 | 7043 | 7043 | 1585 | 0 | 0.00 | 0 | 0.00 | 64.76 | 30.09 | 18.25 | 118.75 | 35.50 | 70.35 | 89.85 |
19 | TotalCharges | object | 7043 | 7043 | 6531 | 0 | 0.00 | 0 | 0.00 | |||||||
20 | Churn | object | 7043 | 7043 | 2 | 0 | 0.00 | 0 | 0.00 |
"""
Look at the feature with datatype object,
the feature TotalCharges should be float but it is object.
""";
cols_drop = ['customerID']
df = df.drop(cols_drop,axis=1)
We have all the column names as CamelCase except gender and tenure
, change them to CamelCase.
df = df.rename(columns=lambda x: x[0].capitalize()+x[1:].strip())
df.head()
Gender | SeniorCitizen | Partner | Dependents | Tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
cols_obj = df.select_dtypes('object').columns.tolist()
print(cols_obj)
['Gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']
# exclude total charges
cols_obj2 = [i for i in cols_obj if i not in ['TotalCharges']]
df[cols_obj2].apply(pd.Series.nunique).sort_values()
Gender 2 Partner 2 Dependents 2 PhoneService 2 PaperlessBilling 2 Churn 2 MultipleLines 3 InternetService 3 OnlineSecurity 3 OnlineBackup 3 DeviceProtection 3 TechSupport 3 StreamingTV 3 StreamingMovies 3 Contract 3 PaymentMethod 4 dtype: int64
df[cols_obj2].apply(lambda x: list(pd.Series.unique(x)))
Gender [Female, Male] Partner [Yes, No] Dependents [No, Yes] PhoneService [No, Yes] MultipleLines [No phone service, No, Yes] InternetService [DSL, Fiber optic, No] OnlineSecurity [No, Yes, No internet service] OnlineBackup [Yes, No, No internet service] DeviceProtection [No, Yes, No internet service] TechSupport [No, Yes, No internet service] StreamingTV [No, Yes, No internet service] StreamingMovies [No, Yes, No internet service] Contract [Month-to-month, One year, Two year] PaperlessBilling [Yes, No] PaymentMethod [Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)] Churn [No, Yes] dtype: object
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')
df[df.TotalCharges.isna()].shape # bad rows
(11, 20)
df['TotalCharges'] = df['TotalCharges'].fillna(0)
# these people did not pay any charges.
target_name = 'Churn'
fig = px.histogram(df, x=target_name,
color=target_name,
width=600, height=400,
)
fig.show()
from plot_plotly import plotly_cat_binn_zero_one
plotly_cat_binn_zero_one(df,'SeniorCitizen',target_name,
zero='No',one='Yes',name='Churn')
from plot_plotly import plotly_pieplots
cols_bkg = ['Gender','SeniorCitizen','Partner','Dependents']
plotly_pieplots(df,cols_bkg,2,2,title='Customer Background')
cols_comm = ['Contract','PaperlessBilling','PaymentMethod']
plotly_pieplots(df,cols_comm,1,3,height=400,title='Commercial Features')
cols_service = ['PhoneService','MultipleLines','InternetService',
'OnlineSecurity','OnlineBackup','DeviceProtection',
'TechSupport','StreamingTV','StreamingMovies']
plotly_pieplots(df,cols_service,3,3,title='Service Features')