import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
# update modules
!pip uninstall xgboost
!pip install -U xgboost
print('Environment: Google Colab')
sys.path.append('../utils/')
from hyperband_search import HyperbandSearchCV
import numpy as np
import pandas as pd
import os,sys,time
import joblib
from tqdm import tqdm_notebook as tqdm
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px
# machine learning
from sklearn.preprocessing import OneHotEncoder
import imblearn
from imblearn.over_sampling import SMOTE
import sklearn.metrics as skmetrics
# warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark seaborn 0.11.0 imblearn 0.7.0 autopep8 1.5.2 plotly_express 0.4.1 numpy 1.19.4 json 2.0.9 pandas 1.1.4 joblib 0.17.0 sklearn 0.23.1
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21) (1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
ser_test_ids = df_test['customerID']
target_name = 'Churn'
import plotly_express as px
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
def clean_data(dfx):
dfx = dfx.copy()
# from eda we see that gender has no effect
cols_drop = ['customerID','gender']
dfx = dfx.drop(cols_drop,axis=1)
# replace values
dic_replace = [
{'SeniorCitizen': {0:'No', 1:'Yes'}},
{'MultipleLines': {'No phone service':'N/A'}},
{'SeniorCitizen': {'No':'Not_SenCit', 'Yes':'SeniorCitizen'}},
{'Partner': {'No':'No_Partner', 'Yes':'Partner'}},
{'Dependents': {'No':'No_Dependents', 'Yes':'Dependents'}},
{'PaperlessBilling': {'No':'No_PaperlessBill', 'Yes':'PaperlessBill'}},
{'PhoneService': {'No':'No_PhoneService', 'Yes':'PhoneService'}},
{'MultipleLines': {'No':'No_MultiLines', 'Yes':'MultiLines', 'N/A': 'No_PhoneService'}},
{'InternetService': {'No':'No_internet_service'}},
{'OnlineSecurity': {'No':'No_OnlineSecurity', 'Yes':'OnlineSecurity'}},
{'OnlineBackup': {'No':'No_OnlineBackup', 'Yes':'OnlineBackup'}},
{'DeviceProtection': {'No':'No_DeviceProtection', 'Yes':'DeviceProtection'}},
{'TechSupport': {'No':'No_TechSupport', 'Yes':'TechSupport'}},
{'StreamingTV': {'No':'No_StreamingTV', 'Yes':'StreamingTV'}},
{'StreamingMovies': {'No':'No_StreamingMov', 'Yes':'StreamingMov'}}
]
for dic in dic_replace:
dfx = dfx.replace(dic)
# impute
dfx['TotalCharges'] = pd.to_numeric(dfx['TotalCharges'],errors='coerce').fillna(0)
# sum of features
dfx['SenCit_Dependents'] = dfx['SeniorCitizen'] + '_' + dfx['Dependents']
dfx['SenCit_Partner'] = dfx['SeniorCitizen'] + '_' + dfx['Partner']
dfx['SenCit_Contract'] = dfx['SeniorCitizen'] + '_' + dfx['Contract']
dfx['SenCit_TechSupport'] = dfx['SeniorCitizen'] + '_' + dfx['TechSupport']
dfx['SenCit_PayMeth'] = dfx['SeniorCitizen'] + '_' + dfx['PaymentMethod']
dfx['Partner_Dependents'] = dfx['Partner'] + '_' + dfx['Dependents']
# aggration features
temp = (dfx.groupby('Contract')['TotalCharges'].agg(['mean'])
.rename({'mean':'Contract_mean_totCharges'},axis=1))
dfx = pd.merge(dfx, temp, on='Contract', how='left')
dfx['Contract_totCharges_diff'] = (dfx['TotalCharges']
- dfx['Contract_mean_totCharges'])
temp = (dfx.groupby('PaymentMethod')['MonthlyCharges'].agg(['mean'])
.rename({'mean':'PayMeth_mean_monthCharges'},axis=1))
dfx = pd.merge(dfx, temp, on='PaymentMethod', how='left')
dfx['PayMeth_monthCharges_diff'] = (dfx['MonthlyCharges']
- dfx['PayMeth_mean_monthCharges'])
multiLines_dict = {'No_PhoneService':0, 'No_MultiLines':1, 'MultiLines':2}
dfx['MultipleLines_Ordinal'] = dfx['MultipleLines'].map(multiLines_dict)
# Ordinal encoding of 'InternetService'
intServ_dict = {'No_internet_service':0, 'DSL':1, 'Fiber_optic':2}
dfx['InternetService_Ordinal'] = dfx['InternetService'].map(intServ_dict)
# Ordinal encoding of 'Contract'
contract_dict = {'Month-to-month':0, 'One_year':1, 'Two_year':2}
dfx['Contract_Ordinal'] = dfx['Contract'].map(contract_dict)
# Drop unnecessary columns that have been encoded
ordinal_drop_cols = ['MultipleLines', 'InternetService', 'Contract']
dfx.drop(ordinal_drop_cols, axis=1, inplace=True)
# Apply one-hot encoder to the relevant columns
cols_ohe = ['SeniorCitizen', 'Partner', 'Dependents',
'PaperlessBilling', 'PhoneService', 'OnlineSecurity',
'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'PaymentMethod',
'SenCit_Dependents', 'Partner_Dependents', 'SenCit_Partner',
'SenCit_Contract', 'SenCit_TechSupport', 'SenCit_PayMeth']
enc_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
df_ohe = pd.DataFrame(enc_ohe.fit_transform(dfx[cols_ohe]))
# Replace default column names with more descriptive ones
df_ohe.columns = enc_ohe.get_feature_names(cols_ohe)
# One-hot encoding removed index; put it back
df_ohe.index = dfx.index
# Remove categorical columns (will replace with one-hot encoding)
dfx.drop(cols_ohe, axis=1, inplace=True)
# Add one-hot encoded columns to numerical features
dfx = pd.concat([dfx, df_ohe], axis=1)
# remove columns
cols_drop = ['InternetService_Ordinal','Contract_Ordinal']
dfx = dfx.drop(cols_drop, axis=1)
# remove white spaces from column names
dfx = dfx.rename(columns=lambda x: x.strip())
return dfx
df_train = clean_data(df_train)
df_test = clean_data(df_test)
df_Xtrain = df_train.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0, 'Yes':1})
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytest = df_test[target_name].map({'No':0, 'Yes':1})
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
df_Xtrain.sum().sum(), ser_ytrain.sum().sum()
(26621171.299999997, 1495)
df_Xtrain.head(2)
tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 36 | 106.05 | 3834.40 | 3683.643192 | 150.756808 | 65.801934 | 40.248066 | 2 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 10 | 62.25 | 612.95 | 1370.923131 | -757.973131 | 67.564819 | -5.314819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
ser_ytrain.head(2)
0 0 1 0 Name: Churn, dtype: int64
from imblearn.over_sampling import SMOTE # smote needs sklearn 0.23.1
import sklearn
sklearn.__version__
'0.23.1'
smote = SMOTE(sampling_strategy=0.5, random_state=SEED)
df_Xtrain_smote, ser_ytrain_smote = smote.fit_resample(df_Xtrain,ser_ytrain)
# smote = SMOTE(ratio='minority', random_state=SEED)
# df_Xtrain_smote, ser_ytrain_smote = smote.fit_sample(df_Xtrain, ser_ytrain)
Instead of Standard scaling use power transformer (yeo-johnson) for not-uniform distribution
sklearn.preprocessing.PowerTransformer(
method='yeo-johnson', *, standardize=True, copy=True)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
df_Xtrain.head()
tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 36 | 106.05 | 3834.40 | 3683.643192 | 150.756808 | 65.801934 | 40.248066 | 2 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 10 | 62.25 | 612.95 | 1370.923131 | -757.973131 | 67.564819 | -5.314819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 25 | 19.15 | 477.60 | 1370.923131 | -893.323131 | 43.792328 | -24.642328 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 7 | 20.00 | 137.60 | 1370.923131 | -1233.323131 | 67.564819 | -47.564819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 24 | 20.30 | 459.95 | 1370.923131 | -910.973131 | 43.792328 | -23.492328 | 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
df_Xtrain.columns[df_Xtrain.apply(pd.Series.nunique)>5]
Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'Contract_totCharges_diff', 'PayMeth_monthCharges_diff'], dtype='object')
# Define the columns we wish to transform
cols_scale = ['tenure', 'MonthlyCharges', 'TotalCharges',
'Contract_totCharges_diff',
'PayMeth_monthCharges_diff']
# Scale the relevant columns
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
remainder='passthrough')
transformer.fit(df_Xtrain)
df_Xtrain_scaled = pd.DataFrame(transformer.transform(df_Xtrain))
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest))
#features
df_Xtrain_scaled.columns = df_Xtrain.columns
df_Xtest_scaled.columns = df_Xtest.columns
df_Xtrain_scaled.head()
tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.367334 | 1.355904 | 0.883378 | 0.329235 | 1.553040 | 3683.643192 | 65.801934 | 2.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | -0.786426 | -0.056378 | -0.528932 | -0.235047 | -0.268218 | 1370.923131 | 67.564819 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | -0.034567 | -1.531684 | -0.674473 | -0.329237 | -0.901558 | 1370.923131 | 43.792328 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | -0.999430 | -1.501041 | -1.274872 | -0.568686 | -1.626825 | 1370.923131 | 67.564819 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | -0.075469 | -1.490250 | -0.695633 | -0.341573 | -0.864635 | 1370.923131 | 43.792328 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
df_Xtrain_scaled.sum().sum(), df_Xtest_scaled.sum().sum()
(13273998.150000809, 3374471.9234355474)
df_Xtrain_scaled.isna().sum().sum(), df_Xtest_scaled.isna().sum().sum()
(0, 0)
# Scale the relevant columns
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
remainder='passthrough')
transformer.fit(df_Xtrain_smote)
df_Xtrain_smote_scaled = pd.DataFrame(transformer.transform(df_Xtrain_smote))
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest)) # Xtest is NEVER oversampled.
#features
df_Xtrain_smote_scaled.columns = df_Xtrain.columns
df_Xtest_scaled.columns = df_Xtest.columns
LogisticRegression(
penalty='l2', # 'l1', 'l2', 'elasticnet', 'none'}, default='l2'
*,
dual = False,
tol = 0.0001, # default=1e-4
C = 1.0,
fit_intercept = True,
intercept_scaling = 1,
class_weight = None, # dict or 'balanced', default=None
random_state = None,
solver = 'lbfgs', # 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'
max_iter = 100,
multi_class = 'auto',
verbose = 0,
warm_start = False,
n_jobs = None,
l1_ratio = None,
)
solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'
Algorithm to use in the optimization problem.
- For small datasets, 'liblinear' is a good choice, whereas 'sag' and
'saga' are faster for large ones.
- For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs'
handle multinomial loss; 'liblinear' is limited to one-versus-rest
schemes.
- 'newton-cg', 'lbfgs', 'sag' and 'saga' handle L2 or no penalty
- 'liblinear' and 'saga' also handle L1 penalty
- 'saga' also supports 'elasticnet' penalty
- 'liblinear' does not support setting ``penalty='none'``
Note that 'sag' and 'saga' fast convergence is only guaranteed on
features with approximately the same scale. You can
preprocess the data with a scaler from sklearn.preprocessing.
from sklearn.linear_model import LogisticRegression
# LogisticRegression?
model = LogisticRegression(random_state=SEED)
model.fit(df_Xtrain_smote_scaled, ser_ytrain_smote)
ypreds = model.predict(df_Xtest)
skmetrics.confusion_matrix(np.array(ser_ytest), ypreds)
array([[1035, 0], [ 374, 0]])
# help(HyperbandSearchCV)
X = df_Xtrain_smote_scaled
y = ser_ytrain_smote
X = df_Xtrain_scaled
y = ser_ytrain
show_methods(sklearn.exceptions)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | ChangedBehaviorWarning | DataDimensionalityWarning | NonBLASDotWarning | SkipTestWarning |
1 | ConvergenceWarning | EfficiencyWarning | NotFittedError | UndefinedMetricWarning |
2 | DataConversionWarning | FitFailedWarning | PositiveSpectrumWarning |
%%time
import scipy.stats as stats
from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
from scipy.optimize.linesearch import LineSearchWarning
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
# Define our model
params_fixed = {'dual': False,
'random_state': SEED,
'n_jobs': 1
}
params_hyp = {
'max_iter': stats.randint(200,500),
'solver' : ['lbfgs', 'sag', 'newton-cg'],
'penalty' : ['l2'],
'C' : stats.loguniform(0.01, 10),
}
# This gives
# [[384 651]
# [ 55 319]]
# params_hyp = {
# 'max_iter' : stats.randint(200,500),
# 'solver' : ['saga'], # newton-cg only support l2
# 'penalty' : ['l1','l2','elasticnet'],
# 'C' : stats.loguniform(0.01, 10),
# 'l1_ratio' : stats.loguniform(0.01, 1), # must be <= 1.0
# 'class_weight' : [ None, 'balanced'],
# }
# This gives much worse result
# array([[901, 134],
# [182, 192]])
# # Wall time: 6min 30s
model = LogisticRegression(**params_fixed)
# Perform Hyperparameter Tuning
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=SEED)
grid = HyperbandSearchCV(model,params_hyp,
resource_param = 'max_iter',
min_iter = 200,
max_iter = 1000,
cv = cv,
scoring = 'roc_auc',
refit = True,
verbose = 0,
random_state = SEED
)
# comment these after run
# grid.fit(X, y)
# print('Best parameters: ', grid.best_params_)
# print('AUC - ROC score: ', grid.best_score_)
# Wall time: 6min 30s
CPU times: user 2.23 ms, sys: 899 µs, total: 3.13 ms Wall time: 3.49 ms
params_best = {'C': 0.42679058013626753, 'max_iter': 1000,
'penalty': 'l2', 'solver': 'lbfgs'}
# params_best = grid.best_params_
params = params_fixed
params.update(params_best)
model = LogisticRegression(**params)
# Xtrain ==> smote + scaled
# Xtest ==> Xtest
model.fit(df_Xtrain_smote_scaled, ser_ytrain_smote)
ypreds_smote_scaled = model.predict(df_Xtest)
yprobs2d_smote_scaled = model.predict_proba(df_Xtest)
skmetrics.confusion_matrix(ytest, ypreds_smote_scaled)
array([[384, 651], [ 55, 319]])
# Xtrain ==> orig
# Xtest ==> orig
model.fit(df_Xtrain, ser_ytrain)
ypreds_no_smote = model.predict(df_Xtest)
yprobs_no_smote = model.predict_proba(df_Xtest)
skmetrics.confusion_matrix(ytest, ypreds_no_smote)
array([[905, 130], [166, 208]])
# choose better predictions
ypreds = ypreds_smote_scaled
yprobs2d = yprobs2d_smote_scaled
yprobs2d[:5]
array([[0.00000000e+00, 1.00000000e+00], [4.33068154e-02, 9.56693185e-01], [1.17439347e-09, 9.99999999e-01], [0.00000000e+00, 1.00000000e+00], [0.00000000e+00, 1.00000000e+00]])
pred_name = 'lr'
path_pred = f'../predictions/{pred_name}.csv'
df_preds = pd.DataFrame({'customerID': ser_test_ids})
df_preds[f'ypreds_{pred_name}'] = ypreds
df_preds[f'yprobs_{pred_name}'] = yprobs2d[:,1]
df_preds.to_csv(path_pred,index=False)
df_preds.head()
customerID | ypreds_lr | yprobs_lr | |
---|---|---|---|
0 | 1794-HBQTJ | 1 | 1.000000 |
1 | 0356-OBMAC | 1 | 0.956693 |
2 | 4077-CROMM | 1 | 1.000000 |
3 | 5442-PPTJY | 1 | 1.000000 |
4 | 2333-KWEWW | 1 | 1.000000 |
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
import os
acc = skmetrics.accuracy_score(ytest,ypreds)
precision = skmetrics.precision_score(ytest,ypreds)
recall = skmetrics.recall_score(ytest,ypreds)
f1 = skmetrics.f1_score(ytest,ypreds)
auc = skmetrics.roc_auc_score(ytest,ypreds)
print(skmetrics.classification_report(ytest,ypreds))
print(skmetrics.confusion_matrix(ytest,ypreds))
df_res = pd.DataFrame({'Accuracy':[acc],
'Precision': [precision],
'Recall': [recall],
'F1-score': [f1],
'AUC': [auc]},index=[model_name])
display(df_res.style.format("{:.4f}"))
if not os.path.isdir('../outputs'):
os.makedirs('../outputs')
o = '.' if ENV_COLAB else '../outputs/'
df_res.to_csv(o+f'model_{model_name}.csv',index=True)
skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
skpmetrics.plot_confusion_matrix(ytest,ypreds)
model_eval_bin('LR',ytest,ypreds,yprobs2d,show_plots=True)
precision recall f1-score support 0 0.87 0.37 0.52 1035 1 0.33 0.85 0.47 374 accuracy 0.50 1409 macro avg 0.60 0.61 0.50 1409 weighted avg 0.73 0.50 0.51 1409 [[384 651] [ 55 319]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
LR | 0.4989 | 0.3289 | 0.8529 | 0.4747 | 0.6120 |
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 9 secs