References
Some models can fit data for a range of values of some parameter almost as efficiently as fitting the estimator for a single value of that parameter; scikit-learn calls these cross-validation estimators (a short comparison sketch follows this list). These models include:
linear_model.LogisticRegressionCV
------------------
RidgeCV, RidgeClassifierCV, ElasticNetCV
LarsCV, LassoCV, LassoLarsCV
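For reference, a minimal sketch (synthetic data, not the churn dataset used below) of how a cross-validation estimator such as LogisticRegressionCV replaces an explicit grid search over C:
# Illustrative only: LogisticRegressionCV sweeps the C grid with internal CV in one
# fit, roughly equivalent to wrapping LogisticRegression in GridSearchCV over C.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=500, n_features=10, random_state=0)

# cross-validation estimator: one object, internal 5-fold CV over the C grid
lrcv = LogisticRegressionCV(Cs=np.logspace(-4, 4, 10), cv=5, max_iter=1000).fit(X, y)

# equivalent (usually slower) explicit grid search
gs = GridSearchCV(LogisticRegression(max_iter=1000),
                  {'C': np.logspace(-4, 4, 10)}, cv=5).fit(X, y)

print(lrcv.C_, gs.best_params_)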
import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # install packages not pre-installed on Colab
    !pip install watermark
    !pip install scikit-plot
    print('Environment: Google Colab')
import numpy as np
import pandas as pd
import os,sys,time
import joblib
from tqdm import tqdm_notebook as tqdm
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px
# machine learning
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as skmetrics
from sklearn.utils.fixes import loguniform
import imblearn
from imblearn.over_sampling import SMOTE
# warnings
import warnings
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
imblearn 0.7.0 json 2.0.9 pandas 1.1.4 joblib 0.17.0 seaborn 0.11.0 plotly_express 0.4.1 numpy 1.19.4 autopep8 1.5.2
def show_methods(obj, ncols=4, contains=None):
    """Show the public attributes/methods of an object as a dataframe grid."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21) (1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
ser_test_ids = df_test['customerID']
target_name = 'Churn'
import plotly_express as px
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
def clean_data(dfx):
    dfx = dfx.copy()

    # from eda we see that gender has no effect
    cols_drop = ['customerID', 'gender']
    dfx = dfx.drop(cols_drop, axis=1)

    # replace values
    dic_replace = [
        {'SeniorCitizen': {0: 'No', 1: 'Yes'}},
        {'MultipleLines': {'No phone service': 'N/A'}},
        {'SeniorCitizen': {'No': 'Not_SenCit', 'Yes': 'SeniorCitizen'}},
        {'Partner': {'No': 'No_Partner', 'Yes': 'Partner'}},
        {'Dependents': {'No': 'No_Dependents', 'Yes': 'Dependents'}},
        {'PaperlessBilling': {'No': 'No_PaperlessBill', 'Yes': 'PaperlessBill'}},
        {'PhoneService': {'No': 'No_PhoneService', 'Yes': 'PhoneService'}},
        {'MultipleLines': {'No': 'No_MultiLines', 'Yes': 'MultiLines', 'N/A': 'No_PhoneService'}},
        {'InternetService': {'No': 'No_internet_service'}},
        {'OnlineSecurity': {'No': 'No_OnlineSecurity', 'Yes': 'OnlineSecurity'}},
        {'OnlineBackup': {'No': 'No_OnlineBackup', 'Yes': 'OnlineBackup'}},
        {'DeviceProtection': {'No': 'No_DeviceProtection', 'Yes': 'DeviceProtection'}},
        {'TechSupport': {'No': 'No_TechSupport', 'Yes': 'TechSupport'}},
        {'StreamingTV': {'No': 'No_StreamingTV', 'Yes': 'StreamingTV'}},
        {'StreamingMovies': {'No': 'No_StreamingMov', 'Yes': 'StreamMov'.replace('StreamMov', 'StreamingMov')}}
    ]
    for dic in dic_replace:
        dfx = dfx.replace(dic)

    # impute: TotalCharges contains blank strings for new customers
    dfx['TotalCharges'] = pd.to_numeric(dfx['TotalCharges'], errors='coerce').fillna(0)

    # combination (interaction) features
    dfx['SenCit_Dependents'] = dfx['SeniorCitizen'] + '_' + dfx['Dependents']
    dfx['SenCit_Partner'] = dfx['SeniorCitizen'] + '_' + dfx['Partner']
    dfx['SenCit_Contract'] = dfx['SeniorCitizen'] + '_' + dfx['Contract']
    dfx['SenCit_TechSupport'] = dfx['SeniorCitizen'] + '_' + dfx['TechSupport']
    dfx['SenCit_PayMeth'] = dfx['SeniorCitizen'] + '_' + dfx['PaymentMethod']
    dfx['Partner_Dependents'] = dfx['Partner'] + '_' + dfx['Dependents']

    # aggregation features
    temp = (dfx.groupby('Contract')['TotalCharges'].agg(['mean'])
            .rename({'mean': 'Contract_mean_totCharges'}, axis=1))
    dfx = pd.merge(dfx, temp, on='Contract', how='left')
    dfx['Contract_totCharges_diff'] = (dfx['TotalCharges']
                                       - dfx['Contract_mean_totCharges'])

    temp = (dfx.groupby('PaymentMethod')['MonthlyCharges'].agg(['mean'])
            .rename({'mean': 'PayMeth_mean_monthCharges'}, axis=1))
    dfx = pd.merge(dfx, temp, on='PaymentMethod', how='left')
    dfx['PayMeth_monthCharges_diff'] = (dfx['MonthlyCharges']
                                        - dfx['PayMeth_mean_monthCharges'])

    # Ordinal encoding of 'MultipleLines'
    multiLines_dict = {'No_PhoneService': 0, 'No_MultiLines': 1, 'MultiLines': 2}
    dfx['MultipleLines_Ordinal'] = dfx['MultipleLines'].map(multiLines_dict)

    # Ordinal encoding of 'InternetService' (keys must match the raw values, which use spaces)
    intServ_dict = {'No_internet_service': 0, 'DSL': 1, 'Fiber optic': 2}
    dfx['InternetService_Ordinal'] = dfx['InternetService'].map(intServ_dict)

    # Ordinal encoding of 'Contract'
    contract_dict = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
    dfx['Contract_Ordinal'] = dfx['Contract'].map(contract_dict)

    # Drop the raw columns that have been ordinal-encoded
    ordinal_drop_cols = ['MultipleLines', 'InternetService', 'Contract']
    dfx.drop(ordinal_drop_cols, axis=1, inplace=True)

    # Apply one-hot encoder to the relevant columns
    cols_ohe = ['SeniorCitizen', 'Partner', 'Dependents',
                'PaperlessBilling', 'PhoneService', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport',
                'StreamingTV', 'StreamingMovies', 'PaymentMethod',
                'SenCit_Dependents', 'Partner_Dependents', 'SenCit_Partner',
                'SenCit_Contract', 'SenCit_TechSupport', 'SenCit_PayMeth']
    enc_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    df_ohe = pd.DataFrame(enc_ohe.fit_transform(dfx[cols_ohe]))

    # Replace default column names with more descriptive ones
    df_ohe.columns = enc_ohe.get_feature_names(cols_ohe)

    # One-hot encoding removed index; put it back
    df_ohe.index = dfx.index

    # Remove categorical columns (replaced by the one-hot encoded ones)
    dfx.drop(cols_ohe, axis=1, inplace=True)

    # Add one-hot encoded columns to numerical features
    dfx = pd.concat([dfx, df_ohe], axis=1)

    # remove columns
    cols_drop = ['InternetService_Ordinal', 'Contract_Ordinal']
    dfx = dfx.drop(cols_drop, axis=1)

    # remove white spaces from column names
    dfx = dfx.rename(columns=lambda x: x.strip())
    return dfx
df_train = clean_data(df_train)
df_test = clean_data(df_test)
df_Xtrain = df_train.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0, 'Yes':1})
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytest = df_test[target_name].map({'No':0, 'Yes':1})
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
df_Xtrain.sum().sum(), ser_ytrain.sum().sum()
(26621171.299999997, 1495)
df_Xtrain.head(2)
tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 36 | 106.05 | 3834.40 | 3683.643192 | 150.756808 | 65.801934 | 40.248066 | 2 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 10 | 62.25 | 612.95 | 1370.923131 | -757.973131 | 67.564819 | -5.314819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
ser_ytrain.head(2)
0    0
1    0
Name: Churn, dtype: int64
from imblearn.over_sampling import SMOTE # smote needs sklearn 0.23.1
import sklearn
sklearn.__version__
'0.23.1'
smote = SMOTE(sampling_strategy=0.5, random_state=SEED)
df_Xtrain_smote, ser_ytrain_smote = smote.fit_resample(df_Xtrain,ser_ytrain)
# smote = SMOTE(ratio='minority', random_state=SEED)
# df_Xtrain_smote, ser_ytrain_smote = smote.fit_sample(df_Xtrain, ser_ytrain)
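A quick, illustrative check of the class balance before and after oversampling; with sampling_strategy=0.5 the minority class is grown to roughly half the size of the majority class:
# class counts before and after SMOTE
print(ser_ytrain.value_counts())
print(ser_ytrain_smote.value_counts())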
Instead of standard scaling, use a power transformer (Yeo-Johnson) for the numeric columns, since their distributions are skewed rather than uniform/normal:
sklearn.preprocessing.PowerTransformer(
method='yeo-johnson', *, standardize=True, copy=True)
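A minimal sketch (synthetic right-skewed data, not the churn columns) of the difference: StandardScaler only recenters and rescales, while the Yeo-Johnson transform also pulls a skewed distribution towards Gaussian:
# Illustrative only: synthetic lognormal data, not the churn features.
from scipy import stats
from sklearn.preprocessing import StandardScaler, PowerTransformer

rng = np.random.RandomState(SEED)
x = rng.lognormal(mean=0.0, sigma=1.0, size=(1000, 1))  # heavily right-skewed

x_std = StandardScaler().fit_transform(x)
x_yj  = PowerTransformer(method='yeo-johnson').fit_transform(x)

print('skew raw           :', stats.skew(x).round(2))
print('skew StandardScaler:', stats.skew(x_std).round(2))  # skew unchanged
print('skew Yeo-Johnson   :', stats.skew(x_yj).round(2))   # skew largely removed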
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
df_Xtrain.head()
tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 36 | 106.05 | 3834.40 | 3683.643192 | 150.756808 | 65.801934 | 40.248066 | 2 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 10 | 62.25 | 612.95 | 1370.923131 | -757.973131 | 67.564819 | -5.314819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 25 | 19.15 | 477.60 | 1370.923131 | -893.323131 | 43.792328 | -24.642328 | 1 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 7 | 20.00 | 137.60 | 1370.923131 | -1233.323131 | 67.564819 | -47.564819 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 24 | 20.30 | 459.95 | 1370.923131 | -910.973131 | 43.792328 | -23.492328 | 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
df_Xtrain.columns[df_Xtrain.apply(pd.Series.nunique)>5]
Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'Contract_totCharges_diff', 'PayMeth_monthCharges_diff'], dtype='object')
# Define the columns we wish to transform
cols_scale = ['tenure', 'MonthlyCharges', 'TotalCharges',
'Contract_totCharges_diff',
'PayMeth_monthCharges_diff']
# Scale the relevant columns
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
remainder='passthrough')
transformer.fit(df_Xtrain)
df_Xtrain_scaled = pd.DataFrame(transformer.transform(df_Xtrain))
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest))
# features: ColumnTransformer outputs the scaled columns first, then the
# passthrough columns, so assign the names in that order
cols_passthrough = [c for c in df_Xtrain.columns if c not in cols_scale]
df_Xtrain_scaled.columns = cols_scale + cols_passthrough
df_Xtest_scaled.columns = cols_scale + cols_passthrough
df_Xtrain_scaled.head()
tenure | MonthlyCharges | TotalCharges | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SenCit_Dependents_Not_SenCit_Dependents | SenCit_Dependents_Not_SenCit_No_Dependents | SenCit_Dependents_SeniorCitizen_Dependents | SenCit_Dependents_SeniorCitizen_No_Dependents | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SenCit_Partner_Not_SenCit_No_Partner | SenCit_Partner_Not_SenCit_Partner | SenCit_Partner_SeniorCitizen_No_Partner | SenCit_Partner_SeniorCitizen_Partner | SenCit_Contract_Not_SenCit_Month-to-month | SenCit_Contract_Not_SenCit_One year | SenCit_Contract_Not_SenCit_Two year | SenCit_Contract_SeniorCitizen_Month-to-month | SenCit_Contract_SeniorCitizen_One year | SenCit_Contract_SeniorCitizen_Two year | SenCit_TechSupport_Not_SenCit_No internet service | SenCit_TechSupport_Not_SenCit_No_TechSupport | SenCit_TechSupport_Not_SenCit_TechSupport | SenCit_TechSupport_SeniorCitizen_No internet service | SenCit_TechSupport_SeniorCitizen_No_TechSupport | SenCit_TechSupport_SeniorCitizen_TechSupport | SenCit_PayMeth_Not_SenCit_Bank transfer (automatic) | SenCit_PayMeth_Not_SenCit_Credit card (automatic) | SenCit_PayMeth_Not_SenCit_Electronic check | SenCit_PayMeth_Not_SenCit_Mailed check | SenCit_PayMeth_SeniorCitizen_Bank transfer (automatic) | SenCit_PayMeth_SeniorCitizen_Credit card (automatic) | SenCit_PayMeth_SeniorCitizen_Electronic check | SenCit_PayMeth_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.367334 | 1.355904 | 0.883378 | 0.329235 | 1.553040 | 3683.643192 | 65.801934 | 2.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | -0.786426 | -0.056378 | -0.528932 | -0.235047 | -0.268218 | 1370.923131 | 67.564819 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | -0.034567 | -1.531684 | -0.674473 | -0.329237 | -0.901558 | 1370.923131 | 43.792328 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | -0.999430 | -1.501041 | -1.274872 | -0.568686 | -1.626825 | 1370.923131 | 67.564819 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | -0.075469 | -1.490250 | -0.695633 | -0.341573 | -0.864635 | 1370.923131 | 43.792328 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
df_Xtrain_scaled.sum().sum(), df_Xtest_scaled.sum().sum()
(13273998.150000809, 3374471.9234355474)
df_Xtrain_scaled.isna().sum().sum(), df_Xtest_scaled.isna().sum().sum()
(0, 0)
# Scale the relevant columns
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
remainder='passthrough')
transformer.fit(df_Xtrain_smote)
df_Xtrain_smote_scaled = pd.DataFrame(transformer.transform(df_Xtrain_smote))
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest)) # Xtest is NEVER oversampled.
# features (same column-order caveat as above: scaled columns first)
df_Xtrain_smote_scaled.columns = cols_scale + cols_passthrough
df_Xtest_scaled.columns = cols_scale + cols_passthrough
LogisticRegressionCV(
*,
Cs = 10, # if an int, 10 values log-spaced between 1e-4 and 1e4
fit_intercept = True,
cv = None,
dual = False,
penalty = 'l2',
scoring = None,
solver = 'lbfgs',
tol = 0.0001,
max_iter = 100,
class_weight = None,
n_jobs = None,
verbose = 0,
refit = True,
intercept_scaling = 1.0,
multi_class = 'auto',
random_state = None,
l1_ratios = None, # only used for elasticnet with saga
)
solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’
Algorithm to use in the optimization problem.
For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas ‘liblinear’ and ‘saga’ handle L1 penalty.
‘liblinear’ might be slower in LogisticRegressionCV because it does not handle warm-starting.
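For example (illustrative, mirroring the commented-out saga/elasticnet option used later), the elastic-net penalty is only available with the saga solver and needs an l1_ratios grid, whereas lbfgs supports only L2:
# elastic-net needs solver='saga' plus l1_ratios
from sklearn.linear_model import LogisticRegressionCV
model_enet = LogisticRegressionCV(
    solver='saga',
    penalty='elasticnet',
    l1_ratios=np.linspace(0.1, 1.0, 5),  # l1_ratio: 0 = pure L2, 1 = pure L1
    Cs=10,
    max_iter=1000,
    random_state=SEED,
)
# LogisticRegressionCV(solver='lbfgs', penalty='l1')  # would raise an error at fit time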
from sklearn.linear_model import LogisticRegressionCV
# LogisticRegressionCV?
np.logspace(-2,0,num=10)
array([0.01 , 0.01668101, 0.02782559, 0.04641589, 0.07742637, 0.12915497, 0.21544347, 0.35938137, 0.59948425, 1. ])
warnings.simplefilter('ignore')
def custom_loss(y_true, y_pred):
    """Business profit: reward true positives, penalize missed churners and false alarms."""
    tn, fp, fn, tp = skmetrics.confusion_matrix(y_true, y_pred).ravel()
    loss = 400*tp - 200*fn - 100*fp
    return loss

scoring = skmetrics.make_scorer(custom_loss, greater_is_better=True)
# scoring = 'f1'
solver,penalty,l1_ratios = 'lbfgs','l2',None
# solver,penalty,l1_ratios = 'saga', 'elasticnet',np.logspace(-2,0,num=10)
Cs = [10**-i for i in range(5)]
model = LogisticRegressionCV(random_state=SEED,
scoring=scoring, # f1,roc_auc, recall
n_jobs=-1,
solver=solver, # lbfgs, saga
penalty=penalty, # l1 l2 elasticnet(only saga)
class_weight='balanced', # None
max_iter=100, # 1k gave worse result.
Cs=Cs,
l1_ratios=l1_ratios,
)
# Xtrain ==> smote + scaled
# Xtest  ==> raw (note: the model is fit on scaled features but predicts on the
#            unscaled test set here; for a consistent pipeline use df_Xtest_scaled)
model.fit(df_Xtrain_smote_scaled, ser_ytrain_smote)
ypreds_smote_scaled = model.predict(df_Xtest)
yprobs2d_smote_scaled = model.predict_proba(df_Xtest)
profit = custom_loss(ytest,ypreds_smote_scaled)
print(f"profit = {profit:,d}")
skmetrics.confusion_matrix(ytest, ypreds_smote_scaled)
profit = -74,100
array([[1030, 5], [ 372, 2]])
# Xtrain ==> orig
# Xtest ==> orig
model.fit(df_Xtrain, ser_ytrain)
ypreds_no_smote = model.predict(df_Xtest)
yprobs2d_no_smote = model.predict_proba(df_Xtest)
profit = custom_loss(ytest,ypreds_no_smote)
print(f"profit = {profit:,d}")
skmetrics.confusion_matrix(ytest, ypreds_no_smote)
profit = 82,500
array([[724, 311], [ 60, 314]])
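As a sanity check, the reported profit follows directly from this confusion matrix (tp=314, fn=60, fp=311):
# 400*tp - 200*fn - 100*fp for the run above
print(400*314 - 200*60 - 100*311)  # 82500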
show_methods(model)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | C_ | dual | multi_class | refit |
1 | Cs | fit | n_features_in_ | score |
2 | Cs_ | fit_intercept | n_iter_ | scores_ |
3 | class_weight | get_params | n_jobs | scoring |
4 | classes_ | intercept_ | penalty | set_params |
5 | coef_ | intercept_scaling | predict | solver |
6 | coefs_paths_ | l1_ratio_ | predict_log_proba | sparsify |
7 | cv | l1_ratios | predict_proba | tol |
8 | decision_function | l1_ratios_ | random_state | verbose |
9 | densify | max_iter |
print(model.scores_)
if solver == 'lbfgs':
    df_scores = pd.DataFrame(model.scores_[1]).T
    df_scores.columns = [f'fold_{i+1}' for i in range(5)]
    print(f'mean score : {df_scores.mean().mean():,.0f}')
    display(df_scores)
    display(px.box(df_scores))
{1: array([[59100, 57500, 59100, 57500, 56200],
           [62400, 62000, 62100, 61000, 60100],
           [55300, 56800, 54800, 61900, 63900],
           [54800, 55900, 55600, 55100, 54700],
           [65600, 65800, 65600, 65900, 64300]])}
mean score : 59,720
fold_1 | fold_2 | fold_3 | fold_4 | fold_5 | |
---|---|---|---|---|---|
0 | 59100 | 62400 | 55300 | 54800 | 65600 |
1 | 57500 | 62000 | 56800 | 55900 | 65800 |
2 | 59100 | 62100 | 54800 | 55600 | 65600 |
3 | 57500 | 61000 | 61900 | 55100 | 65900 |
4 | 56200 | 60100 | 63900 | 54700 | 64300 |
# choose better predictions
ypreds = ypreds_no_smote
yprobs2d = yprobs2d_no_smote
pred_name = 'lrcv'
path_pred = f'../predictions/{pred_name}.csv'
df_preds = pd.DataFrame({'customerID': ser_test_ids})
df_preds[f'ypreds_{pred_name}'] = ypreds
df_preds[f'yprobs_{pred_name}'] = yprobs2d[:,1]
df_preds.to_csv(path_pred,index=False)
df_preds.head()
customerID | ypreds_lrcv | yprobs_lrcv | |
---|---|---|---|
0 | 1794-HBQTJ | 1 | 0.713449 |
1 | 0356-OBMAC | 0 | 0.210394 |
2 | 4077-CROMM | 1 | 0.757991 |
3 | 5442-PPTJY | 0 | 0.074839 |
4 | 2333-KWEWW | 0 | 0.087670 |
def model_eval_bin(model_name, ytest, ypreds, yprobs2d, show_plots=True):
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest, ypreds)
    precision = skmetrics.precision_score(ytest, ypreds)
    recall    = skmetrics.recall_score(ytest, ypreds)
    f1        = skmetrics.f1_score(ytest, ypreds)
    auc       = skmetrics.roc_auc_score(ytest, ypreds)  # AUC from hard predictions

    print(skmetrics.classification_report(ytest, ypreds))
    print(skmetrics.confusion_matrix(ytest, ypreds))

    df_res = pd.DataFrame({'Accuracy': [acc],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1-score': [f1],
                           'AUC': [auc]}, index=[model_name])
    display(df_res.style.format("{:.4f}"))

    if not os.path.isdir('../outputs'):
        os.makedirs('../outputs')
    o = './' if ENV_COLAB else '../outputs/'
    df_res.to_csv(o + f'model_{model_name}.csv', index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest, yprobs2d)  # more focus on minority
        skpmetrics.plot_roc_curve(ytest, yprobs2d)         # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest, ypreds)

    profit = custom_loss(ytest, ypreds)
    print(f"profit = {profit:,d}")
model_eval_bin('LRCV',ytest,ypreds,yprobs2d,show_plots=True)
profit = 82,500
              precision    recall  f1-score   support

           0       0.92      0.70      0.80      1035
           1       0.50      0.84      0.63       374

    accuracy                           0.74      1409
   macro avg       0.71      0.77      0.71      1409
weighted avg       0.81      0.74      0.75      1409

[[724 311]
 [ 60 314]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
LRCV | 0.7367 | 0.5024 | 0.8396 | 0.6286 | 0.7695 |
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 13 secs