The What-If Tool (WIT) is an open-source tool from Google for inspecting and explaining machine-learning models. Here we use it to interpret a churn-classification model: we load the Telco customer churn data, engineer features, oversample with SMOTE, scale with a Yeo-Johnson power transform, fit a logistic regression, and then explore its predictions in WIT.
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install --upgrade witwidget

    # if we upgrade an already-imported module, we need to restart the colab runtime
    !pip install -U scikit-learn

    ## print
    print('Environment: Google Colaboratory.')

TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'  # only relevant when training xgboost on a GPU
import numpy as np
import pandas as pd
import xgboost
import sklearn
from sklearn import metrics as skmetrics
# model interpretation
import witwidget
from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget
SEED = 100
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-12-19

CPython 3.7.9
IPython 7.18.1

compiler   : Clang 10.0.0
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

pandas    1.1.2
numpy     1.18.5
json      2.0.9
xgboost   1.2.0
sklearn   0.23.2
autopep8  1.5.4
watermark 2.0.2
def show_methods(obj, ncols=4, contains=None):
    """Show the public attributes of an object in a dataframe grid."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
def adjustedR2(rsquared, nrows, ncols):
    # adjusted R^2 = R^2 - (1 - R^2) * p / (n - p - 1), with ncols = p + 1
    return rsquared - (ncols - 1) / (nrows - ncols) * (1 - rsquared)
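As a quick sanity check of the formula (the numbers here are made up purely for illustration):

# e.g. R^2 = 0.8 on 100 rows with ncols = 11 (10 predictors):
# 0.8 - (11 - 1)/(100 - 11) * (1 - 0.8) ≈ 0.7775
adjustedR2(0.8, 100, 11)  # ≈ 0.7775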
def print_regr_eval(ytest, ypreds, ncols):
    rmse = np.sqrt(skmetrics.mean_squared_error(ytest, ypreds))
    r2 = skmetrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = skmetrics.explained_variance_score(ytest, ypreds)
    print(f"""
             RMSE : {rmse:,.2f}
Explained Variance: {evs:.6f}
         R-Squared: {r2:,.6f}
Adjusted R-squared: {ar2:,.6f}
""")
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21)
(1409, 21)
 | customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | ... | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | ... | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | ... | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No

4 rows × 21 columns
target_name = 'Churn'
from sklearn.preprocessing import OneHotEncoder

def clean_data(dfx):
    dfx = dfx.copy()

    # from eda we see that gender has no effect
    cols_drop = ['customerID', 'gender']
    dfx = dfx.drop(cols_drop, axis=1)

    # replace values so that category labels are self-descriptive
    dic_replace = [
        {'SeniorCitizen': {0: 'No', 1: 'Yes'}},
        {'MultipleLines': {'No phone service': 'N/A'}},
        {'SeniorCitizen': {'No': 'Not_SenCit', 'Yes': 'SeniorCitizen'}},
        {'Partner': {'No': 'No_Partner', 'Yes': 'Partner'}},
        {'Dependents': {'No': 'No_Dependents', 'Yes': 'Dependents'}},
        {'PaperlessBilling': {'No': 'No_PaperlessBill', 'Yes': 'PaperlessBill'}},
        {'PhoneService': {'No': 'No_PhoneService', 'Yes': 'PhoneService'}},
        {'MultipleLines': {'No': 'No_MultiLines', 'Yes': 'MultiLines', 'N/A': 'No_PhoneService'}},
        {'InternetService': {'No': 'No_internet_service'}},
        {'OnlineSecurity': {'No': 'No_OnlineSecurity', 'Yes': 'OnlineSecurity'}},
        {'OnlineBackup': {'No': 'No_OnlineBackup', 'Yes': 'OnlineBackup'}},
        {'DeviceProtection': {'No': 'No_DeviceProtection', 'Yes': 'DeviceProtection'}},
        {'TechSupport': {'No': 'No_TechSupport', 'Yes': 'TechSupport'}},
        {'StreamingTV': {'No': 'No_StreamingTV', 'Yes': 'StreamingTV'}},
        {'StreamingMovies': {'No': 'No_StreamingMov', 'Yes': 'StreamingMov'}}
    ]
    for dic in dic_replace:
        dfx = dfx.replace(dic)

    # impute: TotalCharges contains blanks; coerce to numeric and fill with 0
    dfx['TotalCharges'] = pd.to_numeric(dfx['TotalCharges'], errors='coerce').fillna(0)

    # interaction features: concatenate pairs of categorical columns
    dfx['SenCit_Dependents'] = dfx['SeniorCitizen'] + '_' + dfx['Dependents']
    dfx['Partner_Dependents'] = dfx['Partner'] + '_' + dfx['Dependents']
    dfx['SenCit_Partner'] = dfx['SeniorCitizen'] + '_' + dfx['Partner']
    dfx['SenCit_Contract'] = dfx['SeniorCitizen'] + '_' + dfx['Contract']
    dfx['SenCit_TechSupport'] = dfx['SeniorCitizen'] + '_' + dfx['TechSupport']
    dfx['SenCit_PayMeth'] = dfx['SeniorCitizen'] + '_' + dfx['PaymentMethod']

    # aggregation features: group means and deviations from them
    temp = (dfx.groupby('Contract')['TotalCharges'].agg(['mean'])
               .rename({'mean': 'Contract_mean_totCharges'}, axis=1))
    dfx = pd.merge(dfx, temp, on='Contract', how='left')
    dfx['Contract_totCharges_diff'] = (dfx['TotalCharges']
                                       - dfx['Contract_mean_totCharges'])

    temp = (dfx.groupby('PaymentMethod')['MonthlyCharges'].agg(['mean'])
               .rename({'mean': 'PayMeth_mean_monthCharges'}, axis=1))
    dfx = pd.merge(dfx, temp, on='PaymentMethod', how='left')
    dfx['PayMeth_monthCharges_diff'] = (dfx['MonthlyCharges']
                                        - dfx['PayMeth_mean_monthCharges'])

    # Ordinal encoding of 'MultipleLines'
    multiLines_dict = {'No_PhoneService': 0, 'No_MultiLines': 1, 'MultiLines': 2}
    dfx['MultipleLines_Ordinal'] = dfx['MultipleLines'].map(multiLines_dict)

    # Ordinal encoding of 'InternetService'
    # note: the data label is 'Fiber optic' (with a space), so those rows map
    # to NaN here and the column is dropped below
    intServ_dict = {'No_internet_service': 0, 'DSL': 1, 'Fiber_optic': 2}
    dfx['InternetService_Ordinal'] = dfx['InternetService'].map(intServ_dict)

    # Ordinal encoding of 'Contract'
    # note: the data labels are 'One year'/'Two year' (with spaces), so those
    # rows map to NaN here and the column is dropped below
    contract_dict = {'Month-to-month': 0, 'One_year': 1, 'Two_year': 2}
    dfx['Contract_Ordinal'] = dfx['Contract'].map(contract_dict)

    # Drop the original columns that have been encoded
    ordinal_drop_cols = ['MultipleLines', 'InternetService', 'Contract']
    dfx.drop(ordinal_drop_cols, axis=1, inplace=True)

    # Apply one-hot encoder to the relevant columns
    cols_ohe = ['SeniorCitizen', 'Partner', 'Dependents',
                'PaperlessBilling', 'PhoneService', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport',
                'StreamingTV', 'StreamingMovies', 'PaymentMethod',
                'SenCit_Dependents', 'Partner_Dependents', 'SenCit_Partner',
                'SenCit_Contract', 'SenCit_TechSupport', 'SenCit_PayMeth']
    enc_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    df_ohe = pd.DataFrame(enc_ohe.fit_transform(dfx[cols_ohe]))

    # Replace default column names with more descriptive ones
    df_ohe.columns = enc_ohe.get_feature_names(cols_ohe)

    # One-hot encoding removed the index; put it back
    df_ohe.index = dfx.index

    # Remove categorical columns (replaced by their one-hot encodings)
    dfx.drop(cols_ohe, axis=1, inplace=True)

    # Add one-hot encoded columns to numerical features
    dfx = pd.concat([dfx, df_ohe], axis=1)

    # remove the NaN-containing ordinal columns (see notes above)
    cols_drop = ['InternetService_Ordinal', 'Contract_Ordinal']
    dfx = dfx.drop(cols_drop, axis=1)

    # remove white spaces from column names
    dfx = dfx.rename(columns=lambda x: x.strip())
    return dfx
df_train = clean_data(df_train)
df_test = clean_data(df_test)
df_Xtrain = df_train.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0, 'Yes':1})
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytest = df_test[target_name].map({'No':0, 'Yes':1})
ytest = np.array(ser_ytest).flatten()
from imblearn.over_sampling import SMOTE # smote needs sklearn 0.23.1
import sklearn
sklearn.__version__
'0.23.2'
smote = SMOTE(sampling_strategy=0.5, random_state=SEED)
df_Xtrain_smote, ser_ytrain_smote = smote.fit_resample(df_Xtrain,ser_ytrain)
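With sampling_strategy=0.5, SMOTE synthesizes new minority-class (churn) rows until the minority class is half the size of the majority class. A quick balance check (a sketch; these outputs were not recorded in the original run):

# class balance before vs. after oversampling
print(ser_ytrain.value_counts())
print(ser_ytrain_smote.value_counts())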
Instead of standard scaling, we use a power transformer (Yeo-Johnson), which is better suited to the skewed, non-uniform distributions of the charge features:

sklearn.preprocessing.PowerTransformer(
    method='yeo-johnson', *, standardize=True, copy=True)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
# Define the columns we wish to transform
cols_scale = ['tenure', 'MonthlyCharges', 'TotalCharges',
'Contract_totCharges_diff',
'PayMeth_monthCharges_diff']
# Scale the relevant columns
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
remainder='passthrough')
transformer.fit(df_Xtrain)
df_Xtrain_scaled = pd.DataFrame(transformer.transform(df_Xtrain))
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest))

# restore feature names
# caution: ColumnTransformer emits the transformed columns first and the
# passthrough columns after them, so reusing the original column order here
# assumes that order; the values themselves are unaffected
df_Xtrain_scaled.columns = df_Xtrain.columns
df_Xtest_scaled.columns = df_Xtest.columns
df_Xtrain_scaled.isna().sum().sum(), df_Xtest_scaled.isna().sum().sum()
(0, 0)
# Scale the relevant columns (refit the transformer on the oversampled training data)
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
                                remainder='passthrough')
transformer.fit(df_Xtrain_smote)

df_Xtrain_smote_scaled = pd.DataFrame(transformer.transform(df_Xtrain_smote))
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest))  # Xtest is NEVER oversampled.

# restore feature names
df_Xtrain_smote_scaled.columns = df_Xtrain.columns
df_Xtest_scaled.columns = df_Xtest.columns
from sklearn.linear_model import LogisticRegression
params_fixed = {'dual': False,
'random_state': SEED,
'n_jobs': 1
}
params_best = {'C': 0.42679058013626753, 'max_iter': 1000,
'penalty': 'l2', 'solver': 'lbfgs'}
# params_best = grid.best_params_
params = params_fixed
params.update(params_best)
model = LogisticRegression(**params)
model.fit(df_Xtrain_smote_scaled, ser_ytrain_smote)
# note: the model was fit on scaled features, so predicting on the unscaled
# df_Xtest mismatches the training preprocessing and gives poor results
# (compare with the scaled version below)
ypreds = model.predict(df_Xtest)
skmetrics.confusion_matrix(np.array(ser_ytest), ypreds)

array([[384, 651],
       [ 55, 319]])
ypreds_scaled = model.predict(df_Xtest_scaled)
skmetrics.confusion_matrix(np.array(ser_ytest), ypreds_scaled)

array([[874, 161],
       [135, 239]])
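For a fuller summary of the scaled-input predictions, sklearn's classification_report gives per-class precision, recall, and F1 (a sketch; its output was not part of the original run):

print(skmetrics.classification_report(ytest, ypreds_scaled))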
df_preds = pd.DataFrame({'ytest': ytest, 'ypreds': ypreds})  # note: uses the unscaled-input ypreds from above
pd.crosstab(df_preds['ytest'], df_preds['ypreds'], margins=True)

ytest \ ypreds | 0 | 1 | All
---|---|---|---
0 | 384 | 651 | 1035
1 | 55 | 319 | 374
All | 439 | 970 | 1409
# note: Index.difference() returns the names in sorted order, which would
# misalign the feature names with the value order of the example arrays
# built below, so we preserve the original column order instead
features = [c for c in df_Xtrain_smote.columns if c != target_name]
features_all = features + [target_name]
import witwidget
from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget
def custom_predict_fn(lst):
    # WIT passes a list of examples; rebuild a dataframe with the feature names
    testing_data = pd.DataFrame(lst, columns=features)
    return model.predict_proba(testing_data)
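WIT feeds the raw (unscaled) examples to this function, while the model was fit on Yeo-Johnson-scaled features. A variant that first runs the examples through the fitted transformer would keep the preprocessing consistent (a sketch; custom_predict_fn_scaled is not in the original notebook):

def custom_predict_fn_scaled(lst):
    # hypothetical variant: scale WIT's raw examples with the fitted
    # ColumnTransformer so they match the model's training preprocessing
    df_tmp = pd.DataFrame(lst, columns=features)
    return model.predict_proba(transformer.transform(df_tmp))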
# arr_examples.shape, df_Xtest.shape
N = 100            # quick option: visualize only the first 100 examples
N = len(df_Xtest)  # here we visualize all the test examples
HEIGHT = 1000

arr_examples = np.c_[df_Xtest.to_numpy(), ytest][:N]
lst_examples = arr_examples.tolist()

config_builder = WitConfigBuilder(lst_examples, features_all)
config_builder.set_target_feature(target_name)
config_builder.set_custom_predict_fn(custom_predict_fn)
config_builder.set_model_type('classification')

WitWidget(config_builder, height=HEIGHT)