import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot

    # special
    !pip install featuretools[complete]

    # HPO: hyperband search
    !git clone https://github.com/thuijskens/scikit-hyperband.git
    sys.path.append('scikit-hyperband/hyperband')

    # update modules
    !pip uninstall -y xgboost
    !pip install -U xgboost

    print('Environment: Google Colab')
from hyperband_search import HyperbandSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as skmetrics
# special
import imblearn
import featuretools as ft
# warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
2020-12-23 18:52:18,691 featuretools - WARNING  Featuretools failed to load plugin nlp_primitives from library nlp_primitives. For a full stack trace, set logging to debug.
numpy          : 1.19.4
pandas         : 1.1.5
seaborn        : 0.11.0
sklearn        : 0.23.2
joblib         : 1.0.0
imblearn       : 0.7.0
autopep8       : 1.5.4
featuretools   : 0.22.0
sys            : 3.8.5 (default, Sep  4 2020, 02:22:02) [Clang 10.0.0 ]
plotly_express : 0.4.1
json           : 2.0.9
matplotlib     : 3.3.3
def show_methods(obj, ncols=4, contains=None):
    """Show the public attributes/methods of an object as a dataframe grid."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
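As a quick illustration (not part of the original run), the helper can be used to list, for example, the scoring functions in sklearn.metrics:
# illustrative usage of show_methods; skmetrics is sklearn.metrics imported above
show_methods(skmetrics, ncols=4, contains='score')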
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'

df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21)
(1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
ser_test_ids = df_test['customerID']
target_name = 'Churn'
import plotly_express as px
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,width=300,height=200)
df_train['customerID'].nunique() == len(df_train)
True
def clean_data(dfx):
    dfx = dfx.copy()

    # keep customerID as the index feature;
    # from the EDA above, gender has no visible effect on churn
    cols_drop = ['gender']
    dfx = dfx.drop(cols_drop, axis=1)

    # TotalCharges contains blank strings; coerce to numeric and impute with 0
    dfx['TotalCharges'] = pd.to_numeric(dfx['TotalCharges'],
                                        errors='coerce').fillna(0)
    return dfx
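Before applying the cleaner, it helps to know how many raw TotalCharges entries are non-numeric; a small sanity-check sketch (not part of the original run):
# illustrative check: blank/non-numeric TotalCharges become NaN under errors='coerce'
# and are then imputed with 0 inside clean_data
n_bad = pd.to_numeric(df_train['TotalCharges'], errors='coerce').isna().sum()
print(f'Non-numeric TotalCharges values in train: {n_bad}')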
df_train = clean_data(df_train)
df_test = clean_data(df_test)
df_Xtrain = df_train.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0, 'Yes':1})
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytest = df_test[target_name].map({'No':0, 'Yes':1})
from featuretools import variable_types as vtypes
show_methods(vtypes)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | Boolean | FilePath | PandasTypes | api |
1 | Categorical | FullName | PhoneNumber | camel_to_snake |
2 | ClassNameDescriptor | IPAddress | SubRegionCode | find_variable_types |
3 | CountryCode | Id | Text | graph_variable_types |
4 | DEFAULT_DTYPE_VALUES | Index | TimeIndex | list_variable_types |
5 | DateOfBirth | LatLong | Timedelta | np |
6 | Datetime | NaturalLanguage | URL | pd |
7 | DatetimeTimeIndex | Numeric | Unknown | utils |
8 | Discrete | NumericTimeIndex | Variable | variable |
9 | EmailAddress | Ordinal | ZIPCode | warnings |
df_Xtrain.head(2)
customerID | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
1 | 7143-BQIBA | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
# df_train[cols_obj].apply(lambda x: pd.Series.unique(x))
cols_obj = df_train.select_dtypes('object').columns.tolist()
df_train[cols_obj].apply(lambda x: pd.Series.nunique(x)).sort_values()
Partner                2
Dependents             2
PhoneService           2
PaperlessBilling       2
Churn                  2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaymentMethod          4
customerID          5634
dtype: int64
# customerID will be used as the index.
# The other object columns have very low cardinality, so either ordinal or one-hot encoding works.
cols_cat = [i for i in cols_obj if i not in ['customerID',target_name]]
print(cols_cat)
['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
cols_num = df_train.select_dtypes('number').columns.tolist()
cols_num
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
cols_num = [i for i in cols_num if i not in ['SeniorCitizen']]
cols_cat += ['SeniorCitizen']
features = cols_cat + cols_num
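SeniorCitizen is stored as an integer but is really a 0/1 flag, which is why it is moved from the numeric list to the categorical list above; a quick illustrative check:
# SeniorCitizen only takes the values 0 and 1, so it is treated as categorical
df_train['SeniorCitizen'].value_counts()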
def get_fm(dfx, cols_num=cols_num, cols_cat=cols_cat,
           index='customerID'):
    """Build a feature matrix from a dataframe using featuretools DFS."""
    dic_cat = {i: vtypes.Categorical for i in cols_cat}
    dic_num = {i: vtypes.Numeric for i in cols_num}
    all_variable_types = {**dic_cat, **dic_num}

    es = ft.EntitySet("data")
    es.entity_from_dataframe(entity_id="data",
                             dataframe=dfx,
                             index=index,
                             time_index=None,
                             variable_types=all_variable_types)

    new_entity_id = "SeniorCitizen"
    es.normalize_entity(base_entity_id="data",
                        new_entity_id=new_entity_id,
                        index=new_entity_id)

    # Adding this gave a worse result:
    # new_entity_id = "Dependents"
    # es.normalize_entity(base_entity_id="data",
    #                     new_entity_id=new_entity_id,
    #                     index=new_entity_id)

    trans_primitives = [
        'divide_numeric',  # pairwise ratios of all numeric features (not other types)
    ]

    feature_matrix, features = ft.dfs(entityset=es,
                                      target_entity="data",
                                      trans_primitives=trans_primitives,
                                      drop_exact=[],
                                      verbose=True)

    df_out = feature_matrix
    cols_cat = list(df_out.select_dtypes('object').columns)
    df_out = pd.get_dummies(df_out, columns=cols_cat, drop_first=False)
    return df_out
df_Xtrain_new = get_fm(df_Xtrain)
df_Xtrain_new.head(2)
Built 71 features Elapsed: 00:00 | Progress: 100%|██████████
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | MonthlyCharges / TotalCharges | MonthlyCharges / tenure | TotalCharges / MonthlyCharges | TotalCharges / tenure | tenure / MonthlyCharges | tenure / TotalCharges | SeniorCitizen.COUNT(data) | SeniorCitizen.MAX(data.MonthlyCharges) | SeniorCitizen.MAX(data.TotalCharges) | SeniorCitizen.MAX(data.tenure) | SeniorCitizen.MEAN(data.MonthlyCharges) | SeniorCitizen.MEAN(data.TotalCharges) | SeniorCitizen.MEAN(data.tenure) | SeniorCitizen.MIN(data.MonthlyCharges) | SeniorCitizen.MIN(data.TotalCharges) | SeniorCitizen.MIN(data.tenure) | SeniorCitizen.NUM_UNIQUE(data.Contract) | SeniorCitizen.NUM_UNIQUE(data.Dependents) | SeniorCitizen.NUM_UNIQUE(data.DeviceProtection) | SeniorCitizen.NUM_UNIQUE(data.InternetService) | SeniorCitizen.NUM_UNIQUE(data.MultipleLines) | SeniorCitizen.NUM_UNIQUE(data.OnlineBackup) | SeniorCitizen.NUM_UNIQUE(data.OnlineSecurity) | SeniorCitizen.NUM_UNIQUE(data.PaperlessBilling) | SeniorCitizen.NUM_UNIQUE(data.Partner) | SeniorCitizen.NUM_UNIQUE(data.PaymentMethod) | SeniorCitizen.NUM_UNIQUE(data.PhoneService) | SeniorCitizen.NUM_UNIQUE(data.StreamingMovies) | SeniorCitizen.NUM_UNIQUE(data.StreamingTV) | SeniorCitizen.NUM_UNIQUE(data.TechSupport) | SeniorCitizen.SKEW(data.MonthlyCharges) | SeniorCitizen.SKEW(data.TotalCharges) | SeniorCitizen.SKEW(data.tenure) | SeniorCitizen.STD(data.MonthlyCharges) | SeniorCitizen.STD(data.TotalCharges) | SeniorCitizen.STD(data.tenure) | SeniorCitizen.SUM(data.MonthlyCharges) | SeniorCitizen.SUM(data.TotalCharges) | SeniorCitizen.SUM(data.tenure) | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen.MODE(data.Contract)_Month-to-month | SeniorCitizen.MODE(data.Dependents)_No | SeniorCitizen.MODE(data.DeviceProtection)_No | SeniorCitizen.MODE(data.InternetService)_Fiber optic | SeniorCitizen.MODE(data.MultipleLines)_No | SeniorCitizen.MODE(data.MultipleLines)_Yes | SeniorCitizen.MODE(data.OnlineBackup)_No | SeniorCitizen.MODE(data.OnlineSecurity)_No | SeniorCitizen.MODE(data.PaperlessBilling)_Yes | SeniorCitizen.MODE(data.Partner)_No | SeniorCitizen.MODE(data.PaymentMethod)_Electronic check | SeniorCitizen.MODE(data.PhoneService)_Yes | SeniorCitizen.MODE(data.StreamingMovies)_No | SeniorCitizen.MODE(data.StreamingMovies)_Yes | SeniorCitizen.MODE(data.StreamingTV)_No | SeniorCitizen.MODE(data.StreamingTV)_Yes | SeniorCitizen.MODE(data.TechSupport)_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
customerID | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1621-YNCJH | 0 | 36 | 106.05 | 3834.40 | 0.027658 | 2.945833 | 36.156530 | 106.511111 | 0.339463 | 0.009389 | 4743 | 118.75 | 8672.45 | 72 | 61.71498 | 2166.606304 | 32.078431 | 18.25 | 0.0 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.088151 | 1.037776 | 0.255781 | 30.247656 | 2221.937336 | 24.622527 | 292714.15 | 10276213.7 | 152148 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
7143-BQIBA | 0 | 10 | 62.25 | 612.95 | 0.101558 | 6.225000 | 9.846586 | 61.295000 | 0.160643 | 0.016315 | 4743 | 118.75 | 8672.45 | 72 | 61.71498 | 2166.606304 | 32.078431 | 18.25 | 0.0 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.088151 | 1.037776 | 0.255781 | 30.247656 | 2221.937336 | 24.622527 | 292714.15 | 10276213.7 | 152148 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
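Besides the original numeric columns, the new features are the six pairwise ratios from divide_numeric, the SeniorCitizen aggregations, and the one-hot encoded categoricals. A small inspection sketch (not in the original run):
# illustrative: the divide_numeric features can be picked out by the ' / ' in their names
ratio_cols = [c for c in df_Xtrain_new.columns if ' / ' in c]
print(len(ratio_cols), ratio_cols)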
df_Xtest_new = get_fm(df_Xtest)
df_Xtest_new.head(2)
Built 71 features Elapsed: 00:00 | Progress: 100%|██████████
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | MonthlyCharges / TotalCharges | MonthlyCharges / tenure | TotalCharges / MonthlyCharges | TotalCharges / tenure | tenure / MonthlyCharges | tenure / TotalCharges | SeniorCitizen.COUNT(data) | SeniorCitizen.MAX(data.MonthlyCharges) | SeniorCitizen.MAX(data.TotalCharges) | SeniorCitizen.MAX(data.tenure) | SeniorCitizen.MEAN(data.MonthlyCharges) | SeniorCitizen.MEAN(data.TotalCharges) | SeniorCitizen.MEAN(data.tenure) | SeniorCitizen.MIN(data.MonthlyCharges) | SeniorCitizen.MIN(data.TotalCharges) | SeniorCitizen.MIN(data.tenure) | SeniorCitizen.NUM_UNIQUE(data.Contract) | SeniorCitizen.NUM_UNIQUE(data.Dependents) | SeniorCitizen.NUM_UNIQUE(data.DeviceProtection) | SeniorCitizen.NUM_UNIQUE(data.InternetService) | SeniorCitizen.NUM_UNIQUE(data.MultipleLines) | SeniorCitizen.NUM_UNIQUE(data.OnlineBackup) | SeniorCitizen.NUM_UNIQUE(data.OnlineSecurity) | SeniorCitizen.NUM_UNIQUE(data.PaperlessBilling) | SeniorCitizen.NUM_UNIQUE(data.Partner) | SeniorCitizen.NUM_UNIQUE(data.PaymentMethod) | SeniorCitizen.NUM_UNIQUE(data.PhoneService) | SeniorCitizen.NUM_UNIQUE(data.StreamingMovies) | SeniorCitizen.NUM_UNIQUE(data.StreamingTV) | SeniorCitizen.NUM_UNIQUE(data.TechSupport) | SeniorCitizen.SKEW(data.MonthlyCharges) | SeniorCitizen.SKEW(data.TotalCharges) | SeniorCitizen.SKEW(data.tenure) | SeniorCitizen.STD(data.MonthlyCharges) | SeniorCitizen.STD(data.TotalCharges) | SeniorCitizen.STD(data.tenure) | SeniorCitizen.SUM(data.MonthlyCharges) | SeniorCitizen.SUM(data.TotalCharges) | SeniorCitizen.SUM(data.tenure) | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen.MODE(data.Contract)_Month-to-month | SeniorCitizen.MODE(data.Dependents)_No | SeniorCitizen.MODE(data.DeviceProtection)_No | SeniorCitizen.MODE(data.InternetService)_Fiber optic | SeniorCitizen.MODE(data.MultipleLines)_No | SeniorCitizen.MODE(data.MultipleLines)_Yes | SeniorCitizen.MODE(data.OnlineBackup)_No | SeniorCitizen.MODE(data.OnlineSecurity)_No | SeniorCitizen.MODE(data.PaperlessBilling)_Yes | SeniorCitizen.MODE(data.Partner)_No | SeniorCitizen.MODE(data.Partner)_Yes | SeniorCitizen.MODE(data.PaymentMethod)_Electronic check | SeniorCitizen.MODE(data.PhoneService)_Yes | SeniorCitizen.MODE(data.StreamingMovies)_No | SeniorCitizen.MODE(data.StreamingMovies)_Yes | SeniorCitizen.MODE(data.StreamingTV)_No | SeniorCitizen.MODE(data.StreamingTV)_Yes | SeniorCitizen.MODE(data.TechSupport)_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
customerID | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1794-HBQTJ | 0 | 1 | 48.6 | 48.6 | 1.000000 | 48.600000 | 1.00000 | 48.600000 | 0.020576 | 0.020576 | 1158 | 118.60 | 8684.80 | 72 | 62.389983 | 2219.692358 | 32.658031 | 18.7 | 0.00 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.070659 | 1.072957 | 0.228395 | 30.601714 | 2278.817867 | 24.658823 | 72247.60 | 2570403.75 | 37818 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
0356-OBMAC | 1 | 56 | 99.9 | 5706.3 | 0.017507 | 1.783929 | 57.12012 | 101.898214 | 0.560561 | 0.009814 | 251 | 117.35 | 8436.25 | 72 | 79.820120 | 2727.039641 | 32.314741 | 19.2 | 19.45 | 1 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.974773 | 0.663470 | 0.237100 | 22.771121 | 2395.833841 | 24.655882 | 20034.85 | 684486.95 | 8111 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 |
def post_process_fm(fm, thr_miss=0.95, thr_corr=0.95):
    """Post-process a feature matrix.

    1. remove duplicated features
    2. remove features with too many missing values
    3. remove zero-variance features
    4. remove highly collinear features
    """
    # Remove duplicated features
    start_features = fm.shape[1]
    fm = fm.iloc[:, ~fm.columns.duplicated()]
    n_dups = start_features - fm.shape[1]
    print(f'There were {n_dups} duplicated features.')

    fm = fm.replace({np.inf: np.nan, -np.inf: np.nan})

    # Remove the id and label columns if present
    # (here customerID is the index and Churn was already dropped, so this is just a guard)
    idname = 'customerID'
    targetname = 'Churn'
    cols_drop_id = [i for i in fm.columns if idname in i]
    cols_drop_target = [i for i in fm.columns if targetname in i]
    cols_drop_id_target = cols_drop_id + cols_drop_target
    print('Dropping ids and label: ', cols_drop_id_target)
    fm = fm.drop(cols_drop_id_target, axis=1)

    # One-hot encoding (if necessary)
    fm = pd.get_dummies(fm)
    n_features_start = fm.shape[1]
    print('Original shape: ', fm.shape)

    # Fraction of missing values per column
    df_miss = pd.DataFrame(fm.isnull().sum())
    df_miss['frac'] = df_miss[0] / fm.shape[0]
    df_miss.sort_values('frac', ascending=False, inplace=True)

    # Columns missing above the threshold
    cols_miss = list(df_miss[df_miss['frac'] > thr_miss].index)
    n_cols_miss = len(cols_miss)

    # Remove missing columns
    fm = fm[[i for i in fm if i not in cols_miss]]
    print('{} missing columns with threshold: {}.'.format(
        n_cols_miss, thr_miss))

    # Zero variance
    df_unq_ct = pd.DataFrame(fm.nunique()).sort_values(0, ascending=True)
    cols_zero_var = list(df_unq_ct[df_unq_ct[0] == 1].index)
    n_cols_zero_var = len(cols_zero_var)

    # Remove zero-variance columns
    fm = fm[[i for i in fm if i not in cols_zero_var]]
    print('{} zero variance columns.'.format(n_cols_zero_var))

    # Correlations
    df_corr = fm.corr()

    # Extract the upper triangle of the correlation matrix
    df_upper = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(bool))

    # Select the features with absolute correlation above the threshold
    cols_drop = [col for col in df_upper.columns
                 if any(df_upper[col].abs() > thr_corr)]
    n_collinear = len(cols_drop)
    fm = fm[[i for i in fm if i not in cols_drop]]
    print('{} collinear columns removed with correlation above {}.'.format(
        n_collinear, thr_corr))

    n_total_cols_removed = n_dups + n_cols_miss + n_cols_zero_var + n_collinear
    print('Total columns removed: ', n_total_cols_removed)
    print('Shape after feature selection: {}.'.format(fm.shape))
    return fm
df_Xtrain_good = post_process_fm(df_Xtrain_new,thr_miss=0.9,thr_corr=0.9)
df_Xtest_good = post_process_fm(df_Xtest_new,thr_miss=0.9,thr_corr=0.9)
There were 0 duplicated features.
Dropping ids and label:  []
Original shape:  (5634, 99)
0 missing columns with threshold: 0.9.
26 zero variance columns.
39 collinear columns removed with correlation above 0.9.
Total columns removed:  65
Shape after feature selection: (5634, 34).

There were 0 duplicated features.
Dropping ids and label:  []
Original shape:  (1409, 100)
0 missing columns with threshold: 0.9.
25 zero variance columns.
41 collinear columns removed with correlation above 0.9.
Total columns removed:  66
Shape after feature selection: (1409, 34).
# if some features are not common to train and test, exclude them.
cols_exclude = np.setdiff1d(df_Xtrain_good.columns, df_Xtest_good.columns)
cols_exclude
array([], dtype=object)
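Because post_process_fm is run on train and test independently, the two matrices can in principle keep different columns (39 vs 41 collinear columns were dropped above). A defensive option, sketched here but not run in this notebook, is to reindex the test matrix to the train columns:
# Optional sketch (not run): force the test features to match the train features,
# filling any missing columns with 0 and dropping test-only columns.
# df_Xtest_good = df_Xtest_good.reindex(columns=df_Xtrain_good.columns, fill_value=0)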
Xtr = df_Xtrain_good.fillna(0)
ytr = np.array(ser_ytrain)
Xtx = df_Xtest_good.fillna(0)
ytx = np.array(ser_ytest)
# Standard scaling gave a worse result here, so it is left commented out.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(Xtr)
# Xtr = scaler.transform(Xtr)
# Xtx = scaler.transform(Xtx)
from xgboost import XGBClassifier
model = XGBClassifier(random_state=SEED,subsample=0.9,max_depth=3)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
skmetrics.confusion_matrix(ytx, ypreds)
[18:52:21] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
/Users/poudel/opt/miniconda3/envs/ft/lib/python3.8/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
array([[921, 114], [169, 205]])
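The two XGBoost warnings above suggest their own fix; with the 1.3.x release used here, they can presumably be silenced by constructing the model as in this sketch:
# sketch (not run): same model with the deprecation warnings silenced (xgboost >= 1.3)
# model = XGBClassifier(random_state=SEED, subsample=0.9, max_depth=3,
#                       use_label_encoder=False, eval_metric='logloss')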
# XGBClassifier?
from sklearn.linear_model import LogisticRegression
params_fixed = {'dual': False,
'random_state': SEED,
'n_jobs': 1
}
params_best = {'C': 0.42679058013626753, 'max_iter': 1000,
'penalty': 'l2', 'solver': 'lbfgs'}
params = params_fixed
params.update(params_best)
model = LogisticRegression(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
skmetrics.confusion_matrix(ytx, ypreds)
array([[925, 110], [168, 206]])
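The params_best values above presumably come from an earlier tuning run that is not shown in this notebook; as a stand-in, a minimal sketch with scikit-learn's RandomizedSearchCV (rather than the HyperbandSearchCV imported earlier) could look like this:
# sketch only (not run): one way such a C value could be searched
# from scipy.stats import loguniform
# from sklearn.model_selection import RandomizedSearchCV
# search = RandomizedSearchCV(LogisticRegression(max_iter=1000, random_state=SEED),
#                             param_distributions={'C': loguniform(1e-3, 1e2)},
#                             n_iter=20, scoring='roc_auc', cv=5, random_state=SEED)
# search.fit(Xtr, ytr); print(search.best_params_)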
ytest = np.array(ser_ytest)
yprobs2d = model.predict_proba(Xtx)
pred_name = 'featuretools_lr'
path_pred = f'../predictions/{pred_name}.csv'
df_preds_out = pd.DataFrame({'customerID': ser_test_ids})
df_preds_out[f'ypreds_{pred_name}'] = ypreds
df_preds_out[f'yprobs_{pred_name}'] = yprobs2d[:,1]
df_preds_out.to_csv(path_pred,index=False)
df_preds_out.head()
customerID | ypreds_featuretools_lr | yprobs_featuretools_lr | |
---|---|---|---|
0 | 1794-HBQTJ | 0 | 0.445886 |
1 | 0356-OBMAC | 0 | 0.078101 |
2 | 4077-CROMM | 1 | 0.508567 |
3 | 5442-PPTJY | 0 | 0.022152 |
4 | 2333-KWEWW | 0 | 0.019473 |
def model_eval_bin(model_name, ytest, ypreds, yprobs2d, show_plots=True):
    """Print binary classification metrics, save them to csv, and optionally plot."""
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest, ypreds)
    precision = skmetrics.precision_score(ytest, ypreds)
    recall    = skmetrics.recall_score(ytest, ypreds)
    f1        = skmetrics.f1_score(ytest, ypreds)
    auc       = skmetrics.roc_auc_score(ytest, ypreds)

    print(skmetrics.classification_report(ytest, ypreds))
    print(skmetrics.confusion_matrix(ytest, ypreds))

    df_res = pd.DataFrame({'Accuracy':  [acc],
                           'Precision': [precision],
                           'Recall':    [recall],
                           'F1-score':  [f1],
                           'AUC':       [auc]}, index=[model_name])
    display(df_res.style.format("{:.4f}"))

    if not os.path.isdir('../outputs'):
        os.makedirs('../outputs')
    o = './' if ENV_COLAB else '../outputs/'
    df_res.to_csv(o + f'model_{model_name}.csv', index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest, yprobs2d)  # more focus on the minority class
        skpmetrics.plot_roc_curve(ytest, yprobs2d)         # equal focus on both classes
        skpmetrics.plot_confusion_matrix(ytest, ypreds)
model_eval_bin('LR',ytest,ypreds,yprobs2d,show_plots=True)
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.65      0.55      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409

[[925 110]
 [168 206]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
LR | 0.8027 | 0.6519 | 0.5508 | 0.5971 | 0.7223 |
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 8 secs