import time
time_start_notebook = time.time()
%%capture
import sys
# True when the google.colab module has already been imported in this session;
# used below (and later for data/output paths) to switch to Colab behaviour.
ENV_COLAB = 'google.colab' in sys.modules
# On Colab, install the packages this notebook needs that are not preinstalled.
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
# Special
!pip install featuretools[complete]
!pip install composeml
!pip install evalml
print('Environment: Google Colab')
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as skmetrics
# special
import imblearn
# featuretools
import composeml as cp
import featuretools as ft
import evalml
# warnings
import warnings
# settings
sns.set()
# Seed constant for the notebook.
SEED = 100
# Use fully qualified option names: the short pattern 'max_columns' is
# ambiguous in pandas >= 1.4 (it also matches 'styler.render.max_columns')
# and makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
pd.set_option('plotting.backend', 'matplotlib')  # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
2020-12-20 14:35:04,517 featuretools - WARNING Featuretools failed to load plugin nlp_primitives from library nlp_primitives. For a full stack trace, set logging to debug. joblib : 1.0.0 imblearn : 0.7.0 sys : 3.8.5 (default, Sep 4 2020, 02:22:02) [Clang 10.0.0 ] json : 2.0.9 plotly_express: 0.4.1 evalml : 0.16.1 seaborn : 0.11.0 composeml : 0.5.1 featuretools : 0.22.0 autopep8 : 1.5.4 matplotlib : 3.3.3 numpy : 1.19.4 sklearn : 0.23.2 pandas : 1.1.5
def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Names come from ``dir(obj)``; those starting with an underscore are
    skipped, and when ``contains`` is given only names containing that
    substring are kept.  The names are split into ``ncols`` nearly equal
    chunks, one chunk per column of the returned DataFrame; missing cells
    in shorter columns are filled with empty strings.
    """
    public_names = [
        name for name in dir(obj)
        if name[0] != '_' and (contains is None or contains in name)
    ]
    chunks = np.array_split(public_names, ncols)
    table = pd.DataFrame(chunks).transpose()
    return table.fillna('')
# Location of the raw train/test CSVs: GitHub-hosted copies on Colab,
# local files under ../data/raw otherwise.
path_data_train = (
    'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    if ENV_COLAB else
    '../data/raw/train.csv'
)
# NOTE(review): the Colab test URL ends in 'test.csv.csv' while every other
# path uses a single '.csv' suffix -- confirm the remote file name.
path_data_test = (
    'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv.csv'
    if ENV_COLAB else
    '../data/raw/test.csv'
)
# Load the raw train and test splits and report their shapes.
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
# Preview the first and last two rows.  DataFrame.append was removed in
# pandas 2.0, so the two slices are combined with pd.concat instead.
pd.concat([df_train.head(2), df_train.tail(2)])
(5634, 21) (1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
# Name of the label column used throughout the rest of the notebook.
target_name = 'Churn'
import plotly_express as px
# Distribution of the target column.
px.histogram(df_train, x=target_name,height=300,width=300)
# Gender counts, coloured by the target.
px.histogram(df_train, x='gender', color=target_name,width=300,height=200)
# Check that customerID has one distinct value per row.
df_train['customerID'].nunique() == len(df_train)
True
def clean_data(dfx):
    """Return a cleaned copy of ``dfx``; the input frame is left untouched.

    * drops the ``gender`` column (EDA showed it has no effect);
    * converts ``TotalCharges`` to numeric, with entries that cannot be
      parsed (or are missing) replaced by 0.

    ``customerID`` is kept so it can serve as an index feature.
    """
    without_gender = dfx.drop(columns=['gender'])
    total_charges = pd.to_numeric(without_gender['TotalCharges'],
                                  errors='coerce')
    return without_gender.assign(TotalCharges=total_charges.fillna(0))
# Clean both splits the same way, then separate the features from the
# binary-encoded target (No -> 0, Yes -> 1).
df_train, df_test = clean_data(df_train), clean_data(df_test)

df_Xtrain = df_train.drop(columns=target_name)
df_Xtest = df_test.drop(columns=target_name)

ser_ytrain = df_train[target_name].map({'No':0, 'Yes':1})
ser_ytest = df_test[target_name].map({'No':0, 'Yes':1})
import evalml
from evalml.objectives import CostBenefitMatrix
from evalml import AutoMLSearch
show_methods(evalml)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | AutoMLSearch | exceptions | preprocessing | skopt |
1 | automl | model_family | print_info | tuners |
2 | data_checks | model_understanding | problem_types | utils |
3 | demos | objectives | sklearn | warnings |
4 | evalml | pipelines |
In real business situations, each kind of correct or incorrect prediction carries a different cost or benefit.
Here, let's choose the following costs:
true_positive = 400 ==> PROFIT of $400
==> intervene, incentivize the customer to stay,
and sign a new contract.
true_negative = 0 ==> nothing needs to be done for that customer.
false_positive = -100 ==> COST of $100
==> marketing and effort to retain the user.
false_negative = -200 ==> COST of $200
==> Revenue lost by losing customer.
false_negative/false_positive = 2
show_methods(evalml.objectives)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | AUC | F1Weighted | Precision | cost_benefit_matrix |
1 | AUCMacro | FraudCost | PrecisionMacro | fraud_cost |
2 | AUCMicro | LeadScoring | PrecisionMicro | get_all_objective_names |
3 | AUCWeighted | LogLossBinary | PrecisionWeighted | get_core_objective_names |
4 | AccuracyBinary | LogLossMulticlass | R2 | get_core_objectives |
5 | AccuracyMulticlass | MAE | Recall | get_non_core_objectives |
6 | BalancedAccuracyBinary | MCCBinary | RecallMacro | get_objective |
7 | BalancedAccuracyMulticlass | MCCMulticlass | RecallMicro | lead_scoring |
8 | BinaryClassificationObjective | MSE | RecallWeighted | multiclass_classification_objective |
9 | CostBenefitMatrix | MaxError | RegressionObjective | objective_base |
10 | ExpVariance | MeanSquaredLogError | RootMeanSquaredError | regression_objective |
11 | F1 | MedianAE | RootMeanSquaredLogError | standard_metrics |
12 | F1Macro | MulticlassClassificationObjective | binary_classification_objective | utils |
13 | F1Micro | ObjectiveBase |
from evalml.objectives import CostBenefitMatrix
# Dollar value assigned to each confusion-matrix outcome
# (positive = profit, negative = cost).
_outcome_values = {
    'true_positive': 400,
    'true_negative': 0,
    'false_positive': -100,
    'false_negative': -200,
}
cost_benefit_matrix = CostBenefitMatrix(**_outcome_values)
# Listing of the AutoMLSearch constructor arguments and the search call.
# NOTE(review): looks like an API reference rather than a runnable cell --
# X, y and automl are not defined anywhere before this point.
AutoMLSearch(
problem_type = None,
objective = 'auto', # r2, log loss binary/multiclass
max_iterations = None,
max_time = None, # integer is seconds; strings: minutes, hours
patience = None, # default is no early stopping
tolerance = None,
data_split = None, # StratifiedKFold
allowed_pipelines = None,
allowed_model_families = None,
start_iteration_callback = None,
add_result_callback = None,
error_callback = None,
additional_objectives = None,
random_state = 0,
n_jobs = -1,
tuner_class = None,
verbose = True,
optimize_thresholds = False,
ensembling = False,
max_batches = None,
problem_configuration = None,
_pipelines_per_batch = 5,
)
automl.search(X, y,
data_checks='auto',
show_iteration_plot=True)
from evalml import AutoMLSearch
%%time
import warnings
# Silence all Python warnings from here on.
warnings.simplefilter('ignore')
# Search for a binary-classification pipeline using log loss as the objective.
automl = AutoMLSearch(problem_type='binary', objective='log loss binary')
automl.search(df_Xtrain, ser_ytrain)
# Take the best pipeline found by the search and fit it on the training data.
pipe_ll = automl.best_pipeline
pipe_ll.fit(df_Xtrain, ser_ytrain)
Using default limit of max_batches=1. `X` passed was not a DataTable. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead. `y` passed was not a DataColumn. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead. Generating pipelines to search over... ***************************** * Beginning pipeline search * ***************************** Optimizing for Log Loss Binary. Lower score is better. Searching up to 1 batches for a total of 9 pipelines. Allowed model families: xgboost, linear_model, random_forest, extra_trees, decision_tree, catboost, lightgbm
Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00 Starting cross validation Finished cross validation - mean Log Loss Binary: 9.165 Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00 Starting cross validation Finished cross validation - mean Log Loss Binary: 0.579 Batch 1: (3/9) LightGBM Classifier w/ Imputer + Text... Elapsed:00:38 Starting cross validation Finished cross validation - mean Log Loss Binary: 0.440 Batch 1: (4/9) Extra Trees Classifier w/ Imputer + T... Elapsed:01:13 Starting cross validation Finished cross validation - mean Log Loss Binary: 0.430 Batch 1: (5/9) Elastic Net Classifier w/ Imputer + T... Elapsed:01:46 Starting cross validation Finished cross validation - mean Log Loss Binary: 0.579 Batch 1: (6/9) CatBoost Classifier w/ Imputer + Text... Elapsed:02:18 Starting cross validation Finished cross validation - mean Log Loss Binary: 0.550 Batch 1: (7/9) XGBoost Classifier w/ Imputer + Text ... Elapsed:02:50 Starting cross validation [14:38:02] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:38:13] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:38:25] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. 
Finished cross validation - mean Log Loss Binary: 0.431 Batch 1: (8/9) Random Forest Classifier w/ Imputer +... Elapsed:03:25 Starting cross validation Finished cross validation - mean Log Loss Binary: 0.421 Batch 1: (9/9) Logistic Regression Classifier w/ Imp... Elapsed:04:01 Starting cross validation Finished cross validation - mean Log Loss Binary: 0.415 Search finished after 04:45 Best pipeline: Logistic Regression Classifier w/ Imputer + Text Featurization Component + One Hot Encoder + Standard Scaler Best pipeline Log Loss Binary: 0.415499
GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Text Featurization Component':{'text_columns': ['Contract', 'PaymentMethod']}, 'One Hot Encoder':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': None, 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Logistic Regression Classifier':{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'},})
# Score the log-loss pipeline on the test set, first with log loss and then
# with the cost-benefit objective.
pipe_score_ll = pipe_ll.score(df_Xtest, ser_ytest, ['log loss binary'])
print (pipe_score_ll)
pipe_score_ll_cbm = pipe_ll.score(df_Xtest, ser_ytest, [cost_benefit_matrix])
print (pipe_score_ll_cbm)
# Calculate total profit across all customers using
# pipeline optimized for Log Loss
# NOTE(review): the score is computed on the test set but multiplied by
# len(df_Xtrain) -- confirm that scaling to the training-set size is intended.
total_profit_ll = pipe_score_ll_cbm['Cost Benefit Matrix'] * len(df_Xtrain)
print (f"{total_profit_ll:,.2f}")
OrderedDict([('Log Loss Binary', 0.42941442097922433)]) OrderedDict([('Cost Benefit Matrix', 26.685592618878637)]) 150,346.63
%%time
import warnings
# Silence all Python warnings from here on.
warnings.simplefilter('ignore')
# Same search as before, but with the cost-benefit matrix as the objective.
# NOTE(review): the recorded outputs show the same best pipeline and the same
# test score as the log-loss run; no threshold-related argument is passed
# here -- confirm whether threshold optimisation was meant to be enabled.
automl = AutoMLSearch(problem_type='binary', objective=cost_benefit_matrix)
automl.search(df_Xtrain, ser_ytrain)
# Take the best pipeline found by the search and fit it on the training data.
pipe_cbm = automl.best_pipeline
pipe_cbm.fit(df_Xtrain, ser_ytrain)
Using default limit of max_batches=1. `X` passed was not a DataTable. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead. `y` passed was not a DataColumn. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead. Generating pipelines to search over... ***************************** * Beginning pipeline search * ***************************** Optimizing for Cost Benefit Matrix. Greater score is better. Searching up to 1 batches for a total of 9 pipelines. Allowed model families: xgboost, linear_model, random_forest, extra_trees, decision_tree, catboost, lightgbm
Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00 Starting cross validation Finished cross validation - mean Cost Benefit Matrix: -53.071 Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00 Starting cross validation Finished cross validation - mean Cost Benefit Matrix: 17.678 High coefficient of variation (cv >= 0.2) within cross validation scores. Decision Tree Classifier w/ Imputer + Text Featurization Component + One Hot Encoder may not perform as estimated on unseen data. Batch 1: (3/9) LightGBM Classifier w/ Imputer + Text... Elapsed:00:36 Starting cross validation Finished cross validation - mean Cost Benefit Matrix: 22.009 Batch 1: (4/9) Extra Trees Classifier w/ Imputer + T... Elapsed:01:17 Starting cross validation Finished cross validation - mean Cost Benefit Matrix: 15.602 High coefficient of variation (cv >= 0.2) within cross validation scores. Extra Trees Classifier w/ Imputer + Text Featurization Component + One Hot Encoder may not perform as estimated on unseen data. Batch 1: (5/9) Elastic Net Classifier w/ Imputer + T... Elapsed:01:58 Starting cross validation Finished cross validation - mean Cost Benefit Matrix: -53.071 Batch 1: (6/9) CatBoost Classifier w/ Imputer + Text... Elapsed:02:30 Starting cross validation Finished cross validation - mean Cost Benefit Matrix: 10.490 High coefficient of variation (cv >= 0.2) within cross validation scores. CatBoost Classifier w/ Imputer + Text Featurization Component may not perform as estimated on unseen data. Batch 1: (7/9) XGBoost Classifier w/ Imputer + Text ... Elapsed:03:08 Starting cross validation [14:45:28] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. 
[14:45:41] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:45:53] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. Finished cross validation - mean Cost Benefit Matrix: 24.299 High coefficient of variation (cv >= 0.2) within cross validation scores. XGBoost Classifier w/ Imputer + Text Featurization Component + One Hot Encoder may not perform as estimated on unseen data. Batch 1: (8/9) Random Forest Classifier w/ Imputer +... Elapsed:03:46 Starting cross validation Finished cross validation - mean Cost Benefit Matrix: 18.229 Batch 1: (9/9) Logistic Regression Classifier w/ Imp... Elapsed:04:25 Starting cross validation Finished cross validation - mean Cost Benefit Matrix: 26.926 Search finished after 05:04 Best pipeline: Logistic Regression Classifier w/ Imputer + Text Featurization Component + One Hot Encoder + Standard Scaler Best pipeline Cost Benefit Matrix: 26.925808 CPU times: user 4min 20s, sys: 48.6 s, total: 5min 8s Wall time: 5min 11s
GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Text Featurization Component':{'text_columns': ['Contract', 'PaymentMethod']}, 'One Hot Encoder':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': None, 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Logistic Regression Classifier':{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'},})
# Score the cost-benefit pipeline on the test set with the cost-benefit objective.
pipe_score_cbm = pipe_cbm.score(df_Xtest, ser_ytest, [cost_benefit_matrix])
print (pipe_score_cbm)
# Calculate total profit across all customers using
# pipeline optimized for Cost Benefit Matrix
# NOTE(review): the score is computed on the test set but multiplied by
# len(df_Xtrain) -- confirm that scaling to the training-set size is intended.
total_profit_cbm = pipe_score_cbm['Cost Benefit Matrix'] * len(df_Xtrain)
# Print both totals for comparison: cost-benefit pipeline, then log-loss pipeline.
print (f"{total_profit_cbm:,.2f}")
print (f"{total_profit_ll:,.2f}")
OrderedDict([('Cost Benefit Matrix', 26.685592618878637)]) 150,346.63 150,346.63
pipe_cbm
GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Text Featurization Component':{'text_columns': ['Contract', 'PaymentMethod']}, 'One Hot Encoder':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': None, 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Logistic Regression Classifier':{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'},})
show_methods(evalml.model_understanding)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | binary_objective_vs_threshold | explain_predictions_best_worst | graph_precision_recall_curve | partial_dependence |
1 | calculate_permutation_importance | graph_binary_objective_vs_threshold | graph_prediction_vs_actual | precision_recall_curve |
2 | confusion_matrix | graph_confusion_matrix | graph_roc_curve | prediction_explanations |
3 | explain_prediction | graph_partial_dependence | graphs | roc_curve |
4 | explain_predictions | graph_permutation_importance | normalize_confusion_matrix |
from evalml.model_understanding import graphs as evgraphs
show_methods(evgraphs)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | LabelBinarizer | get_objective | import_or_raise | sk_partial_dependence |
1 | ModelFamily | graph_binary_objective_vs_threshold | jupyter_check | sk_permutation_importance |
2 | NullsInColumnWarning | graph_confusion_matrix | normalize_confusion_matrix | sklearn_auc |
3 | ProblemTypes | graph_partial_dependence | np | sklearn_confusion_matrix |
4 | binary_objective_vs_threshold | graph_permutation_importance | partial_dependence | sklearn_precision_recall_curve |
5 | calculate_permutation_importance | graph_precision_recall_curve | pd | sklearn_roc_curve |
6 | confusion_matrix | graph_prediction_vs_actual | precision_recall_curve | unique_labels |
7 | copy | graph_roc_curve | roc_curve | warnings |
8 | evalml |
show_methods(pipe_cbm)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | classes_ | describe | hyperparameters | predict_proba |
1 | clone | estimator | input_feature_names | problem_type |
2 | component_graph | feature_importance | load | random_state |
3 | compute_estimator_features | fit | model_family | save |
4 | custom_hyperparameters | get_component | name | score |
5 | custom_name | graph | parameters | summary |
6 | default_parameters | graph_feature_importance | predict | threshold |
# Flatten the test labels and the pipeline's predictions to 1-D arrays.
ytest = np.array(ser_ytest).flatten()
ypreds = np.array(pipe_cbm.predict(df_Xtest)).flatten()
# predict_proba output as a 2-D array; column 1 is the score passed to the
# ROC and precision-recall graphs below (presumably the positive-class
# probability -- confirm the column order).
yprobs2d = pipe_cbm.predict_proba(df_Xtest).to_numpy()
yprobs = yprobs2d[:,1]
# Confusion matrix uses the predicted labels; the two curves use the scores.
evgraphs.graph_confusion_matrix(ytest, ypreds)
evgraphs.graph_roc_curve(ytest,yprobs)
evgraphs.graph_precision_recall_curve(ytest,yprobs)
show_methods(evgraphs,contains='graph')
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | graph_binary_objective_vs_threshold | graph_partial_dependence | graph_precision_recall_curve | graph_roc_curve |
1 | graph_confusion_matrix | graph_permutation_importance | graph_prediction_vs_actual |
# Partial-dependence graph for the SeniorCitizen feature on the test data.
evgraphs.graph_partial_dependence(pipe_cbm,df_Xtest,feature='SeniorCitizen')
# Only the first N test rows are used for the next two graphs
# (presumably to keep them fast -- confirm).
N = 100
# Cost-benefit objective as a function of the decision threshold.
evgraphs.graph_binary_objective_vs_threshold(
pipe_cbm,df_Xtest.iloc[:N],ytest[:N],objective=cost_benefit_matrix)
# Permutation importance measured with the cost-benefit objective.
evgraphs.graph_permutation_importance(pipe_cbm,df_Xtest.iloc[:N],ytest[:N],
objective=cost_benefit_matrix)
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Print, display and save binary-classification metrics for one model.

    Parameters
    ----------
    model_name : str
        Row label of the result table; also used in the output file name
        ``model_<model_name>.csv``.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted labels.
    yprobs2d : array-like, 2-D
        Predicted class probabilities, one column per class.  Column 1 is
        used as the positive-class score for the AUC.
    show_plots : bool, default True
        When true, also draw the precision-recall curve, ROC curve and
        confusion matrix with scikit-plot.

    Side effects
    ------------
    Prints the classification report and confusion matrix, displays the
    metrics table via ``display`` (provided by the notebook environment),
    and writes the table to ``./`` on Colab or ``../outputs/`` otherwise,
    creating that directory if needed.  Returns None.
    """
    import os
    import numpy as np
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics

    acc = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall = skmetrics.recall_score(ytest,ypreds)
    f1 = skmetrics.f1_score(ytest,ypreds)
    # ROC AUC must be computed from continuous scores; feeding it the hard
    # 0/1 predictions collapses the curve to a single operating point.
    pos_scores = np.asarray(yprobs2d)[:, 1]
    auc = skmetrics.roc_auc_score(ytest,pos_scores)

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1-score': [f1],
                           'AUC': [auc]},index=[model_name])
    display(df_res.style.format("{:.4f}"))

    # Create only the directory that is actually written to, and join the
    # path properly ('.' + 'model_x.csv' used to yield a hidden
    # '.model_x.csv' file on Colab).
    out_dir = '.' if ENV_COLAB else '../outputs'
    os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)
model_eval_bin('evalml',ytest,ypreds,yprobs2d,show_plots=True)
precision recall f1-score support 0 0.85 0.89 0.87 1035 1 0.64 0.55 0.59 374 accuracy 0.80 1409 macro avg 0.74 0.72 0.73 1409 weighted avg 0.79 0.80 0.79 1409 [[917 118] [167 207]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
evalml | 0.7977 | 0.6369 | 0.5535 | 0.5923 | 0.7197 |
# Report the time elapsed since the timestamp taken at the top of the notebook.
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 3600)
mins, secs = divmod(m, 60)
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'
      .format(h, mins, secs))
Time taken to run whole notebook: 0 hr 14 min 8 secs