In this project we detect whether a given sample of medical data corresponds to a cancerous (malignant) cell or not.
The raw data has 33 columns: 30 numeric features, an id column, an empty Unnamed: 32 column, and the target feature, diagnosis.
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
SEED = 100
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
sns.set()
%matplotlib inline
# modelling
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# boosting
import xgboost
from xgboost import XGBClassifier
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2021-02-13

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

sklearn 0.23.1
numpy   1.19.5
pandas  1.1.4
seaborn 0.11.0
xgboost 1.2.0
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
import bp
df_eval = pd.DataFrame({'Model': [],
                        'Description': [],
                        'Accuracy': [],
                        'Precision': [],
                        'Recall': [],
                        'F1': [],
                        'AUC': [],
                        })
def show_methods(obj, ncols=4, contains=None):
    """Show the public attributes of an object as a dataframe with ncols columns."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
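As a quick usage example (illustrative only), this lists the public pandas DataFrame attributes whose names contain 'corr':
# example: public DataFrame attributes containing 'corr'
show_methods(pd.DataFrame, contains='corr')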
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')
print(df_train.shape)
df_train.head()
(455, 33)
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 905501 | B | 12.27 | 17.92 | 78.41 | 466.1 | 0.08685 | 0.06526 | 0.03211 | 0.02653 | 0.1966 | 0.05597 | 0.3342 | 1.7810 | 2.079 | 25.79 | 0.005888 | 0.02310 | 0.02059 | 0.010750 | 0.02578 | 0.002267 | 14.10 | 28.88 | 89.00 | 610.2 | 0.1240 | 0.1795 | 0.1377 | 0.09532 | 0.3455 | 0.06896 | NaN |
1 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.015570 | 0.01318 | 0.003892 | 18.98 | 34.12 | 126.70 | 1124.0 | 0.1139 | 0.3094 | 0.3403 | 0.14180 | 0.2218 | 0.07820 | NaN |
2 | 861103 | B | 11.45 | 20.97 | 73.81 | 401.5 | 0.11020 | 0.09362 | 0.04591 | 0.02233 | 0.1842 | 0.07005 | 0.3251 | 2.1740 | 2.077 | 24.62 | 0.010370 | 0.01706 | 0.02586 | 0.007506 | 0.01816 | 0.003976 | 13.11 | 32.16 | 84.53 | 525.1 | 0.1557 | 0.1676 | 0.1755 | 0.06127 | 0.2762 | 0.08851 | NaN |
3 | 86973702 | B | 14.44 | 15.18 | 93.97 | 640.1 | 0.09970 | 0.10210 | 0.08487 | 0.05532 | 0.1724 | 0.06081 | 0.2406 | 0.7394 | 2.120 | 21.20 | 0.005706 | 0.02297 | 0.03114 | 0.014930 | 0.01454 | 0.002528 | 15.85 | 19.85 | 108.60 | 766.9 | 0.1316 | 0.2735 | 0.3103 | 0.15990 | 0.2691 | 0.07683 | NaN |
4 | 8810703 | M | 28.11 | 18.47 | 188.50 | 2499.0 | 0.11420 | 0.15160 | 0.32010 | 0.15950 | 0.1648 | 0.05525 | 2.8730 | 1.4760 | 21.980 | 525.60 | 0.013450 | 0.02772 | 0.06389 | 0.014070 | 0.04783 | 0.004476 | 28.11 | 18.47 | 188.50 | 2499.0 | 0.1142 | 0.1516 | 0.3201 | 0.15950 | 0.1648 | 0.05525 | NaN |
target = 'diagnosis'
col_id = 'id'
cols_drop = ['id', 'Unnamed: 32']
df_train = df_train.drop(cols_drop, axis=1)
df_test = df_test.drop(cols_drop, axis=1)
df_train['diagnosis'] = df_train['diagnosis'].map({'B': 0, 'M': 1})
df_test['diagnosis'] = df_test['diagnosis'].map({'B': 0, 'M': 1})
# df_train.bp.describe()
cols = df_train.filter(regex='mean').columns
fig,ax = plt.subplots(5,2, figsize=(15,10))
df_train.query('diagnosis==0')[cols].plot(kind='density', subplots=True, sharex=False,
                                          sharey=False, fontsize=12, ax=ax)
df_train.query('diagnosis==1')[cols].plot(kind='density', subplots=True, sharex=False,
                                          sharey=False, fontsize=12, ax=ax, style='-.')
plt.suptitle('Density Plot for Benign (solid) and Malignant (dashdot) Cases',fontsize=18)
plt.savefig('images/densityplot_mean_features.png',dpi=300)
plt.show()
"""
Observation:
The density plots for benign and malignant cases are well separated.
This means the features we use here are useful for machine learning.
""";
df_train[target].value_counts(normalize=True)
0    0.626374
1    0.373626
Name: diagnosis, dtype: float64
df_test[target].value_counts(normalize=True)
# train and test have nearly the same class distribution.
# the data is imbalanced: there are almost twice as many benign cases as malignant ones.
0    0.631579
1    0.368421
Name: diagnosis, dtype: float64
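Because of this imbalance, a common XGBoost heuristic is to start scale_pos_weight near the ratio of negative to positive samples (a rough sketch; the value is tuned properly later in this notebook):
# heuristic starting point for scale_pos_weight: n_negative / n_positive
counts = df_train[target].value_counts()
print('suggested scale_pos_weight ~ {:.2f}'.format(counts[0] / counts[1]))  # ~1.68 on this split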
# bp.show_methods(bp, contains='corr') # my local functions
# select only the mean features
cols = df_train.filter(regex='_mean').columns.tolist()
df1 = df_train[cols].rename(columns=lambda x: x.replace('_mean',''))
bp.plot_corr(df1,xrot=90)
bp.plot_corr_style(df_train)
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
diagnosis | 1.00 | 0.73 | 0.43 | 0.75 | 0.71 | 0.37 | 0.61 | 0.69 | 0.78 | 0.34 | -0.03 | 0.57 | 0.02 | 0.56 | 0.54 | -0.05 | 0.29 | 0.23 | 0.41 | -0.03 | 0.07 | 0.78 | 0.47 | 0.79 | 0.74 | 0.42 | 0.58 | 0.65 | 0.79 | 0.39 | 0.31 |
radius_mean | 0.73 | 1.00 | 0.34 | 1.00 | 0.99 | 0.18 | 0.51 | 0.68 | 0.83 | 0.17 | -0.33 | 0.69 | -0.06 | 0.68 | 0.73 | -0.20 | 0.20 | 0.18 | 0.38 | -0.11 | -0.05 | 0.97 | 0.31 | 0.96 | 0.94 | 0.11 | 0.39 | 0.52 | 0.74 | 0.14 | -0.02 |
texture_mean | 0.43 | 0.34 | 1.00 | 0.35 | 0.34 | 0.00 | 0.26 | 0.31 | 0.31 | 0.08 | -0.07 | 0.26 | 0.36 | 0.27 | 0.26 | -0.00 | 0.20 | 0.14 | 0.19 | -0.01 | 0.05 | 0.37 | 0.91 | 0.37 | 0.36 | 0.11 | 0.29 | 0.32 | 0.32 | 0.11 | 0.13 |
perimeter_mean | 0.75 | 1.00 | 0.35 | 1.00 | 0.99 | 0.22 | 0.56 | 0.71 | 0.86 | 0.20 | -0.28 | 0.70 | -0.05 | 0.70 | 0.74 | -0.18 | 0.25 | 0.21 | 0.41 | -0.09 | -0.01 | 0.97 | 0.31 | 0.97 | 0.94 | 0.14 | 0.43 | 0.55 | 0.77 | 0.16 | 0.03 |
area_mean | 0.71 | 0.99 | 0.34 | 0.99 | 1.00 | 0.19 | 0.51 | 0.69 | 0.83 | 0.17 | -0.29 | 0.74 | -0.03 | 0.74 | 0.80 | -0.14 | 0.21 | 0.19 | 0.38 | -0.08 | -0.02 | 0.96 | 0.29 | 0.96 | 0.96 | 0.12 | 0.37 | 0.50 | 0.72 | 0.12 | -0.02 |
smoothness_mean | 0.37 | 0.18 | 0.00 | 0.22 | 0.19 | 1.00 | 0.65 | 0.52 | 0.56 | 0.58 | 0.56 | 0.30 | 0.10 | 0.30 | 0.25 | 0.33 | 0.31 | 0.24 | 0.37 | 0.18 | 0.27 | 0.22 | 0.06 | 0.25 | 0.22 | 0.79 | 0.46 | 0.44 | 0.51 | 0.39 | 0.49 |
compactness_mean | 0.61 | 0.51 | 0.26 | 0.56 | 0.51 | 0.65 | 1.00 | 0.88 | 0.82 | 0.60 | 0.55 | 0.49 | 0.07 | 0.53 | 0.45 | 0.14 | 0.75 | 0.55 | 0.63 | 0.20 | 0.51 | 0.54 | 0.26 | 0.59 | 0.52 | 0.56 | 0.87 | 0.82 | 0.82 | 0.49 | 0.70 |
concavity_mean | 0.69 | 0.68 | 0.31 | 0.71 | 0.69 | 0.52 | 0.88 | 1.00 | 0.92 | 0.50 | 0.32 | 0.63 | 0.11 | 0.65 | 0.61 | 0.11 | 0.67 | 0.69 | 0.68 | 0.16 | 0.45 | 0.68 | 0.30 | 0.72 | 0.67 | 0.43 | 0.73 | 0.88 | 0.85 | 0.37 | 0.50 |
concave points_mean | 0.78 | 0.83 | 0.31 | 0.86 | 0.83 | 0.56 | 0.82 | 0.92 | 1.00 | 0.47 | 0.14 | 0.70 | 0.06 | 0.71 | 0.69 | 0.04 | 0.48 | 0.42 | 0.60 | 0.08 | 0.25 | 0.83 | 0.30 | 0.86 | 0.81 | 0.44 | 0.64 | 0.74 | 0.91 | 0.34 | 0.35 |
symmetry_mean | 0.34 | 0.17 | 0.08 | 0.20 | 0.17 | 0.58 | 0.60 | 0.50 | 0.47 | 1.00 | 0.46 | 0.30 | 0.12 | 0.30 | 0.22 | 0.15 | 0.39 | 0.32 | 0.36 | 0.39 | 0.31 | 0.21 | 0.09 | 0.24 | 0.20 | 0.44 | 0.47 | 0.44 | 0.45 | 0.68 | 0.43 |
fractal_dimension_mean | -0.03 | -0.33 | -0.07 | -0.28 | -0.29 | 0.56 | 0.55 | 0.32 | 0.14 | 0.46 | 1.00 | -0.03 | 0.18 | 0.01 | -0.11 | 0.40 | 0.56 | 0.44 | 0.33 | 0.33 | 0.69 | -0.27 | -0.05 | -0.23 | -0.24 | 0.49 | 0.45 | 0.34 | 0.16 | 0.31 | 0.77 |
radius_se | 0.57 | 0.69 | 0.26 | 0.70 | 0.74 | 0.30 | 0.49 | 0.63 | 0.70 | 0.30 | -0.03 | 1.00 | 0.22 | 0.97 | 0.95 | 0.17 | 0.34 | 0.32 | 0.51 | 0.24 | 0.21 | 0.72 | 0.17 | 0.72 | 0.75 | 0.12 | 0.26 | 0.37 | 0.53 | 0.06 | 0.02 |
texture_se | 0.02 | -0.06 | 0.36 | -0.05 | -0.03 | 0.10 | 0.07 | 0.11 | 0.06 | 0.12 | 0.18 | 0.22 | 1.00 | 0.23 | 0.12 | 0.36 | 0.25 | 0.22 | 0.28 | 0.44 | 0.30 | -0.08 | 0.38 | -0.07 | -0.06 | -0.08 | -0.08 | -0.05 | -0.08 | -0.15 | -0.03 |
perimeter_se | 0.56 | 0.68 | 0.27 | 0.70 | 0.74 | 0.30 | 0.53 | 0.65 | 0.71 | 0.30 | 0.01 | 0.97 | 0.23 | 1.00 | 0.94 | 0.15 | 0.40 | 0.34 | 0.55 | 0.26 | 0.23 | 0.70 | 0.18 | 0.72 | 0.73 | 0.11 | 0.31 | 0.40 | 0.55 | 0.07 | 0.06 |
area_se | 0.54 | 0.73 | 0.26 | 0.74 | 0.80 | 0.25 | 0.45 | 0.61 | 0.69 | 0.22 | -0.11 | 0.95 | 0.12 | 0.94 | 1.00 | 0.08 | 0.27 | 0.25 | 0.41 | 0.13 | 0.12 | 0.75 | 0.18 | 0.75 | 0.81 | 0.11 | 0.26 | 0.37 | 0.53 | 0.04 | -0.01 |
smoothness_se | -0.05 | -0.20 | -0.00 | -0.18 | -0.14 | 0.33 | 0.14 | 0.11 | 0.04 | 0.15 | 0.40 | 0.17 | 0.36 | 0.15 | 0.08 | 1.00 | 0.34 | 0.27 | 0.33 | 0.42 | 0.44 | -0.21 | -0.09 | -0.20 | -0.17 | 0.30 | -0.05 | -0.05 | -0.09 | -0.13 | 0.11 |
compactness_se | 0.29 | 0.20 | 0.20 | 0.25 | 0.21 | 0.31 | 0.75 | 0.67 | 0.48 | 0.39 | 0.56 | 0.34 | 0.25 | 0.40 | 0.27 | 0.34 | 1.00 | 0.79 | 0.73 | 0.40 | 0.81 | 0.20 | 0.14 | 0.25 | 0.19 | 0.22 | 0.68 | 0.65 | 0.48 | 0.25 | 0.60 |
concavity_se | 0.23 | 0.18 | 0.14 | 0.21 | 0.19 | 0.24 | 0.55 | 0.69 | 0.42 | 0.32 | 0.44 | 0.32 | 0.22 | 0.34 | 0.25 | 0.27 | 0.79 | 1.00 | 0.77 | 0.30 | 0.73 | 0.17 | 0.09 | 0.20 | 0.17 | 0.14 | 0.45 | 0.66 | 0.42 | 0.15 | 0.42 |
concave points_se | 0.41 | 0.38 | 0.19 | 0.41 | 0.38 | 0.37 | 0.63 | 0.68 | 0.60 | 0.36 | 0.33 | 0.51 | 0.28 | 0.55 | 0.41 | 0.33 | 0.73 | 0.77 | 1.00 | 0.30 | 0.62 | 0.36 | 0.10 | 0.39 | 0.34 | 0.19 | 0.43 | 0.55 | 0.59 | 0.09 | 0.30 |
symmetry_se | -0.03 | -0.11 | -0.01 | -0.09 | -0.08 | 0.18 | 0.20 | 0.16 | 0.08 | 0.39 | 0.33 | 0.24 | 0.44 | 0.26 | 0.13 | 0.42 | 0.40 | 0.30 | 0.30 | 1.00 | 0.38 | -0.14 | -0.12 | -0.12 | -0.12 | -0.04 | 0.03 | 0.01 | -0.05 | 0.31 | 0.07 |
fractal_dimension_se | 0.07 | -0.05 | 0.05 | -0.01 | -0.02 | 0.27 | 0.51 | 0.45 | 0.25 | 0.31 | 0.69 | 0.21 | 0.30 | 0.23 | 0.12 | 0.44 | 0.81 | 0.73 | 0.62 | 0.38 | 1.00 | -0.05 | -0.01 | -0.01 | -0.03 | 0.15 | 0.38 | 0.38 | 0.21 | 0.08 | 0.58 |
radius_worst | 0.78 | 0.97 | 0.37 | 0.97 | 0.96 | 0.22 | 0.54 | 0.68 | 0.83 | 0.21 | -0.27 | 0.72 | -0.08 | 0.70 | 0.75 | -0.21 | 0.20 | 0.17 | 0.36 | -0.14 | -0.05 | 1.00 | 0.37 | 0.99 | 0.98 | 0.22 | 0.46 | 0.56 | 0.79 | 0.23 | 0.07 |
texture_worst | 0.47 | 0.31 | 0.91 | 0.31 | 0.29 | 0.06 | 0.26 | 0.30 | 0.30 | 0.09 | -0.05 | 0.17 | 0.38 | 0.18 | 0.18 | -0.09 | 0.14 | 0.09 | 0.10 | -0.12 | -0.01 | 0.37 | 1.00 | 0.38 | 0.36 | 0.26 | 0.38 | 0.38 | 0.38 | 0.25 | 0.23 |
perimeter_worst | 0.79 | 0.96 | 0.37 | 0.97 | 0.96 | 0.25 | 0.59 | 0.72 | 0.86 | 0.24 | -0.23 | 0.72 | -0.07 | 0.72 | 0.75 | -0.20 | 0.25 | 0.20 | 0.39 | -0.12 | -0.01 | 0.99 | 0.38 | 1.00 | 0.98 | 0.23 | 0.51 | 0.60 | 0.81 | 0.25 | 0.12 |
area_worst | 0.74 | 0.94 | 0.36 | 0.94 | 0.96 | 0.22 | 0.52 | 0.67 | 0.81 | 0.20 | -0.24 | 0.75 | -0.06 | 0.73 | 0.81 | -0.17 | 0.19 | 0.17 | 0.34 | -0.12 | -0.03 | 0.98 | 0.36 | 0.98 | 1.00 | 0.21 | 0.42 | 0.53 | 0.75 | 0.19 | 0.06 |
smoothness_worst | 0.42 | 0.11 | 0.11 | 0.14 | 0.12 | 0.79 | 0.56 | 0.43 | 0.44 | 0.44 | 0.49 | 0.12 | -0.08 | 0.11 | 0.11 | 0.30 | 0.22 | 0.14 | 0.19 | -0.04 | 0.15 | 0.22 | 0.26 | 0.23 | 0.21 | 1.00 | 0.57 | 0.52 | 0.55 | 0.52 | 0.62 |
compactness_worst | 0.58 | 0.39 | 0.29 | 0.43 | 0.37 | 0.46 | 0.87 | 0.73 | 0.64 | 0.47 | 0.45 | 0.26 | -0.08 | 0.31 | 0.26 | -0.05 | 0.68 | 0.45 | 0.43 | 0.03 | 0.38 | 0.46 | 0.38 | 0.51 | 0.42 | 0.57 | 1.00 | 0.88 | 0.79 | 0.61 | 0.81 |
concavity_worst | 0.65 | 0.52 | 0.32 | 0.55 | 0.50 | 0.44 | 0.82 | 0.88 | 0.74 | 0.44 | 0.34 | 0.37 | -0.05 | 0.40 | 0.37 | -0.05 | 0.65 | 0.66 | 0.55 | 0.01 | 0.38 | 0.56 | 0.38 | 0.60 | 0.53 | 0.52 | 0.88 | 1.00 | 0.85 | 0.51 | 0.68 |
concave points_worst | 0.79 | 0.74 | 0.32 | 0.77 | 0.72 | 0.51 | 0.82 | 0.85 | 0.91 | 0.45 | 0.16 | 0.53 | -0.08 | 0.55 | 0.53 | -0.09 | 0.48 | 0.42 | 0.59 | -0.05 | 0.21 | 0.79 | 0.38 | 0.81 | 0.75 | 0.55 | 0.79 | 0.85 | 1.00 | 0.49 | 0.50 |
symmetry_worst | 0.39 | 0.14 | 0.11 | 0.16 | 0.12 | 0.39 | 0.49 | 0.37 | 0.34 | 0.68 | 0.31 | 0.06 | -0.15 | 0.07 | 0.04 | -0.13 | 0.25 | 0.15 | 0.09 | 0.31 | 0.08 | 0.23 | 0.25 | 0.25 | 0.19 | 0.52 | 0.61 | 0.51 | 0.49 | 1.00 | 0.54 |
fractal_dimension_worst | 0.31 | -0.02 | 0.13 | 0.03 | -0.02 | 0.49 | 0.70 | 0.50 | 0.35 | 0.43 | 0.77 | 0.02 | -0.03 | 0.06 | -0.01 | 0.11 | 0.60 | 0.42 | 0.30 | 0.07 | 0.58 | 0.07 | 0.23 | 0.12 | 0.06 | 0.62 | 0.81 | 0.68 | 0.50 | 0.54 | 1.00 |
bp.get_high_correlated_features_df(df_train).head()
feature1 | feature2 | corr | |
---|---|---|---|
31 | radius_mean | perimeter_mean | 0.998112 |
33 | radius_worst | perimeter_worst | 0.994136 |
35 | radius_mean | area_mean | 0.987089 |
37 | perimeter_mean | area_mean | 0.986662 |
39 | radius_worst | area_worst | 0.983782 |
# bp.show_methods(df_train.bp)
df_train.bp.corr_high(thr=0.98)
cols_high_corr = ['area_mean', 'radius_worst', 'radius_mean', 'area_worst', 'perimeter_worst', 'perimeter_mean']
cols_high_corr1 = ['radius_mean', 'radius_worst', 'radius_mean', 'perimeter_mean', 'radius_worst']
cols_high_corr2 = ['perimeter_mean', 'perimeter_worst', 'area_mean', 'area_mean', 'area_worst']
cols_high_corr_drop = ['radius_mean', 'radius_worst']
feature1 | feature2 | corr | |
---|---|---|---|
0 | radius_mean | perimeter_mean | 0.998112 |
1 | radius_worst | perimeter_worst | 0.994136 |
2 | radius_mean | area_mean | 0.987089 |
3 | perimeter_mean | area_mean | 0.986662 |
4 | radius_worst | area_worst | 0.983782 |
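Since bp is a personal library, here is a rough plain-pandas equivalent of this high-correlation pair listing (a sketch, not the exact bp implementation):
# sketch: feature pairs with correlation above a threshold, using plain pandas
corr = df_train.drop(target, axis=1).corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
pairs = (upper.stack()
              .rename('corr')
              .reset_index()
              .rename(columns={'level_0': 'feature1', 'level_1': 'feature2'})
              .sort_values('corr', ascending=False))
pairs[pairs['corr'] > 0.98]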
cols_high_corr_drop = ['area_worst', 'perimeter_mean', 'perimeter_se']
df_train2 = df_train.drop(cols_high_corr_drop,axis=1)
df_test2 = df_test.drop(cols_high_corr_drop,axis=1)
import xgboost as xgb
import sklearn.metrics as skmetrics
def get_row_eval(model, desc, df_eval, sort='F1'):
    """Fit the model on the global train split, predict on the global test
    split, and return one row of evaluation metrics plus the predictions.

    Note: relies on the globals df_Xtrain, ser_ytrain, df_Xtest, ser_ytest;
    the df_eval and sort arguments are not used inside the function, and
    AUC here is computed from hard predictions, not probabilities.
    """
    model.fit(df_Xtrain, ser_ytrain)
    ypreds = model.predict(df_Xtest)
    ytx = np.array(ser_ytest).flatten()
    average = 'binary'
    row_eval = ['Xgboost', desc,
                skmetrics.accuracy_score(ytx, ypreds),
                skmetrics.precision_score(ytx, ypreds, average=average),
                skmetrics.recall_score(ytx, ypreds, average=average),
                skmetrics.f1_score(ytx, ypreds, average=average),
                skmetrics.roc_auc_score(ytx, ypreds)]
    return row_eval, ypreds
df_Xtrain = df_train.drop(target,axis=1)
ser_ytrain = df_train[target]
df_Xtest = df_test.drop(target,axis=1)
ser_ytest = df_test[target]
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval,ypreds = get_row_eval(model,'default',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
df_Xtrain = df_train2.drop(target,axis=1)
ser_ytrain = df_train2[target]
df_Xtest = df_test2.drop(target,axis=1)
ser_ytest = df_test2[target]
# fitting
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval,ypreds = get_row_eval(model,'corr_thr<0.98',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
# removing the highly correlated features gave a slightly worse result.
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.97500 | 0.928571 | 0.951220 | 0.957341 |
from sklearn.feature_selection import RFECV
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
est = RFECV(model,step=1,cv=5,scoring='roc_auc',n_jobs=-1)
est.fit(df_Xtrain,ser_ytrain)
print('Optimal features =',est.n_features_)
print(' Best features =', df_Xtrain.columns[est.support_])
Optimal features = 15
 Best features = Index(['texture_mean', 'smoothness_mean', 'concave points_mean',
       'fractal_dimension_mean', 'radius_se', 'texture_se', 'area_se',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst'],
      dtype='object')
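To see how the cross-validated score varies with the number of features kept, we can plot the per-step scores stored on the fitted selector (grid_scores_ exists in the sklearn 0.23 used here; newer versions expose cv_results_ instead):
# sketch: CV roc_auc vs number of selected features
plt.plot(range(1, len(est.grid_scores_) + 1), est.grid_scores_)
plt.xlabel('Number of features selected')
plt.ylabel('CV roc_auc')
plt.show()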
# NOTE: this hand-picked list differs from the RFECV selection printed above
# (it appears to come from an earlier run).
cols = ['texture_mean', 'area_mean', 'smoothness_mean', 'concave points_mean',
        'radius_se', 'area_se', 'symmetry_se', 'radius_worst', 'texture_worst',
        'perimeter_worst', 'area_worst', 'smoothness_worst', 'concavity_worst',
        'concave points_worst']
df_Xtrain = df_train.drop(target,axis=1)[cols]
ser_ytrain = df_train[target]
df_Xtest = df_test.drop(target,axis=1)[cols]
ser_ytest = df_test[target]
# fitting
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval,ypreds = get_row_eval(model,'RFECV',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
# RFECV gave a worse result, so reset the data to use all features.
df_Xtrain = df_train.drop(target,axis=1)
ser_ytrain = df_train[target]
df_Xtest = df_test.drop(target,axis=1)
ser_ytest = df_test[target]
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.97500 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.95122 | 0.928571 | 0.939759 | 0.950397 |
Important Parameters:
learning_rate: step-size shrinkage applied at each boosting round to prevent overfitting. Range is [0, 1].
max_depth: how deeply each tree is allowed to grow during any boosting round.
subsample: fraction of the training samples used per tree. Too low a value can lead to underfitting.
colsample_bytree: fraction of the features used per tree. Too high a value can lead to overfitting.
n_estimators: the number of trees to build.
Regularization parameters:
gamma: the minimum loss reduction required to split a node; a higher value leads to fewer splits. Supported only for tree-based learners.
alpha: L1 regularization on leaf weights. A larger value means more regularization.
lambda: L2 regularization on leaf weights; smoother than L1 regularization.
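As a hedged illustration of the parameters just listed (the values below are arbitrary, not tuned), these knobs map onto XGBClassifier keyword arguments; note that alpha and lambda are spelled reg_alpha and reg_lambda in the sklearn API:
# illustration only: arbitrary (untuned) values for the parameters described above
model_demo = xgb.XGBClassifier(
    learning_rate=0.1,     # step-size shrinkage per boosting round
    max_depth=5,           # maximum tree depth
    subsample=0.8,         # fraction of rows sampled per tree
    colsample_bytree=0.8,  # fraction of columns sampled per tree
    n_estimators=200,      # number of boosting rounds (trees)
    gamma=0.1,             # min loss reduction required to split
    reg_alpha=0.01,        # L1 penalty on leaf weights (alias: alpha)
    reg_lambda=1.0,        # L2 penalty on leaf weights (alias: lambda)
    n_jobs=-1, random_state=SEED)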
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
model
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              random_state=100, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)
# %%time
# params_grid = {
# 'max_depth': [4,5,6,7,8,9,10,11,None],
# 'subsample': [0.6,0.7,0.8,0.9,1],
# 'scale_pos_weight': [1,2,3,5,10,30,40],
# }
# model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
# skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)
# est = GridSearchCV(model,params_grid,
# cv = skf,
# verbose=2,
# n_jobs = -1,
# scoring='f1')
# # Fit the random search model
# est.fit(df_Xtrain, ser_ytrain) # comment this
# params_best = est.best_params_
# NOTE: comment grid search after done.
# Wall time: 7min 8s
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
params_best
{'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED, **params_best)
row_eval,ypreds = get_row_eval(model,'grid_search',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,n_estimators=1000,**params_best)
row_eval,ypreds = get_row_eval(model,'grid_search2',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
ypreds_best = ypreds
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
4 | Xgboost | grid_search2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.981151 |
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
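Raising n_estimators to 1000 helped here, but a large fixed tree count can overfit; xgboost's early stopping can choose the number of rounds on a validation set instead. A sketch (using the test split as the eval set, which leaks information; a separate holdout would be cleaner):
# sketch: let early stopping pick the number of boosting rounds
model_es = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,
                             n_estimators=1000, **params_best)
model_es.fit(df_Xtrain, ser_ytrain,
             eval_set=[(df_Xtest, ser_ytest)],
             eval_metric='auc',
             early_stopping_rounds=50,
             verbose=False)
print('best iteration:', model_es.best_iteration)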
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
def hpo_hyperopt(param_space, Xtrain, ytrain, Xtest, ytest, num_eval, cv=5, fixed_params=None):
    """HPO using the hyperopt package."""
    fixed_params = fixed_params if fixed_params is not None else {}
    # time
    time_start = time.time()

    # define the objective function (hyperopt minimizes, so negate the score)
    def objective_function(params):
        model = xgb.XGBClassifier(**params, **fixed_params)
        score = sklearn.model_selection.cross_val_score(model,
                                                        Xtrain, ytrain,
                                                        cv=cv, scoring='f1')
        score = score.mean()
        return {'loss': -score, 'status': STATUS_OK}

    # keep track of trials
    trials = Trials()

    # best params
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=num_eval,
                      trials=trials,
                      rstate=np.random.RandomState(SEED))

    # dict of best params
    dict_best_params = copy.copy(best_param)
    if 'boosting_type' in dict_best_params:
        dict_best_params['boosting_type'] = 'gbdt' if dict_best_params['boosting_type'] == 0 else 'dart'
    int_params = ['max_depth', 'num_leaves', 'n_estimators']
    for int_param in int_params:
        # cast to integer if present (fmin returns floats for quniform)
        if int_param in dict_best_params:
            dict_best_params[int_param] = int(dict_best_params[int_param])

    # losses of all trials
    loss = [x['result']['loss'] for x in trials.trials]

    # refit the best model
    model_best = xgb.XGBClassifier(**dict_best_params)
    model_best.fit(Xtrain, ytrain)

    time_taken = time.time() - time_start
    print("\nResults\n" + '=' * 50)
    print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken, 60)))
    print("Number of parameter combinations tested: ", num_eval)
    print("Train Score Best : {:.4f} ".format(min(loss) * -1))
    print("Test Score       : {:.4f} ".format(model_best.score(Xtest, ytest)))
    print("Best parameters:")
    pp.pprint(dict_best_params)
    return trials, dict_best_params
params_hyp = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
    # 'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 50)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 1, 100, 1),
    # regularization
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 0.1),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 0.1),
    'gamma': hp.uniform('gamma', 0.1, 0.5),
}
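The sample helper imported earlier can draw one random point from this space, a quick sanity check that the distributions look sensible before running the search:
# draw one random configuration from the search space
pp.pprint(sample(params_hyp))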
# current values
Xtr = df_Xtrain
ytr = ser_ytrain
Xtx = df_Xtest
ytx = ser_ytest
# fixed_params = {'n_estimators': 1000}
# trials, dict_best_params = hpo_hyperopt(params_hyp, Xtr, ytr, Xtx, ytx,
# num_eval=100,fixed_params=fixed_params)
dict_best_params = {
'gamma': 0.29426928529915647,
'learning_rate': 0.0227779530532774,
'max_depth': 4,
'min_child_weight': 1.0,
'reg_alpha': 0.019685023503677693,
'reg_lambda': 0.0538168932849033,
'scale_pos_weight': 2.0,
'subsample': 0.7231941501698588}
model = XGBClassifier(n_jobs=-1, random_state=SEED,**dict_best_params)
row_eval,ypreds = get_row_eval(model,'hyperopt',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
4 | Xgboost | grid_search2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.981151 |
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
5 | Xgboost | hyperopt | 0.956140 | 0.930233 | 0.952381 | 0.941176 | 0.955357 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
from bp import plotly_binary_clf_evaluation
# help(plotly_binary_clf_evaluation)
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,
n_estimators=1000,**params_best)
model.fit(df_Xtrain, ser_ytrain)
ypreds = model.predict(df_Xtest)
yprobs = model.predict_proba(df_Xtest)
yprobs = yprobs[:,0] # keep only the first column, i.e. P(class 0 = benign)
plotly_binary_clf_evaluation('xgb_gridsearch2',model,ytx,ypreds,yprobs,df_Xtrain)
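plotly_binary_clf_evaluation comes from the personal bp library; with plain scikit-learn, a comparable ROC curve could be sketched as follows (note that roc_curve and roc_auc_score expect the positive-class probabilities, i.e. column 1 of predict_proba):
# sketch: ROC curve with plain scikit-learn (bp-free alternative)
from sklearn.metrics import roc_curve, roc_auc_score
yprobs1 = model.predict_proba(df_Xtest)[:, 1]  # P(malignant)
fpr, tpr, _ = roc_curve(ytx, yprobs1)
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(roc_auc_score(ytx, yprobs1)))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()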
show_methods(bp, contains='classi')
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | get_binary_classification_report | get_binary_classification_scalar_metrics | get_binary_classification_scalar_metrics2 |
df_clf_report = bp.get_binary_classification_report(
'xgboost',
ytx,
ypreds,
desc='gridsearch2',
df_clf_report=None,
style_col='Recall_1',
show=True,
)
Model | Description | Precision_0 | Precision_1 | Recall_0 | Recall_1 | F1_Score_0 | F1_Score_1 | Support_0 | Support_1 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | xgboost | gridsearch2 | 0.986111 | 0.976190 | 0.986111 | 0.976190 | 0.986111 | 0.976190 | 72.000000 | 42.000000 |
df_eval = bp.get_binary_classification_scalar_metrics(
'xgboost',
model,
df_Xtest,
ytx,
ypreds,
desc='gridsearch2',
df_eval=None,
style_col='Recall',
show=True,
round_=None,
)
Model | Description | Accuracy | Precision | Recall | F1 | Mathews_Correlation_Coefficient | Cohens_Kappa | Area_Under_Precision_Curve | Area_Under_ROC_Curve | |
---|---|---|---|---|---|---|---|---|---|---|
0 | xgboost | gridsearch2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.962302 | 0.962302 | 0.995281 | 0.997024 |
import shap
shap.initjs()
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,
n_estimators=1000,**params_best)
model.fit(df_Xtrain, ser_ytrain)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
              random_state=100, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=3, subsample=0.6, tree_method='exact',
              validate_parameters=1, verbosity=None)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
print(df_Xtest.shape, shap_values.shape)
shap_values[0][:2]
(114, 30) (114, 30)
array([-0.16933775, -1.2920814 ], dtype=float32)
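For a single prediction, a force plot decomposes the model output into per-feature contributions (a sketch; shap.initjs() was already called above):
# sketch: explain the first test-set prediction with a force plot
shap.force_plot(explainer.expected_value, shap_values[0], df_Xtest.iloc[0])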
max_display = 30
shap.summary_plot(shap_values, df_Xtest, plot_type="bar",
max_display = max_display)
shap.summary_plot(shap_values, df_Xtest, plot_type='dot', max_display = max_display)
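Beyond the global summary, a dependence plot shows how the SHAP values for one feature vary with that feature's value; the feature chosen here is just an example from the dataset:
# sketch: SHAP dependence plot for one of the strongest features
shap.dependence_plot('concave points_worst', shap_values, df_Xtest)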
notebook_end_time = time.time()
time_taken = time.time() - notebook_start_time
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 19 secs