In this project we detect whether a given sample of medical data corresponds to a cancerous (malignant) cell or not.
The raw data has 33 columns: 30 numeric features, an id column, an empty Unnamed: 32 column, and the target feature, diagnosis.
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
SEED = 100
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
sns.set()
%matplotlib inline
# modelling
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# boosting
import xgboost
from xgboost import XGBClassifier
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2021-02-13

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

sklearn 0.23.1
numpy   1.19.5
pandas  1.1.4
seaborn 0.11.0
xgboost 1.2.0
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
import bp
df_eval = pd.DataFrame({'Model': [],
                        'Description': [],
                        'Accuracy': [],
                        'Precision': [],
                        'Recall': [],
                        'F1': [],
                        'AUC': [],
                        })
def show_methods(obj, ncols=4, contains=None):
    """Show the public attributes of an object as a dataframe with ncols columns."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
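As a quick usage example (illustrative only), this lists the public pandas DataFrame attributes whose names contain 'corr':
# example: public DataFrame attributes containing 'corr'
show_methods(pd.DataFrame, contains='corr')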
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')
print(df_train.shape)
df_train.head()
(455, 33)
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 905501 | B | 12.27 | 17.92 | 78.41 | 466.1 | 0.08685 | 0.06526 | 0.03211 | 0.02653 | 0.1966 | 0.05597 | 0.3342 | 1.7810 | 2.079 | 25.79 | 0.005888 | 0.02310 | 0.02059 | 0.010750 | 0.02578 | 0.002267 | 14.10 | 28.88 | 89.00 | 610.2 | 0.1240 | 0.1795 | 0.1377 | 0.09532 | 0.3455 | 0.06896 | NaN |
1 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.015570 | 0.01318 | 0.003892 | 18.98 | 34.12 | 126.70 | 1124.0 | 0.1139 | 0.3094 | 0.3403 | 0.14180 | 0.2218 | 0.07820 | NaN |
2 | 861103 | B | 11.45 | 20.97 | 73.81 | 401.5 | 0.11020 | 0.09362 | 0.04591 | 0.02233 | 0.1842 | 0.07005 | 0.3251 | 2.1740 | 2.077 | 24.62 | 0.010370 | 0.01706 | 0.02586 | 0.007506 | 0.01816 | 0.003976 | 13.11 | 32.16 | 84.53 | 525.1 | 0.1557 | 0.1676 | 0.1755 | 0.06127 | 0.2762 | 0.08851 | NaN |
3 | 86973702 | B | 14.44 | 15.18 | 93.97 | 640.1 | 0.09970 | 0.10210 | 0.08487 | 0.05532 | 0.1724 | 0.06081 | 0.2406 | 0.7394 | 2.120 | 21.20 | 0.005706 | 0.02297 | 0.03114 | 0.014930 | 0.01454 | 0.002528 | 15.85 | 19.85 | 108.60 | 766.9 | 0.1316 | 0.2735 | 0.3103 | 0.15990 | 0.2691 | 0.07683 | NaN |
4 | 8810703 | M | 28.11 | 18.47 | 188.50 | 2499.0 | 0.11420 | 0.15160 | 0.32010 | 0.15950 | 0.1648 | 0.05525 | 2.8730 | 1.4760 | 21.980 | 525.60 | 0.013450 | 0.02772 | 0.06389 | 0.014070 | 0.04783 | 0.004476 | 28.11 | 18.47 | 188.50 | 2499.0 | 0.1142 | 0.1516 | 0.3201 | 0.15950 | 0.1648 | 0.05525 | NaN |
target = 'diagnosis'
col_id = 'id'
cols_drop = ['id', 'Unnamed: 32']
df_train = df_train.drop(cols_drop, axis=1)
df_test = df_test.drop(cols_drop, axis=1)
df_train['diagnosis'] = df_train['diagnosis'].map({'B': 0, 'M': 1})
df_test['diagnosis'] = df_test['diagnosis'].map({'B': 0, 'M': 1})
# df_train.bp.describe()
cols = df_train.filter(regex='mean').columns
fig,ax = plt.subplots(5,2, figsize=(15,10))
df_train.query('diagnosis==0')[cols].plot(kind='density', subplots=True, sharex=False,
                                          sharey=False, fontsize=12, ax=ax)
df_train.query('diagnosis==1')[cols].plot(kind='density', subplots=True, sharex=False,
                                          sharey=False, fontsize=12, ax=ax, style='-.')
plt.suptitle('Density Plot for Benign (solid) and Malignant (dashdot) Cases',fontsize=18)
plt.savefig('images/densityplot_mean_features.png',dpi=300)
plt.show()
"""
Observation:
The density plots for benign and malignant cases are well separated.
This means the features we use here are useful for machine learning.
""";
df_train[target].value_counts(normalize=True)
0    0.626374
1    0.373626
Name: diagnosis, dtype: float64
df_test[target].value_counts(normalize=True)
# train and test have nearly the same class distribution.
# the data is imbalanced: there are almost twice as many benign cases as malignant ones.
0    0.631579
1    0.368421
Name: diagnosis, dtype: float64
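Because of this imbalance, a common XGBoost heuristic is to start scale_pos_weight near the ratio of negative to positive samples (a rough sketch; the value is tuned properly later in this notebook):
# heuristic starting point for scale_pos_weight: n_negative / n_positive
counts = df_train[target].value_counts()
print('suggested scale_pos_weight ~ {:.2f}'.format(counts[0] / counts[1]))  # ~1.68 on this split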
# bp.show_methods(bp, contains='corr') # my local functions
# select only the mean features
cols = df_train.filter(regex='_mean').columns.tolist()
df1 = df_train[cols].rename(columns=lambda x: x.replace('_mean',''))
bp.plot_corr(df1,xrot=90)
bp.plot_corr_style(df_train)
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
diagnosis | 1.00 | 0.73 | 0.43 | 0.75 | 0.71 | 0.37 | 0.61 | 0.69 | 0.78 | 0.34 | -0.03 | 0.57 | 0.02 | 0.56 | 0.54 | -0.05 | 0.29 | 0.23 | 0.41 | -0.03 | 0.07 | 0.78 | 0.47 | 0.79 | 0.74 | 0.42 | 0.58 | 0.65 | 0.79 | 0.39 | 0.31 |
radius_mean | 0.73 | 1.00 | 0.34 | 1.00 | 0.99 | 0.18 | 0.51 | 0.68 | 0.83 | 0.17 | -0.33 | 0.69 | -0.06 | 0.68 | 0.73 | -0.20 | 0.20 | 0.18 | 0.38 | -0.11 | -0.05 | 0.97 | 0.31 | 0.96 | 0.94 | 0.11 | 0.39 | 0.52 | 0.74 | 0.14 | -0.02 |
texture_mean | 0.43 | 0.34 | 1.00 | 0.35 | 0.34 | 0.00 | 0.26 | 0.31 | 0.31 | 0.08 | -0.07 | 0.26 | 0.36 | 0.27 | 0.26 | -0.00 | 0.20 | 0.14 | 0.19 | -0.01 | 0.05 | 0.37 | 0.91 | 0.37 | 0.36 | 0.11 | 0.29 | 0.32 | 0.32 | 0.11 | 0.13 |
perimeter_mean | 0.75 | 1.00 | 0.35 | 1.00 | 0.99 | 0.22 | 0.56 | 0.71 | 0.86 | 0.20 | -0.28 | 0.70 | -0.05 | 0.70 | 0.74 | -0.18 | 0.25 | 0.21 | 0.41 | -0.09 | -0.01 | 0.97 | 0.31 | 0.97 | 0.94 | 0.14 | 0.43 | 0.55 | 0.77 | 0.16 | 0.03 |
area_mean | 0.71 | 0.99 | 0.34 | 0.99 | 1.00 | 0.19 | 0.51 | 0.69 | 0.83 | 0.17 | -0.29 | 0.74 | -0.03 | 0.74 | 0.80 | -0.14 | 0.21 | 0.19 | 0.38 | -0.08 | -0.02 | 0.96 | 0.29 | 0.96 | 0.96 | 0.12 | 0.37 | 0.50 | 0.72 | 0.12 | -0.02 |
smoothness_mean | 0.37 | 0.18 | 0.00 | 0.22 | 0.19 | 1.00 | 0.65 | 0.52 | 0.56 | 0.58 | 0.56 | 0.30 | 0.10 | 0.30 | 0.25 | 0.33 | 0.31 | 0.24 | 0.37 | 0.18 | 0.27 | 0.22 | 0.06 | 0.25 | 0.22 | 0.79 | 0.46 | 0.44 | 0.51 | 0.39 | 0.49 |
compactness_mean | 0.61 | 0.51 | 0.26 | 0.56 | 0.51 | 0.65 | 1.00 | 0.88 | 0.82 | 0.60 | 0.55 | 0.49 | 0.07 | 0.53 | 0.45 | 0.14 | 0.75 | 0.55 | 0.63 | 0.20 | 0.51 | 0.54 | 0.26 | 0.59 | 0.52 | 0.56 | 0.87 | 0.82 | 0.82 | 0.49 | 0.70 |
concavity_mean | 0.69 | 0.68 | 0.31 | 0.71 | 0.69 | 0.52 | 0.88 | 1.00 | 0.92 | 0.50 | 0.32 | 0.63 | 0.11 | 0.65 | 0.61 | 0.11 | 0.67 | 0.69 | 0.68 | 0.16 | 0.45 | 0.68 | 0.30 | 0.72 | 0.67 | 0.43 | 0.73 | 0.88 | 0.85 | 0.37 | 0.50 |
concave points_mean | 0.78 | 0.83 | 0.31 | 0.86 | 0.83 | 0.56 | 0.82 | 0.92 | 1.00 | 0.47 | 0.14 | 0.70 | 0.06 | 0.71 | 0.69 | 0.04 | 0.48 | 0.42 | 0.60 | 0.08 | 0.25 | 0.83 | 0.30 | 0.86 | 0.81 | 0.44 | 0.64 | 0.74 | 0.91 | 0.34 | 0.35 |
symmetry_mean | 0.34 | 0.17 | 0.08 | 0.20 | 0.17 | 0.58 | 0.60 | 0.50 | 0.47 | 1.00 | 0.46 | 0.30 | 0.12 | 0.30 | 0.22 | 0.15 | 0.39 | 0.32 | 0.36 | 0.39 | 0.31 | 0.21 | 0.09 | 0.24 | 0.20 | 0.44 | 0.47 | 0.44 | 0.45 | 0.68 | 0.43 |
fractal_dimension_mean | -0.03 | -0.33 | -0.07 | -0.28 | -0.29 | 0.56 | 0.55 | 0.32 | 0.14 | 0.46 | 1.00 | -0.03 | 0.18 | 0.01 | -0.11 | 0.40 | 0.56 | 0.44 | 0.33 | 0.33 | 0.69 | -0.27 | -0.05 | -0.23 | -0.24 | 0.49 | 0.45 | 0.34 | 0.16 | 0.31 | 0.77 |
radius_se | 0.57 | 0.69 | 0.26 | 0.70 | 0.74 | 0.30 | 0.49 | 0.63 | 0.70 | 0.30 | -0.03 | 1.00 | 0.22 | 0.97 | 0.95 | 0.17 | 0.34 | 0.32 | 0.51 | 0.24 | 0.21 | 0.72 | 0.17 | 0.72 | 0.75 | 0.12 | 0.26 | 0.37 | 0.53 | 0.06 | 0.02 |
texture_se | 0.02 | -0.06 | 0.36 | -0.05 | -0.03 | 0.10 | 0.07 | 0.11 | 0.06 | 0.12 | 0.18 | 0.22 | 1.00 | 0.23 | 0.12 | 0.36 | 0.25 | 0.22 | 0.28 | 0.44 | 0.30 | -0.08 | 0.38 | -0.07 | -0.06 | -0.08 | -0.08 | -0.05 | -0.08 | -0.15 | -0.03 |
perimeter_se | 0.56 | 0.68 | 0.27 | 0.70 | 0.74 | 0.30 | 0.53 | 0.65 | 0.71 | 0.30 | 0.01 | 0.97 | 0.23 | 1.00 | 0.94 | 0.15 | 0.40 | 0.34 | 0.55 | 0.26 | 0.23 | 0.70 | 0.18 | 0.72 | 0.73 | 0.11 | 0.31 | 0.40 | 0.55 | 0.07 | 0.06 |
area_se | 0.54 | 0.73 | 0.26 | 0.74 | 0.80 | 0.25 | 0.45 | 0.61 | 0.69 | 0.22 | -0.11 | 0.95 | 0.12 | 0.94 | 1.00 | 0.08 | 0.27 | 0.25 | 0.41 | 0.13 | 0.12 | 0.75 | 0.18 | 0.75 | 0.81 | 0.11 | 0.26 | 0.37 | 0.53 | 0.04 | -0.01 |
smoothness_se | -0.05 | -0.20 | -0.00 | -0.18 | -0.14 | 0.33 | 0.14 | 0.11 | 0.04 | 0.15 | 0.40 | 0.17 | 0.36 | 0.15 | 0.08 | 1.00 | 0.34 | 0.27 | 0.33 | 0.42 | 0.44 | -0.21 | -0.09 | -0.20 | -0.17 | 0.30 | -0.05 | -0.05 | -0.09 | -0.13 | 0.11 |
compactness_se | 0.29 | 0.20 | 0.20 | 0.25 | 0.21 | 0.31 | 0.75 | 0.67 | 0.48 | 0.39 | 0.56 | 0.34 | 0.25 | 0.40 | 0.27 | 0.34 | 1.00 | 0.79 | 0.73 | 0.40 | 0.81 | 0.20 | 0.14 | 0.25 | 0.19 | 0.22 | 0.68 | 0.65 | 0.48 | 0.25 | 0.60 |
concavity_se | 0.23 | 0.18 | 0.14 | 0.21 | 0.19 | 0.24 | 0.55 | 0.69 | 0.42 | 0.32 | 0.44 | 0.32 | 0.22 | 0.34 | 0.25 | 0.27 | 0.79 | 1.00 | 0.77 | 0.30 | 0.73 | 0.17 | 0.09 | 0.20 | 0.17 | 0.14 | 0.45 | 0.66 | 0.42 | 0.15 | 0.42 |
concave points_se | 0.41 | 0.38 | 0.19 | 0.41 | 0.38 | 0.37 | 0.63 | 0.68 | 0.60 | 0.36 | 0.33 | 0.51 | 0.28 | 0.55 | 0.41 | 0.33 | 0.73 | 0.77 | 1.00 | 0.30 | 0.62 | 0.36 | 0.10 | 0.39 | 0.34 | 0.19 | 0.43 | 0.55 | 0.59 | 0.09 | 0.30 |
symmetry_se | -0.03 | -0.11 | -0.01 | -0.09 | -0.08 | 0.18 | 0.20 | 0.16 | 0.08 | 0.39 | 0.33 | 0.24 | 0.44 | 0.26 | 0.13 | 0.42 | 0.40 | 0.30 | 0.30 | 1.00 | 0.38 | -0.14 | -0.12 | -0.12 | -0.12 | -0.04 | 0.03 | 0.01 | -0.05 | 0.31 | 0.07 |
fractal_dimension_se | 0.07 | -0.05 | 0.05 | -0.01 | -0.02 | 0.27 | 0.51 | 0.45 | 0.25 | 0.31 | 0.69 | 0.21 | 0.30 | 0.23 | 0.12 | 0.44 | 0.81 | 0.73 | 0.62 | 0.38 | 1.00 | -0.05 | -0.01 | -0.01 | -0.03 | 0.15 | 0.38 | 0.38 | 0.21 | 0.08 | 0.58 |
radius_worst | 0.78 | 0.97 | 0.37 | 0.97 | 0.96 | 0.22 | 0.54 | 0.68 | 0.83 | 0.21 | -0.27 | 0.72 | -0.08 | 0.70 | 0.75 | -0.21 | 0.20 | 0.17 | 0.36 | -0.14 | -0.05 | 1.00 | 0.37 | 0.99 | 0.98 | 0.22 | 0.46 | 0.56 | 0.79 | 0.23 | 0.07 |
texture_worst | 0.47 | 0.31 | 0.91 | 0.31 | 0.29 | 0.06 | 0.26 | 0.30 | 0.30 | 0.09 | -0.05 | 0.17 | 0.38 | 0.18 | 0.18 | -0.09 | 0.14 | 0.09 | 0.10 | -0.12 | -0.01 | 0.37 | 1.00 | 0.38 | 0.36 | 0.26 | 0.38 | 0.38 | 0.38 | 0.25 | 0.23 |
perimeter_worst | 0.79 | 0.96 | 0.37 | 0.97 | 0.96 | 0.25 | 0.59 | 0.72 | 0.86 | 0.24 | -0.23 | 0.72 | -0.07 | 0.72 | 0.75 | -0.20 | 0.25 | 0.20 | 0.39 | -0.12 | -0.01 | 0.99 | 0.38 | 1.00 | 0.98 | 0.23 | 0.51 | 0.60 | 0.81 | 0.25 | 0.12 |
area_worst | 0.74 | 0.94 | 0.36 | 0.94 | 0.96 | 0.22 | 0.52 | 0.67 | 0.81 | 0.20 | -0.24 | 0.75 | -0.06 | 0.73 | 0.81 | -0.17 | 0.19 | 0.17 | 0.34 | -0.12 | -0.03 | 0.98 | 0.36 | 0.98 | 1.00 | 0.21 | 0.42 | 0.53 | 0.75 | 0.19 | 0.06 |
smoothness_worst | 0.42 | 0.11 | 0.11 | 0.14 | 0.12 | 0.79 | 0.56 | 0.43 | 0.44 | 0.44 | 0.49 | 0.12 | -0.08 | 0.11 | 0.11 | 0.30 | 0.22 | 0.14 | 0.19 | -0.04 | 0.15 | 0.22 | 0.26 | 0.23 | 0.21 | 1.00 | 0.57 | 0.52 | 0.55 | 0.52 | 0.62 |
compactness_worst | 0.58 | 0.39 | 0.29 | 0.43 | 0.37 | 0.46 | 0.87 | 0.73 | 0.64 | 0.47 | 0.45 | 0.26 | -0.08 | 0.31 | 0.26 | -0.05 | 0.68 | 0.45 | 0.43 | 0.03 | 0.38 | 0.46 | 0.38 | 0.51 | 0.42 | 0.57 | 1.00 | 0.88 | 0.79 | 0.61 | 0.81 |
concavity_worst | 0.65 | 0.52 | 0.32 | 0.55 | 0.50 | 0.44 | 0.82 | 0.88 | 0.74 | 0.44 | 0.34 | 0.37 | -0.05 | 0.40 | 0.37 | -0.05 | 0.65 | 0.66 | 0.55 | 0.01 | 0.38 | 0.56 | 0.38 | 0.60 | 0.53 | 0.52 | 0.88 | 1.00 | 0.85 | 0.51 | 0.68 |
concave points_worst | 0.79 | 0.74 | 0.32 | 0.77 | 0.72 | 0.51 | 0.82 | 0.85 | 0.91 | 0.45 | 0.16 | 0.53 | -0.08 | 0.55 | 0.53 | -0.09 | 0.48 | 0.42 | 0.59 | -0.05 | 0.21 | 0.79 | 0.38 | 0.81 | 0.75 | 0.55 | 0.79 | 0.85 | 1.00 | 0.49 | 0.50 |
symmetry_worst | 0.39 | 0.14 | 0.11 | 0.16 | 0.12 | 0.39 | 0.49 | 0.37 | 0.34 | 0.68 | 0.31 | 0.06 | -0.15 | 0.07 | 0.04 | -0.13 | 0.25 | 0.15 | 0.09 | 0.31 | 0.08 | 0.23 | 0.25 | 0.25 | 0.19 | 0.52 | 0.61 | 0.51 | 0.49 | 1.00 | 0.54 |
fractal_dimension_worst | 0.31 | -0.02 | 0.13 | 0.03 | -0.02 | 0.49 | 0.70 | 0.50 | 0.35 | 0.43 | 0.77 | 0.02 | -0.03 | 0.06 | -0.01 | 0.11 | 0.60 | 0.42 | 0.30 | 0.07 | 0.58 | 0.07 | 0.23 | 0.12 | 0.06 | 0.62 | 0.81 | 0.68 | 0.50 | 0.54 | 1.00 |
bp.get_high_correlated_features_df(df_train).head()
feature1 | feature2 | corr | |
---|---|---|---|
31 | radius_mean | perimeter_mean | 0.998112 |
33 | radius_worst | perimeter_worst | 0.994136 |
35 | radius_mean | area_mean | 0.987089 |
37 | perimeter_mean | area_mean | 0.986662 |
39 | radius_worst | area_worst | 0.983782 |
# bp.show_methods(df_train.bp)
df_train.bp.corr_high(thr=0.98)
cols_high_corr = ['area_mean', 'radius_worst', 'radius_mean', 'area_worst', 'perimeter_worst', 'perimeter_mean']
cols_high_corr1 = ['radius_mean', 'radius_worst', 'radius_mean', 'perimeter_mean', 'radius_worst']
cols_high_corr2 = ['perimeter_mean', 'perimeter_worst', 'area_mean', 'area_mean', 'area_worst']
cols_high_corr_drop = ['radius_mean', 'radius_worst']
feature1 | feature2 | corr | |
---|---|---|---|
0 | radius_mean | perimeter_mean | 0.998112 |
1 | radius_worst | perimeter_worst | 0.994136 |
2 | radius_mean | area_mean | 0.987089 |
3 | perimeter_mean | area_mean | 0.986662 |
4 | radius_worst | area_worst | 0.983782 |
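Since bp is a personal library, here is a rough plain-pandas equivalent of this high-correlation pair listing (a sketch, not the exact bp implementation):
# sketch: feature pairs with correlation above a threshold, using plain pandas
corr = df_train.drop(target, axis=1).corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
pairs = (upper.stack()
              .rename('corr')
              .reset_index()
              .rename(columns={'level_0': 'feature1', 'level_1': 'feature2'})
              .sort_values('corr', ascending=False))
pairs[pairs['corr'] > 0.98]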
cols_high_corr_drop = ['area_worst', 'perimeter_mean', 'perimeter_se']
df_train2 = df_train.drop(cols_high_corr_drop,axis=1)
df_test2 = df_test.drop(cols_high_corr_drop,axis=1)
import xgboost as xgb
import sklearn.metrics as skmetrics
def get_row_eval(model, desc, df_eval, sort='F1'):
    """Fit the model on the global train split, predict on the global test
    split, and return one row of evaluation metrics plus the predictions.

    Note: relies on the globals df_Xtrain, ser_ytrain, df_Xtest, ser_ytest;
    the df_eval and sort arguments are not used inside the function, and
    AUC here is computed from hard predictions, not probabilities.
    """
    model.fit(df_Xtrain, ser_ytrain)
    ypreds = model.predict(df_Xtest)
    ytx = np.array(ser_ytest).flatten()
    average = 'binary'
    row_eval = ['Xgboost', desc,
                skmetrics.accuracy_score(ytx, ypreds),
                skmetrics.precision_score(ytx, ypreds, average=average),
                skmetrics.recall_score(ytx, ypreds, average=average),
                skmetrics.f1_score(ytx, ypreds, average=average),
                skmetrics.roc_auc_score(ytx, ypreds)]
    return row_eval, ypreds
df_Xtrain = df_train.drop(target,axis=1)
ser_ytrain = df_train[target]
df_Xtest = df_test.drop(target,axis=1)
ser_ytest = df_test[target]
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval,ypreds = get_row_eval(model,'default',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
df_Xtrain = df_train2.drop(target,axis=1)
ser_ytrain = df_train2[target]
df_Xtest = df_test2.drop(target,axis=1)
ser_ytest = df_test2[target]
# fitting
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval,ypreds = get_row_eval(model,'corr_thr<0.98',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
# removing the highly correlated features gave a slightly worse result.
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.97500 | 0.928571 | 0.951220 | 0.957341 |
from sklearn.feature_selection import RFECV
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
est = RFECV(model,step=1,cv=5,scoring='roc_auc',n_jobs=-1)
est.fit(df_Xtrain,ser_ytrain)
print('Optimal features =',est.n_features_)
print(' Best features =', df_Xtrain.columns[est.support_])
Optimal features = 15
 Best features = Index(['texture_mean', 'smoothness_mean', 'concave points_mean',
       'fractal_dimension_mean', 'radius_se', 'texture_se', 'area_se',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst'],
      dtype='object')
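To see how the cross-validated score varies with the number of features kept, we can plot the per-step scores stored on the fitted selector (grid_scores_ exists in the sklearn 0.23 used here; newer versions expose cv_results_ instead):
# sketch: CV roc_auc vs number of selected features
plt.plot(range(1, len(est.grid_scores_) + 1), est.grid_scores_)
plt.xlabel('Number of features selected')
plt.ylabel('CV roc_auc')
plt.show()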
# NOTE: this hand-picked list differs from the RFECV selection printed above
# (it appears to come from an earlier run).
cols = ['texture_mean', 'area_mean', 'smoothness_mean', 'concave points_mean',
        'radius_se', 'area_se', 'symmetry_se', 'radius_worst', 'texture_worst',
        'perimeter_worst', 'area_worst', 'smoothness_worst', 'concavity_worst',
        'concave points_worst']
df_Xtrain = df_train.drop(target,axis=1)[cols]
ser_ytrain = df_train[target]
df_Xtest = df_test.drop(target,axis=1)[cols]
ser_ytest = df_test[target]
# fitting
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval,ypreds = get_row_eval(model,'RFECV',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
# RFECV gave a worse result, so reset the data to use all features.
df_Xtrain = df_train.drop(target,axis=1)
ser_ytrain = df_train[target]
df_Xtest = df_test.drop(target,axis=1)
ser_ytest = df_test[target]
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.97500 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.95122 | 0.928571 | 0.939759 | 0.950397 |
Important Parameters:
learning_rate: step-size shrinkage applied at each boosting round to prevent overfitting. Range is [0, 1].
max_depth: how deeply each tree is allowed to grow during any boosting round.
subsample: fraction of the training samples used per tree. Too low a value can lead to underfitting.
colsample_bytree: fraction of the features used per tree. Too high a value can lead to overfitting.
n_estimators: the number of trees to build.
Regularization parameters:
gamma: the minimum loss reduction required to split a node; a higher value leads to fewer splits. Supported only for tree-based learners.
alpha: L1 regularization on leaf weights. A larger value means more regularization.
lambda: L2 regularization on leaf weights; smoother than L1 regularization.
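As a hedged illustration of the parameters just listed (the values below are arbitrary, not tuned), these knobs map onto XGBClassifier keyword arguments; note that alpha and lambda are spelled reg_alpha and reg_lambda in the sklearn API:
# illustration only: arbitrary (untuned) values for the parameters described above
model_demo = xgb.XGBClassifier(
    learning_rate=0.1,     # step-size shrinkage per boosting round
    max_depth=5,           # maximum tree depth
    subsample=0.8,         # fraction of rows sampled per tree
    colsample_bytree=0.8,  # fraction of columns sampled per tree
    n_estimators=200,      # number of boosting rounds (trees)
    gamma=0.1,             # min loss reduction required to split
    reg_alpha=0.01,        # L1 penalty on leaf weights (alias: alpha)
    reg_lambda=1.0,        # L2 penalty on leaf weights (alias: lambda)
    n_jobs=-1, random_state=SEED)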
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
model
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              random_state=100, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)
# %%time
# params_grid = {
# 'max_depth': [4,5,6,7,8,9,10,11,None],
# 'subsample': [0.6,0.7,0.8,0.9,1],
# 'scale_pos_weight': [1,2,3,5,10,30,40],
# }
# model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
# skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)
# est = GridSearchCV(model,params_grid,
# cv = skf,
# verbose=2,
# n_jobs = -1,
# scoring='f1')
# # Fit the random search model
# est.fit(df_Xtrain, ser_ytrain) # comment this
# params_best = est.best_params_
# NOTE: comment grid search after done.
# Wall time: 7min 8s
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
params_best
{'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED, **params_best)
row_eval,ypreds = get_row_eval(model,'grid_search',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,n_estimators=1000,**params_best)
row_eval,ypreds = get_row_eval(model,'grid_search2',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
ypreds_best = ypreds
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
4 | Xgboost | grid_search2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.981151 |
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
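Raising n_estimators to 1000 helped here, but a large fixed tree count can overfit; xgboost's early stopping can choose the number of rounds on a validation set instead. A sketch (using the test split as the eval set, which leaks information; a separate holdout would be cleaner):
# sketch: let early stopping pick the number of boosting rounds
model_es = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,
                             n_estimators=1000, **params_best)
model_es.fit(df_Xtrain, ser_ytrain,
             eval_set=[(df_Xtest, ser_ytest)],
             eval_metric='auc',
             early_stopping_rounds=50,
             verbose=False)
print('best iteration:', model_es.best_iteration)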
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
def hpo_hyperopt(param_space, Xtrain, ytrain, Xtest, ytest, num_eval, cv=5, fixed_params=None):
    """HPO using the hyperopt package."""
    fixed_params = fixed_params if fixed_params is not None else {}
    # time
    time_start = time.time()

    # define the objective function (hyperopt minimizes, so negate the score)
    def objective_function(params):
        model = xgb.XGBClassifier(**params, **fixed_params)
        score = sklearn.model_selection.cross_val_score(model,
                                                        Xtrain, ytrain,
                                                        cv=cv, scoring='f1')
        score = score.mean()
        return {'loss': -score, 'status': STATUS_OK}

    # keep track of trials
    trials = Trials()

    # best params
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=num_eval,
                      trials=trials,
                      rstate=np.random.RandomState(SEED))

    # dict of best params
    dict_best_params = copy.copy(best_param)
    if 'boosting_type' in dict_best_params:
        dict_best_params['boosting_type'] = 'gbdt' if dict_best_params['boosting_type'] == 0 else 'dart'
    int_params = ['max_depth', 'num_leaves', 'n_estimators']
    for int_param in int_params:
        # cast to integer if present (fmin returns floats for quniform)
        if int_param in dict_best_params:
            dict_best_params[int_param] = int(dict_best_params[int_param])

    # losses of all trials
    loss = [x['result']['loss'] for x in trials.trials]

    # refit the best model
    model_best = xgb.XGBClassifier(**dict_best_params)
    model_best.fit(Xtrain, ytrain)

    time_taken = time.time() - time_start
    print("\nResults\n" + '=' * 50)
    print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken, 60)))
    print("Number of parameter combinations tested: ", num_eval)
    print("Train Score Best : {:.4f} ".format(min(loss) * -1))
    print("Test Score       : {:.4f} ".format(model_best.score(Xtest, ytest)))
    print("Best parameters:")
    pp.pprint(dict_best_params)
    return trials, dict_best_params
params_hyp = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
    # 'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 50)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 1, 100, 1),
    # regularization
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 0.1),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 0.1),
    'gamma': hp.uniform('gamma', 0.1, 0.5),
}
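The sample helper imported earlier can draw one random point from this space, a quick sanity check that the distributions look sensible before running the search:
# draw one random configuration from the search space
pp.pprint(sample(params_hyp))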
# current values
Xtr = df_Xtrain
ytr = ser_ytrain
Xtx = df_Xtest
ytx = ser_ytest
# fixed_params = {'n_estimators': 1000}
# trials, dict_best_params = hpo_hyperopt(params_hyp, Xtr, ytr, Xtx, ytx,
# num_eval=100,fixed_params=fixed_params)
dict_best_params = {
'gamma': 0.29426928529915647,
'learning_rate': 0.0227779530532774,
'max_depth': 4,
'min_child_weight': 1.0,
'reg_alpha': 0.019685023503677693,
'reg_lambda': 0.0538168932849033,
'scale_pos_weight': 2.0,
'subsample': 0.7231941501698588}
model = XGBClassifier(n_jobs=-1, random_state=SEED,**dict_best_params)
row_eval,ypreds = get_row_eval(model,'hyperopt',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
display(df_eval)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
---|---|---|---|---|---|---|---|
4 | Xgboost | grid_search2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.981151 |
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
5 | Xgboost | hyperopt | 0.956140 | 0.930233 | 0.952381 | 0.941176 | 0.955357 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
from bp import plotly_binary_clf_evaluation
# help(plotly_binary_clf_evaluation)
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,
n_estimators=1000,**params_best)
model.fit(df_Xtrain, ser_ytrain)
ypreds = model.predict(df_Xtest)
yprobs = model.predict_proba(df_Xtest)
yprobs = yprobs[:,0] # keep only the first column, i.e. P(class 0 = benign)
plotly_binary_clf_evaluation('xgb_gridsearch2',model,ytx,ypreds,yprobs,df_Xtrain)
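plotly_binary_clf_evaluation comes from the personal bp library; with plain scikit-learn, a comparable ROC curve could be sketched as follows (note that roc_curve and roc_auc_score expect the positive-class probabilities, i.e. column 1 of predict_proba):
# sketch: ROC curve with plain scikit-learn (bp-free alternative)
from sklearn.metrics import roc_curve, roc_auc_score
yprobs1 = model.predict_proba(df_Xtest)[:, 1]  # P(malignant)
fpr, tpr, _ = roc_curve(ytx, yprobs1)
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(roc_auc_score(ytx, yprobs1)))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()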
show_methods(bp, contains='classi')
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | get_binary_classification_report | get_binary_classification_scalar_metrics | get_binary_classification_scalar_metrics2 |
df_clf_report = bp.get_binary_classification_report(
'xgboost',
ytx,
ypreds,
desc='gridsearch2',
df_clf_report=None,
style_col='Recall_1',
show=True,
)
Model | Description | Precision_0 | Precision_1 | Recall_0 | Recall_1 | F1_Score_0 | F1_Score_1 | Support_0 | Support_1 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | xgboost | gridsearch2 | 0.986111 | 0.976190 | 0.986111 | 0.976190 | 0.986111 | 0.976190 | 72.000000 | 42.000000 |
df_eval = bp.get_binary_classification_scalar_metrics(
'xgboost',
model,
df_Xtest,
ytx,
ypreds,
desc='gridsearch2',
df_eval=None,
style_col='Recall',
show=True,
round_=None,
)
Model | Description | Accuracy | Precision | Recall | F1 | Mathews_Correlation_Coefficient | Cohens_Kappa | Area_Under_Precision_Curve | Area_Under_ROC_Curve | |
---|---|---|---|---|---|---|---|---|---|---|
0 | xgboost | gridsearch2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.962302 | 0.962302 | 0.995281 | 0.997024 |
import shap
shap.initjs()
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,
n_estimators=1000,**params_best)
model.fit(df_Xtrain, ser_ytrain)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
              random_state=100, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=3, subsample=0.6, tree_method='exact',
              validate_parameters=1, verbosity=None)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
print(df_Xtest.shape, shap_values.shape)
shap_values[0][:2]
(114, 30) (114, 30)
array([-0.16933775, -1.2920814 ], dtype=float32)
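For a single prediction, a force plot decomposes the model output into per-feature contributions (a sketch; shap.initjs() was already called above):
# sketch: explain the first test-set prediction with a force plot
shap.force_plot(explainer.expected_value, shap_values[0], df_Xtest.iloc[0])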
max_display = 30
shap.summary_plot(shap_values, df_Xtest, plot_type="bar",
max_display = max_display)
shap.summary_plot(shap_values, df_Xtest, plot_type='dot', max_display = max_display)
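Beyond the global summary, a dependence plot shows how the SHAP values for one feature vary with that feature's value; the feature chosen here is just an example from the dataset:
# sketch: SHAP dependence plot for one of the strongest features
shap.dependence_plot('concave points_worst', shap_values, df_Xtest)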
notebook_end_time = time.time()
time_taken = time.time() - notebook_start_time
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 19 secs