In this project we detect whether a given sample of medical data corresponds to a cancerous (malignant) cell or not.
The raw data has 33 columns: an `id`, the target column `diagnosis`, 30 numeric features, and a trailing `Unnamed: 32` column.
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
SEED = 100
# NOTE(review): the source line read "...update({'font.size': 16})'ggplot')";
# the lost call is presumably plt.style.use — confirm against the notebook.
# The style sheet is applied first because it carries its own font size,
# which would otherwise overwrite the explicit override below.
plt.style.use('ggplot')

# Project-wide defaults: 8x8 inch figures and a larger base font.
plt.rcParams['figure.figsize'] = 8, 8
plt.rcParams.update({'font.size': 16})
%matplotlib inline
# modelling
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# boosting
import xgboost
from xgboost import XGBClassifier
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
%watermark -iv
Bhishan Poudel 2021-02-13 CPython 3.7.7 IPython 7.19.0 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit sklearn 0.23.1 numpy 1.19.5 pandas 1.1.4 seaborn 0.11.0 xgboost 1.2.0
# my local library
import sys
import bp
# Running table of evaluation metrics; each experiment appends one row.
# NOTE(review): the original statement was cut off after 'Model'; the other
# columns are restored from the metric tables printed later in the notebook
# and from the seven-element rows built by get_row_eval — confirm.
df_eval = pd.DataFrame({'Model': [],
                        'Description': [],
                        'Accuracy': [],
                        'Precision': [],
                        'Recall': [],
                        'F1': [],
                        'AUC': []})
def show_methods(obj, ncols=4, contains=None):
    """Tabulate the public attribute names of ``obj``.

    Parameters
    ----------
    obj : object
        Any object accepted by ``dir()``.
    ncols : int, default 4
        Number of columns the names are spread over.
    contains : str, optional
        When given, only names that include this substring are kept.

    Returns
    -------
    pandas.DataFrame
        The names laid out column by column; unused cells hold ``''``.
    """
    # startswith() also copes with an empty name, where name[0] would raise.
    names = [name for name in dir(obj) if not name.startswith('_')]
    if contains is not None:
        names = [name for name in names if contains in name]

    # array_split accepts a name count that is not a multiple of ncols; the
    # shorter chunks leave missing cells, which are blanked out by fillna.
    return pd.DataFrame(np.array_split(names, ncols)).T.fillna('')
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')
(455, 33)
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
0 | 905501 | B | 12.27 | 17.92 | 78.41 | 466.1 | 0.08685 | 0.06526 | 0.03211 | 0.02653 | 0.1966 | 0.05597 | 0.3342 | 1.7810 | 2.079 | 25.79 | 0.005888 | 0.02310 | 0.02059 | 0.010750 | 0.02578 | 0.002267 | 14.10 | 28.88 | 89.00 | 610.2 | 0.1240 | 0.1795 | 0.1377 | 0.09532 | 0.3455 | 0.06896 | NaN |
1 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.015570 | 0.01318 | 0.003892 | 18.98 | 34.12 | 126.70 | 1124.0 | 0.1139 | 0.3094 | 0.3403 | 0.14180 | 0.2218 | 0.07820 | NaN |
2 | 861103 | B | 11.45 | 20.97 | 73.81 | 401.5 | 0.11020 | 0.09362 | 0.04591 | 0.02233 | 0.1842 | 0.07005 | 0.3251 | 2.1740 | 2.077 | 24.62 | 0.010370 | 0.01706 | 0.02586 | 0.007506 | 0.01816 | 0.003976 | 13.11 | 32.16 | 84.53 | 525.1 | 0.1557 | 0.1676 | 0.1755 | 0.06127 | 0.2762 | 0.08851 | NaN |
3 | 86973702 | B | 14.44 | 15.18 | 93.97 | 640.1 | 0.09970 | 0.10210 | 0.08487 | 0.05532 | 0.1724 | 0.06081 | 0.2406 | 0.7394 | 2.120 | 21.20 | 0.005706 | 0.02297 | 0.03114 | 0.014930 | 0.01454 | 0.002528 | 15.85 | 19.85 | 108.60 | 766.9 | 0.1316 | 0.2735 | 0.3103 | 0.15990 | 0.2691 | 0.07683 | NaN |
4 | 8810703 | M | 28.11 | 18.47 | 188.50 | 2499.0 | 0.11420 | 0.15160 | 0.32010 | 0.15950 | 0.1648 | 0.05525 | 2.8730 | 1.4760 | 21.980 | 525.60 | 0.013450 | 0.02772 | 0.06389 | 0.014070 | 0.04783 | 0.004476 | 28.11 | 18.47 | 188.50 | 2499.0 | 0.1142 | 0.1516 | 0.3201 | 0.15950 | 0.1648 | 0.05525 | NaN |
# Column roles: the label to predict and the row identifier.
target = 'diagnosis'
col_id = 'id'

# Columns removed before modelling: the identifier and the 'Unnamed: 32'
# column (it shows only NaN in the data preview).
cols_drop = ['id', 'Unnamed: 32']
df_train = df_train.drop(columns=cols_drop)
df_test = df_test.drop(columns=cols_drop)

# Encode the label as integers: 'B' -> 0 and 'M' -> 1.
df_train['diagnosis'] = df_train['diagnosis'].map(dict(B=0, M=1))
df_test['diagnosis'] = df_test['diagnosis'].map(dict(B=0, M=1))
# df_train.bp.describe()
cols = df_train.filter(regex='mean').columns
fig,ax = plt.subplots(5,2, figsize=(15,10))
df_train.query('diagnosis==0')[cols].plot(kind= 'density', subplots=True, sharex=False,
df_train.query('diagnosis==1')[cols].plot(kind= 'density', subplots=True, sharex=False,
plt.suptitle('Density Plot for Benign (solid) and Malignant (dashdot) Cases',fontsize=18)
The density plots for benign and malignant cases are well separated.
This means the features we use here are useful for machine learning.
0 0.626374 1 0.373626 Name: diagnosis, dtype: float64
# Train and test data have nearly the same class distribution.
# The data is imbalanced: there are about 1.7 times as many benign (0) cases as malignant (1) cases.
0 0.631579 1 0.368421 Name: diagnosis, dtype: float64
# bp.show_methods(bp, contains='corr') # my local functions
# select only the mean features
cols = df_train.filter(regex='_mean').columns.tolist()
df1 = df_train[cols].rename(columns=lambda x: x.replace('_mean',''))
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
diagnosis | 1.00 | 0.73 | 0.43 | 0.75 | 0.71 | 0.37 | 0.61 | 0.69 | 0.78 | 0.34 | -0.03 | 0.57 | 0.02 | 0.56 | 0.54 | -0.05 | 0.29 | 0.23 | 0.41 | -0.03 | 0.07 | 0.78 | 0.47 | 0.79 | 0.74 | 0.42 | 0.58 | 0.65 | 0.79 | 0.39 | 0.31 |
radius_mean | 0.73 | 1.00 | 0.34 | 1.00 | 0.99 | 0.18 | 0.51 | 0.68 | 0.83 | 0.17 | -0.33 | 0.69 | -0.06 | 0.68 | 0.73 | -0.20 | 0.20 | 0.18 | 0.38 | -0.11 | -0.05 | 0.97 | 0.31 | 0.96 | 0.94 | 0.11 | 0.39 | 0.52 | 0.74 | 0.14 | -0.02 |
texture_mean | 0.43 | 0.34 | 1.00 | 0.35 | 0.34 | 0.00 | 0.26 | 0.31 | 0.31 | 0.08 | -0.07 | 0.26 | 0.36 | 0.27 | 0.26 | -0.00 | 0.20 | 0.14 | 0.19 | -0.01 | 0.05 | 0.37 | 0.91 | 0.37 | 0.36 | 0.11 | 0.29 | 0.32 | 0.32 | 0.11 | 0.13 |
perimeter_mean | 0.75 | 1.00 | 0.35 | 1.00 | 0.99 | 0.22 | 0.56 | 0.71 | 0.86 | 0.20 | -0.28 | 0.70 | -0.05 | 0.70 | 0.74 | -0.18 | 0.25 | 0.21 | 0.41 | -0.09 | -0.01 | 0.97 | 0.31 | 0.97 | 0.94 | 0.14 | 0.43 | 0.55 | 0.77 | 0.16 | 0.03 |
area_mean | 0.71 | 0.99 | 0.34 | 0.99 | 1.00 | 0.19 | 0.51 | 0.69 | 0.83 | 0.17 | -0.29 | 0.74 | -0.03 | 0.74 | 0.80 | -0.14 | 0.21 | 0.19 | 0.38 | -0.08 | -0.02 | 0.96 | 0.29 | 0.96 | 0.96 | 0.12 | 0.37 | 0.50 | 0.72 | 0.12 | -0.02 |
smoothness_mean | 0.37 | 0.18 | 0.00 | 0.22 | 0.19 | 1.00 | 0.65 | 0.52 | 0.56 | 0.58 | 0.56 | 0.30 | 0.10 | 0.30 | 0.25 | 0.33 | 0.31 | 0.24 | 0.37 | 0.18 | 0.27 | 0.22 | 0.06 | 0.25 | 0.22 | 0.79 | 0.46 | 0.44 | 0.51 | 0.39 | 0.49 |
compactness_mean | 0.61 | 0.51 | 0.26 | 0.56 | 0.51 | 0.65 | 1.00 | 0.88 | 0.82 | 0.60 | 0.55 | 0.49 | 0.07 | 0.53 | 0.45 | 0.14 | 0.75 | 0.55 | 0.63 | 0.20 | 0.51 | 0.54 | 0.26 | 0.59 | 0.52 | 0.56 | 0.87 | 0.82 | 0.82 | 0.49 | 0.70 |
concavity_mean | 0.69 | 0.68 | 0.31 | 0.71 | 0.69 | 0.52 | 0.88 | 1.00 | 0.92 | 0.50 | 0.32 | 0.63 | 0.11 | 0.65 | 0.61 | 0.11 | 0.67 | 0.69 | 0.68 | 0.16 | 0.45 | 0.68 | 0.30 | 0.72 | 0.67 | 0.43 | 0.73 | 0.88 | 0.85 | 0.37 | 0.50 |
concave points_mean | 0.78 | 0.83 | 0.31 | 0.86 | 0.83 | 0.56 | 0.82 | 0.92 | 1.00 | 0.47 | 0.14 | 0.70 | 0.06 | 0.71 | 0.69 | 0.04 | 0.48 | 0.42 | 0.60 | 0.08 | 0.25 | 0.83 | 0.30 | 0.86 | 0.81 | 0.44 | 0.64 | 0.74 | 0.91 | 0.34 | 0.35 |
symmetry_mean | 0.34 | 0.17 | 0.08 | 0.20 | 0.17 | 0.58 | 0.60 | 0.50 | 0.47 | 1.00 | 0.46 | 0.30 | 0.12 | 0.30 | 0.22 | 0.15 | 0.39 | 0.32 | 0.36 | 0.39 | 0.31 | 0.21 | 0.09 | 0.24 | 0.20 | 0.44 | 0.47 | 0.44 | 0.45 | 0.68 | 0.43 |
fractal_dimension_mean | -0.03 | -0.33 | -0.07 | -0.28 | -0.29 | 0.56 | 0.55 | 0.32 | 0.14 | 0.46 | 1.00 | -0.03 | 0.18 | 0.01 | -0.11 | 0.40 | 0.56 | 0.44 | 0.33 | 0.33 | 0.69 | -0.27 | -0.05 | -0.23 | -0.24 | 0.49 | 0.45 | 0.34 | 0.16 | 0.31 | 0.77 |
radius_se | 0.57 | 0.69 | 0.26 | 0.70 | 0.74 | 0.30 | 0.49 | 0.63 | 0.70 | 0.30 | -0.03 | 1.00 | 0.22 | 0.97 | 0.95 | 0.17 | 0.34 | 0.32 | 0.51 | 0.24 | 0.21 | 0.72 | 0.17 | 0.72 | 0.75 | 0.12 | 0.26 | 0.37 | 0.53 | 0.06 | 0.02 |
texture_se | 0.02 | -0.06 | 0.36 | -0.05 | -0.03 | 0.10 | 0.07 | 0.11 | 0.06 | 0.12 | 0.18 | 0.22 | 1.00 | 0.23 | 0.12 | 0.36 | 0.25 | 0.22 | 0.28 | 0.44 | 0.30 | -0.08 | 0.38 | -0.07 | -0.06 | -0.08 | -0.08 | -0.05 | -0.08 | -0.15 | -0.03 |
perimeter_se | 0.56 | 0.68 | 0.27 | 0.70 | 0.74 | 0.30 | 0.53 | 0.65 | 0.71 | 0.30 | 0.01 | 0.97 | 0.23 | 1.00 | 0.94 | 0.15 | 0.40 | 0.34 | 0.55 | 0.26 | 0.23 | 0.70 | 0.18 | 0.72 | 0.73 | 0.11 | 0.31 | 0.40 | 0.55 | 0.07 | 0.06 |
area_se | 0.54 | 0.73 | 0.26 | 0.74 | 0.80 | 0.25 | 0.45 | 0.61 | 0.69 | 0.22 | -0.11 | 0.95 | 0.12 | 0.94 | 1.00 | 0.08 | 0.27 | 0.25 | 0.41 | 0.13 | 0.12 | 0.75 | 0.18 | 0.75 | 0.81 | 0.11 | 0.26 | 0.37 | 0.53 | 0.04 | -0.01 |
smoothness_se | -0.05 | -0.20 | -0.00 | -0.18 | -0.14 | 0.33 | 0.14 | 0.11 | 0.04 | 0.15 | 0.40 | 0.17 | 0.36 | 0.15 | 0.08 | 1.00 | 0.34 | 0.27 | 0.33 | 0.42 | 0.44 | -0.21 | -0.09 | -0.20 | -0.17 | 0.30 | -0.05 | -0.05 | -0.09 | -0.13 | 0.11 |
compactness_se | 0.29 | 0.20 | 0.20 | 0.25 | 0.21 | 0.31 | 0.75 | 0.67 | 0.48 | 0.39 | 0.56 | 0.34 | 0.25 | 0.40 | 0.27 | 0.34 | 1.00 | 0.79 | 0.73 | 0.40 | 0.81 | 0.20 | 0.14 | 0.25 | 0.19 | 0.22 | 0.68 | 0.65 | 0.48 | 0.25 | 0.60 |
concavity_se | 0.23 | 0.18 | 0.14 | 0.21 | 0.19 | 0.24 | 0.55 | 0.69 | 0.42 | 0.32 | 0.44 | 0.32 | 0.22 | 0.34 | 0.25 | 0.27 | 0.79 | 1.00 | 0.77 | 0.30 | 0.73 | 0.17 | 0.09 | 0.20 | 0.17 | 0.14 | 0.45 | 0.66 | 0.42 | 0.15 | 0.42 |
concave points_se | 0.41 | 0.38 | 0.19 | 0.41 | 0.38 | 0.37 | 0.63 | 0.68 | 0.60 | 0.36 | 0.33 | 0.51 | 0.28 | 0.55 | 0.41 | 0.33 | 0.73 | 0.77 | 1.00 | 0.30 | 0.62 | 0.36 | 0.10 | 0.39 | 0.34 | 0.19 | 0.43 | 0.55 | 0.59 | 0.09 | 0.30 |
symmetry_se | -0.03 | -0.11 | -0.01 | -0.09 | -0.08 | 0.18 | 0.20 | 0.16 | 0.08 | 0.39 | 0.33 | 0.24 | 0.44 | 0.26 | 0.13 | 0.42 | 0.40 | 0.30 | 0.30 | 1.00 | 0.38 | -0.14 | -0.12 | -0.12 | -0.12 | -0.04 | 0.03 | 0.01 | -0.05 | 0.31 | 0.07 |
fractal_dimension_se | 0.07 | -0.05 | 0.05 | -0.01 | -0.02 | 0.27 | 0.51 | 0.45 | 0.25 | 0.31 | 0.69 | 0.21 | 0.30 | 0.23 | 0.12 | 0.44 | 0.81 | 0.73 | 0.62 | 0.38 | 1.00 | -0.05 | -0.01 | -0.01 | -0.03 | 0.15 | 0.38 | 0.38 | 0.21 | 0.08 | 0.58 |
radius_worst | 0.78 | 0.97 | 0.37 | 0.97 | 0.96 | 0.22 | 0.54 | 0.68 | 0.83 | 0.21 | -0.27 | 0.72 | -0.08 | 0.70 | 0.75 | -0.21 | 0.20 | 0.17 | 0.36 | -0.14 | -0.05 | 1.00 | 0.37 | 0.99 | 0.98 | 0.22 | 0.46 | 0.56 | 0.79 | 0.23 | 0.07 |
texture_worst | 0.47 | 0.31 | 0.91 | 0.31 | 0.29 | 0.06 | 0.26 | 0.30 | 0.30 | 0.09 | -0.05 | 0.17 | 0.38 | 0.18 | 0.18 | -0.09 | 0.14 | 0.09 | 0.10 | -0.12 | -0.01 | 0.37 | 1.00 | 0.38 | 0.36 | 0.26 | 0.38 | 0.38 | 0.38 | 0.25 | 0.23 |
perimeter_worst | 0.79 | 0.96 | 0.37 | 0.97 | 0.96 | 0.25 | 0.59 | 0.72 | 0.86 | 0.24 | -0.23 | 0.72 | -0.07 | 0.72 | 0.75 | -0.20 | 0.25 | 0.20 | 0.39 | -0.12 | -0.01 | 0.99 | 0.38 | 1.00 | 0.98 | 0.23 | 0.51 | 0.60 | 0.81 | 0.25 | 0.12 |
area_worst | 0.74 | 0.94 | 0.36 | 0.94 | 0.96 | 0.22 | 0.52 | 0.67 | 0.81 | 0.20 | -0.24 | 0.75 | -0.06 | 0.73 | 0.81 | -0.17 | 0.19 | 0.17 | 0.34 | -0.12 | -0.03 | 0.98 | 0.36 | 0.98 | 1.00 | 0.21 | 0.42 | 0.53 | 0.75 | 0.19 | 0.06 |
smoothness_worst | 0.42 | 0.11 | 0.11 | 0.14 | 0.12 | 0.79 | 0.56 | 0.43 | 0.44 | 0.44 | 0.49 | 0.12 | -0.08 | 0.11 | 0.11 | 0.30 | 0.22 | 0.14 | 0.19 | -0.04 | 0.15 | 0.22 | 0.26 | 0.23 | 0.21 | 1.00 | 0.57 | 0.52 | 0.55 | 0.52 | 0.62 |
compactness_worst | 0.58 | 0.39 | 0.29 | 0.43 | 0.37 | 0.46 | 0.87 | 0.73 | 0.64 | 0.47 | 0.45 | 0.26 | -0.08 | 0.31 | 0.26 | -0.05 | 0.68 | 0.45 | 0.43 | 0.03 | 0.38 | 0.46 | 0.38 | 0.51 | 0.42 | 0.57 | 1.00 | 0.88 | 0.79 | 0.61 | 0.81 |
concavity_worst | 0.65 | 0.52 | 0.32 | 0.55 | 0.50 | 0.44 | 0.82 | 0.88 | 0.74 | 0.44 | 0.34 | 0.37 | -0.05 | 0.40 | 0.37 | -0.05 | 0.65 | 0.66 | 0.55 | 0.01 | 0.38 | 0.56 | 0.38 | 0.60 | 0.53 | 0.52 | 0.88 | 1.00 | 0.85 | 0.51 | 0.68 |
concave points_worst | 0.79 | 0.74 | 0.32 | 0.77 | 0.72 | 0.51 | 0.82 | 0.85 | 0.91 | 0.45 | 0.16 | 0.53 | -0.08 | 0.55 | 0.53 | -0.09 | 0.48 | 0.42 | 0.59 | -0.05 | 0.21 | 0.79 | 0.38 | 0.81 | 0.75 | 0.55 | 0.79 | 0.85 | 1.00 | 0.49 | 0.50 |
symmetry_worst | 0.39 | 0.14 | 0.11 | 0.16 | 0.12 | 0.39 | 0.49 | 0.37 | 0.34 | 0.68 | 0.31 | 0.06 | -0.15 | 0.07 | 0.04 | -0.13 | 0.25 | 0.15 | 0.09 | 0.31 | 0.08 | 0.23 | 0.25 | 0.25 | 0.19 | 0.52 | 0.61 | 0.51 | 0.49 | 1.00 | 0.54 |
fractal_dimension_worst | 0.31 | -0.02 | 0.13 | 0.03 | -0.02 | 0.49 | 0.70 | 0.50 | 0.35 | 0.43 | 0.77 | 0.02 | -0.03 | 0.06 | -0.01 | 0.11 | 0.60 | 0.42 | 0.30 | 0.07 | 0.58 | 0.07 | 0.23 | 0.12 | 0.06 | 0.62 | 0.81 | 0.68 | 0.50 | 0.54 | 1.00 |
feature1 | feature2 | corr | |
31 | radius_mean | perimeter_mean | 0.998112 |
33 | radius_worst | perimeter_worst | 0.994136 |
35 | radius_mean | area_mean | 0.987089 |
37 | perimeter_mean | area_mean | 0.986662 |
39 | radius_worst | area_worst | 0.983782 |
# bp.show_methods(df_train.bp)
cols_high_corr = ['area_mean', 'radius_worst', 'radius_mean', 'area_worst', 'perimeter_worst', 'perimeter_mean'] cols_high_corr1 = ['radius_mean', 'radius_worst', 'radius_mean', 'perimeter_mean', 'radius_worst'] cols_high_corr2 = ['perimeter_mean', 'perimeter_worst', 'area_mean', 'area_mean', 'area_worst'] cols_high_corr_drop = ['radius_mean', 'radius_worst']
feature1 | feature2 | corr | |
0 | radius_mean | perimeter_mean | 0.998112 |
1 | radius_worst | perimeter_worst | 0.994136 |
2 | radius_mean | area_mean | 0.987089 |
3 | perimeter_mean | area_mean | 0.986662 |
4 | radius_worst | area_worst | 0.983782 |
cols_high_corr_drop = ['area_worst', 'perimeter_mean', 'perimeter_se']
df_train2 = df_train.drop(cols_high_corr_drop,axis=1)
df_test2 = df_test.drop(cols_high_corr_drop,axis=1)
import xgboost as xgb
import sklearn.metrics as skmetrics
def get_row_eval(model, desc, df_eval, sort='F1'):
    """Fit ``model`` and score its predictions on the current test split.

    The data is not passed in: the function reads the module-level
    ``df_Xtest`` and ``ser_ytest`` (and, for fitting, ``df_Xtrain`` and
    ``ser_ytrain``), so callers must set those globals before calling.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``.
    desc : str
        Label stored in the 'Description' slot of the returned row.
    df_eval : pandas.DataFrame
        Not used by the body; kept so existing calls keep working.
    sort : str, default 'F1'
        Not used by the body; kept so existing calls keep working.

    Returns
    -------
    row_eval : list
        ``['Xgboost', desc, accuracy, precision, recall, f1, auc]``.
    ypreds : array-like
        Output of ``model.predict(df_Xtest)``.
    """
    # NOTE(review): the original line was fused with the signature as
    # "...:, ser_ytrain)"; the fit call is reconstructed from that fragment —
    # confirm against the original notebook.
    model.fit(df_Xtrain, ser_ytrain)

    ypreds = model.predict(df_Xtest)
    ytx = np.array(ser_ytest).flatten()
    average = 'binary'

    # NOTE(review): the AUC entry is computed from hard class predictions,
    # not from predicted probabilities.
    row_eval = ['Xgboost', desc,
                skmetrics.accuracy_score(ytx, ypreds),
                skmetrics.precision_score(ytx, ypreds, average=average),
                skmetrics.recall_score(ytx, ypreds, average=average),
                skmetrics.f1_score(ytx, ypreds, average=average),
                skmetrics.roc_auc_score(ytx, ypreds)]
    return row_eval, ypreds
# Baseline experiment: every feature, XGBoost with its default parameters.
df_Xtrain, ser_ytrain = df_train.drop(columns=target), df_train[target]
df_Xtest, ser_ytest = df_test.drop(columns=target), df_test[target]

model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval, ypreds = get_row_eval(model, 'default', df_eval)

# Append the new row, keep one row per (Model, Description), best F1 first.
df_eval.loc[len(df_eval)] = row_eval
df_eval = (
    df_eval
    .drop_duplicates(subset=['Model', 'Description'])
    .sort_values('F1', ascending=False)
)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
# Same experiment on the frames with the highly correlated columns removed.
df_Xtrain, ser_ytrain = df_train2.drop(columns=target), df_train2[target]
df_Xtest, ser_ytest = df_test2.drop(columns=target), df_test2[target]

# fitting
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval, ypreds = get_row_eval(model, 'corr_thr<0.98', df_eval)

# Append the new row, keep one row per (Model, Description), best F1 first.
df_eval.loc[len(df_eval)] = row_eval
df_eval = (
    df_eval
    .drop_duplicates(subset=['Model', 'Description'])
    .sort_values('F1', ascending=False)
)
# removing correlated features gave worse result.
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.97500 | 0.928571 | 0.951220 | 0.957341 |
from sklearn.feature_selection import RFECV
# Recursive feature elimination: drop one feature per step and score each
# subset with 5-fold cross-validated ROC AUC.
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
est = RFECV(model, step=1, cv=5, scoring='roc_auc', n_jobs=-1)
# NOTE(review): the original line ended in a dangling ",ser_ytrain)"; the
# fit call is reconstructed from that fragment — confirm.
est.fit(df_Xtrain, ser_ytrain)
print('Optimal features =',est.n_features_)
print(' Best features =', df_Xtrain.columns[est.support_])
Optimal features = 15 Best features = Index(['texture_mean', 'smoothness_mean', 'concave points_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'area_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst'], dtype='object')
# Feature subset used for the 'RFECV' experiment.
# NOTE(review): this hard-coded list has 14 names and does not match the 15
# features printed by the RFECV run recorded in this notebook (for example it
# contains 'area_mean', 'area_worst' and 'symmetry_se') — presumably copied
# from an earlier run; confirm which list is intended.
cols = ['texture_mean', 'area_mean', 'smoothness_mean', 'concave points_mean',
'radius_se', 'area_se', 'symmetry_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst', 'concavity_worst',
'concave points_worst']
# Rebuild the splits from the full frames, restricted to the columns above.
df_Xtrain = df_train.drop(target,axis=1)[cols]
ser_ytrain = df_train[target]
df_Xtest = df_test.drop(target,axis=1)[cols]
ser_ytest = df_test[target]
# Fit a default model on the reduced feature set and log its metrics.
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
row_eval,ypreds = get_row_eval(model,'RFECV',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
# RFECV gave a worse result, so restore the splits to the full feature set
# for the experiments that follow.
df_Xtrain = df_train.drop(target,axis=1)
ser_ytrain = df_train[target]
df_Xtest = df_test.drop(target,axis=1)
ser_ytest = df_test[target]
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
0 | Xgboost | default | 0.973684 | 0.97561 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.97500 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.95122 | 0.928571 | 0.939759 | 0.950397 |
Important Parameters:
learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1]
max_depth: determines how deeply each tree is allowed to grow during any boosting round.
subsample: percentage of samples used per tree. Low value can lead to underfitting.
colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
n_estimators: number of trees you want to build.
Regularization parameters:
gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners.
alpha: L1 regularization on leaf weights. A large value leads to more regularization.
lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=100, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact', validate_parameters=1, verbosity=None)
# %%time
# params_grid = {
# 'max_depth': [4,5,6,7,8,9,10,11,None],
# 'subsample': [0.6,0.7,0.8,0.9,1],
# 'scale_pos_weight': [1,2,3,5,10,30,40],
# }
# model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED)
# skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)
# est = GridSearchCV(model,params_grid,
# cv = skf,
# verbose=2,
# n_jobs = -1,
# scoring='f1')
# # Fit the grid search model
# est.fit(df_Xtrain, ser_ytrain) # comment this
# params_best = est.best_params_
# NOTE: comment grid search after done.
# Wall time: 7min 8s
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
{'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
# Refit with the parameters found by the grid search and log the metrics.
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED, **params_best)
row_eval, ypreds = get_row_eval(model, 'grid_search', df_eval)

# Append the new row, keep one row per (Model, Description), best F1 first.
df_eval.loc[len(df_eval)] = row_eval
df_eval = (
    df_eval
    .drop_duplicates(subset=['Model', 'Description'])
    .sort_values('F1', ascending=False)
)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
# Same grid-search parameters, but with 1000 boosting rounds.
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(
    n_jobs=-1,
    random_state=SEED,
    n_estimators=1000,
    **params_best,
)
row_eval, ypreds = get_row_eval(model, 'grid_search2', df_eval)

# Append the new row, keep one row per (Model, Description), best F1 first.
df_eval.loc[len(df_eval)] = row_eval
df_eval = (
    df_eval
    .drop_duplicates(subset=['Model', 'Description'])
    .sort_values('F1', ascending=False)
)

# Keep these predictions around under a separate name.
ypreds_best = ypreds
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
4 | Xgboost | grid_search2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.981151 |
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
def hpo_hyperopt(param_space, Xtrain, ytrain, Xtest, ytest, num_eval,cv=5,fixed_params={}):
"""HPO using hyperopt package.
# time
time_start = time.time()
# define objective function
def objective_function(params):
model = xgb.XGBClassifier(**params,**fixed_params)
score = sklearn.model_selection.cross_val_score(model,
Xtrain, ytrain,
score = score.mean()
return {'loss': -score, 'status': STATUS_OK}
# keep track of trials
trials = Trials()
# best params
best_param = fmin(objective_function,
rstate= np.random.RandomState(SEED))
# dict best params
dict_best_params = copy.copy(best_param)
if 'boosting_type' in dict_best_params:
dict_best_params['boosting_type'] = 'gbdt' if dict_best_params['boosting_type'] == 0 else 'dart'
int_params = ['max_depth','num_leaves','n_estimators']
for int_param in int_params:
# make integer if exist
if int_param in dict_best_params:
dict_best_params[int_param] = int(dict_best_params[int_param])
# loss
loss = [x['result']['loss'] for x in trials.trials]
# best model
model_best = xgb.XGBClassifier(**dict_best_params), ytrain)
time_taken = time.time() - time_start
print("\nResults\n" + '='*50)
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
print("Number of parameter combinations tested: ", num_eval)
print("Train Score Best : {:.4f} ".format(min(loss)*-1))
print("Test Score : {:.4f} ".format(model_best.score(Xtest, ytest)))
print("Best parameters:")
return trials, dict_best_params
# Hyperopt search space for the XGBoost classifier.
# NOTE(review): the 'max_depth' entry (and the commented 'n_estimators' one)
# lost their "scope.int(hp.quniform(" prefix in the source, and the closing
# brace was missing; both are reconstructed here — confirm.
params_hyp = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
    # 'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 50)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 1, 100, 1),
    # regularization
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 0.1),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 0.1),
    'gamma': hp.uniform('gamma', 0.1, 0.5),
}
# current values
Xtr = df_Xtrain
ytr = ser_ytrain
Xtx = df_Xtest
ytx = ser_ytest
# fixed_params = {'n_estimators': 1000}
# trials, dict_best_params = hpo_hyperopt(params_hyp, Xtr, ytr, Xtx, ytx,
# num_eval=100,fixed_params=fixed_params)
dict_best_params = {
'gamma': 0.29426928529915647,
'learning_rate': 0.0227779530532774,
'max_depth': 4,
'min_child_weight': 1.0,
'reg_alpha': 0.019685023503677693,
'reg_lambda': 0.0538168932849033,
'scale_pos_weight': 2.0,
'subsample': 0.7231941501698588}
model = XGBClassifier(n_jobs=-1, random_state=SEED,**dict_best_params)
row_eval,ypreds = get_row_eval(model,'hyperopt',df_eval)
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
df_eval = df_eval.sort_values('F1',ascending=False)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | |
4 | Xgboost | grid_search2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.981151 |
3 | Xgboost | grid_search | 0.973684 | 0.953488 | 0.976190 | 0.964706 | 0.974206 |
0 | Xgboost | default | 0.973684 | 0.975610 | 0.952381 | 0.963855 | 0.969246 |
1 | Xgboost | corr_thr<0.98 | 0.964912 | 0.975000 | 0.928571 | 0.951220 | 0.957341 |
5 | Xgboost | hyperopt | 0.956140 | 0.930233 | 0.952381 | 0.941176 | 0.955357 |
2 | Xgboost | RFECV | 0.956140 | 0.951220 | 0.928571 | 0.939759 | 0.950397 |
from bp import plotly_binary_clf_evaluation
# help(plotly_binary_clf_evaluation)
# Refit the best configuration (grid-search parameters, 1000 trees) to get
# both class predictions and class probabilities for the evaluation helpers.
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,
                          n_estimators=1000, **params_best)
# NOTE(review): the original line ended in a dangling ", ser_ytrain)"; the
# fit call is reconstructed from that fragment — confirm.
model.fit(df_Xtrain, ser_ytrain)
ypreds = model.predict(df_Xtest)
yprobs = model.predict_proba(df_Xtest)
# NOTE(review): column 0 of predict_proba is the probability of class 0;
# if the downstream helpers expect the positive-class (1) probability this
# should be column 1 — verify against how yprobs is consumed.
yprobs = yprobs[:,0] # take only first column
show_methods(bp, contains='classi')
0 | 1 | 2 | 3 | |
0 | get_binary_classification_report | get_binary_classification_scalar_metrics | get_binary_classification_scalar_metrics2 |
df_clf_report = bp.get_binary_classification_report(
Model | Description | Precision_0 | Precision_1 | Recall_0 | Recall_1 | F1_Score_0 | F1_Score_1 | Support_0 | Support_1 | |
0 | xgboost | gridsearch2 | 0.986111 | 0.976190 | 0.986111 | 0.976190 | 0.986111 | 0.976190 | 72.000000 | 42.000000 |
df_eval = bp.get_binary_classification_scalar_metrics(
Model | Description | Accuracy | Precision | Recall | F1 | Mathews_Correlation_Coefficient | Cohens_Kappa | Area_Under_Precision_Curve | Area_Under_ROC_Curve | |
0 | xgboost | gridsearch2 | 0.982456 | 0.976190 | 0.976190 | 0.976190 | 0.962302 | 0.962302 | 0.995281 | 0.997024 |
import shap
# Model explained with SHAP: grid-search parameters with 1000 trees.
params_best = {'max_depth': 5, 'scale_pos_weight': 3, 'subsample': 0.6}
model = xgb.XGBClassifier(n_jobs=-1, random_state=SEED,
                          n_estimators=1000, **params_best)
# NOTE(review): the original line ended in a dangling ", ser_ytrain)"; the
# fit call is reconstructed from that fragment — confirm.
model.fit(df_Xtrain, ser_ytrain)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=1000, n_jobs=-1, num_parallel_tree=1, random_state=100, reg_alpha=0, reg_lambda=1, scale_pos_weight=3, subsample=0.6, tree_method='exact', validate_parameters=1, verbosity=None)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
print(df_Xtest.shape, shap_values.shape)
(114, 30) (114, 30)
array([-0.16933775, -1.2920814 ], dtype=float32)
max_display = 30
shap.summary_plot(shap_values, df_Xtest, plot_type="bar",
max_display = max_display)
shap.summary_plot(shap_values, df_Xtest, plot_type='dot', max_display = max_display)
# Report the total wall-clock time of the notebook run.
notebook_end_time = time.time()
time_taken = notebook_end_time - notebook_start_time

# Round to whole seconds before splitting, so the formatter cannot round a
# fractional remainder up to "60 secs".
h, m = divmod(int(round(time_taken)), 60 * 60)  # m holds the leftover seconds
print('Time taken to run whole noteook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))
Time taken to run whole noteook: 0 hr 0 min 19 secs