$\underline{\text{Data specification}}$
$\text{Number of samples}: \ n = 569$
$\text{Full dataset}: \ D = \{ X, Y \} = \{ x_i, y_i \}_{i=1}^{n}$
$\text{Input space}: \ \dim(X) = ( \underset{\text{case:}}{n \times 10}) \times \underset{\text{a, b, c}}{3}$
$\text{Output space}: \ \dim(Y) = n$
Here case a is the mean, case b is the SE (standard error $\frac{\sigma}{\sqrt{n}}$), and case c is the worst, defined as the mean of the three largest values.
$\underline{\text{Dependent variable / Target}} \ \big( \ y_i \in \{B, \, M \} \ \big)$
$\underline{\text{Independent variable / Feature}} \,\ \big( \ x_i \in \mathbb{R}^{10} \ \big)$
To sum up, the dataset can be written as $X = [X^{(a)}_{\text{Mean}}, \, X^{(b)}_{\text{SE}}, \, X^{(c)}_{\text{Worst}}] \in \mathbb{R}^{(n \times 10) \times 3}$ :
Feature | Mean | SE | Worst |
---|---|---|---|
1.Radius | $\quad \vdots$ | $\ \vdots$ | $\quad \vdots$ |
$\quad \cdots$ | $\quad \vdots$ | $\ \vdots$ | $\quad \vdots$ |
10.Fractal dim. | $\quad \vdots$ | $\ \vdots$ | $\quad \vdots$ |
This means each sample has 10 features, each summarized by 3 statistics, giving 30 numeric columns in total. The sketch below illustrates the three statistics for one hypothetical feature measured over several cell nuclei in a single image (the raw per-nucleus values are made up; the published dataset ships only the three summaries):
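# hypothetical per-nucleus radius measurements for ONE image (made-up values)
import numpy as np
radii = np.array([10.2, 11.5, 9.8, 12.1, 10.9, 13.0, 11.2])
mean_val  = radii.mean()                              # case a: mean
se_val    = radii.std(ddof=1) / np.sqrt(len(radii))   # case b: sigma / sqrt(n)
worst_val = np.sort(radii)[-3:].mean()                # case c: mean of 3 largest values
print(f'mean={mean_val:.3f}  se={se_val:.3f}  worst={worst_val:.3f}')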
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
SEED = 100
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
sns.set()
%matplotlib inline
# modelling
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# boosting
import xgboost
from xgboost import XGBClassifier
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2021-02-13

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

pandas  1.1.4
sklearn 0.23.1
numpy   1.19.5
seaborn 0.11.0
xgboost 1.2.0
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
import bp
df_eval = pd.DataFrame({'Model': [],
                        'Description': [],
                        'Accuracy': [],
                        'Precision': [],
                        'Recall': [],
                        'F1': [],
                        'AUC': [],
                        })
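This frame starts empty and is meant to collect one row per fitted model. A minimal sketch of the intended usage (the metric values below are placeholders, not real results):

# hypothetical example: append one model's metrics (placeholder numbers)
df_eval.loc[len(df_eval)] = ['xgboost', 'default params', 0.95, 0.94, 0.93, 0.94, 0.98]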
def show_methods(obj, ncols=4, contains=None):
    """List an object's public attributes/methods as a dataframe."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    # split the names into ncols chunks and lay them out column-wise
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
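For example, to list all pandas attributes whose names contain 'read':

show_methods(pd, ncols=4, contains='read')  # read_csv, read_json, read_sql, ...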
df = pd.read_csv('../data/raw/data.csv')
print(df.shape)
df.head()
(569, 33)
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
target = 'diagnosis'
col_id = 'id'
cols_drop = ['id','Unnamed: 32' ]
df = df.drop(cols_drop, axis=1)
df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})
features = df.filter(regex='_mean').columns.tolist()
features = [i.replace('_mean','') for i in features]
df_stats = pd.DataFrame(columns=features)
df_stats
radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension |
---|---|---|---|---|---|---|---|---|---|
dfa = df.filter(regex='_mean')
dfb = df.filter(regex='_se')
dfc = df.filter(regex='_worst')
dfa.describe().loc[['mean','std','min','max']]
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | |
---|---|---|---|---|---|---|---|---|---|---|
mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 |
std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 |
min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 |
max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 |
df_stats.loc['mean_a'] = df.filter(regex='_mean').mean(axis=0).values
df_stats
radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension | |
---|---|---|---|---|---|---|---|---|---|---|
mean_a | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.09636 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 |
for stat in ['mean','std','min','max']:
    for aspect, abc in zip(['mean','se','worst'], list('abc')):
        df_stats.loc[f'{stat}_{abc}'] = getattr(df.filter(regex=f'_{aspect}'), stat)(axis=0).values
print('a means Mean, b means SE, c means Worst')
df_stats
a means Mean, b means SE, c means Worst
radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension | |
---|---|---|---|---|---|---|---|---|---|---|
mean_a | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 |
mean_b | 0.405172 | 1.216853 | 2.866059 | 40.337079 | 0.007041 | 0.025478 | 0.031894 | 0.011796 | 0.020542 | 0.003795 |
mean_c | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
std_a | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 |
std_b | 0.277313 | 0.551648 | 2.021855 | 45.491006 | 0.003003 | 0.017908 | 0.030186 | 0.006170 | 0.008266 | 0.002646 |
std_c | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
min_a | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 |
min_b | 0.111500 | 0.360200 | 0.757000 | 6.802000 | 0.001713 | 0.002252 | 0.000000 | 0.000000 | 0.007882 | 0.000895 |
min_c | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
max_a | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 |
max_b | 2.873000 | 4.885000 | 21.980000 | 542.200000 | 0.031130 | 0.135400 | 0.396000 | 0.052790 | 0.078950 | 0.029840 |
max_c | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
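The getattr trick above is compact; for readers who prefer something more explicit, DataFrame.agg accepts the statistic name as a string and builds the same table (a sketch):

# equivalent construction with DataFrame.agg instead of getattr
for aspect, abc in zip(['mean', 'se', 'worst'], 'abc'):
    sub = df.filter(regex=f'_{aspect}')
    for stat in ['mean', 'std', 'min', 'max']:
        df_stats.loc[f'{stat}_{abc}'] = sub.agg(stat).values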
cols = df.filter(regex='mean').columns
df1 = df[cols]
df1.plot(kind='density', subplots=True, layout=(5,2), sharex=False,
         sharey=False, fontsize=12, figsize=(15,10))
plt.show()
df[target].value_counts(normalize=True)
0    0.627417
1    0.372583
Name: diagnosis, dtype: float64
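About 63% of the cases are benign and 37% malignant. This moderate imbalance is one reason StratifiedKFold was imported earlier: each fold then preserves the class ratio. A minimal sketch:

# each fold keeps roughly the same 0.63 / 0.37 class ratio
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = df.drop(target, axis=1), df[target]
for train_idx, test_idx in skf.split(X, y):
    print(y.iloc[test_idx].value_counts(normalize=True).round(2).to_dict())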
cols = df.filter(regex='mean').columns
fig,ax = plt.subplots(5,2, figsize=(15,10))
df.query('diagnosis==0')[cols].plot(kind='density', subplots=True, sharex=False,
                                    sharey=False, fontsize=12, ax=ax)
df.query('diagnosis==1')[cols].plot(kind='density', subplots=True, sharex=False,
                                    sharey=False, fontsize=12, ax=ax, style='-.')
plt.suptitle('Density Plot for Benign (solid) and Malignant (dashdot) Cases',fontsize=18)
plt.savefig('images/densityplot_mean_features.png',dpi=300)
plt.show()
"""
Observation:
The density plots for benign and malignant cases are well separated.
This means the features we use here are useful for machine learning.
""";
# bp.show_methods(bp, contains='corr') # my local functions
# select only the mean features
cols = df.filter(regex='_mean').columns.tolist()
df1 = df[cols].rename(columns=lambda x: x.replace('_mean',''))
bp.plot_corr_style(df1)
radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension | |
---|---|---|---|---|---|---|---|---|---|---|
radius | 1.00 | 0.32 | 1.00 | 0.99 | 0.17 | 0.51 | 0.68 | 0.82 | 0.15 | -0.31 |
texture | 0.32 | 1.00 | 0.33 | 0.32 | -0.02 | 0.24 | 0.30 | 0.29 | 0.07 | -0.08 |
perimeter | 1.00 | 0.33 | 1.00 | 0.99 | 0.21 | 0.56 | 0.72 | 0.85 | 0.18 | -0.26 |
area | 0.99 | 0.32 | 0.99 | 1.00 | 0.18 | 0.50 | 0.69 | 0.82 | 0.15 | -0.28 |
smoothness | 0.17 | -0.02 | 0.21 | 0.18 | 1.00 | 0.66 | 0.52 | 0.55 | 0.56 | 0.58 |
compactness | 0.51 | 0.24 | 0.56 | 0.50 | 0.66 | 1.00 | 0.88 | 0.83 | 0.60 | 0.57 |
concavity | 0.68 | 0.30 | 0.72 | 0.69 | 0.52 | 0.88 | 1.00 | 0.92 | 0.50 | 0.34 |
concave points | 0.82 | 0.29 | 0.85 | 0.82 | 0.55 | 0.83 | 0.92 | 1.00 | 0.46 | 0.17 |
symmetry | 0.15 | 0.07 | 0.18 | 0.15 | 0.56 | 0.60 | 0.50 | 0.46 | 1.00 | 0.48 |
fractal_dimension | -0.31 | -0.08 | -0.26 | -0.28 | 0.58 | 0.57 | 0.34 | 0.17 | 0.48 | 1.00 |
# select only the mean features
cols = df.filter(regex='_mean').columns.tolist()
df1 = df[cols].rename(columns=lambda x: x.replace('_mean',''))
bp.plot_corr(df1,xrot=90)
"""
Observation:
1. Highly correlated features
radius vs perieter
radius vs area
concavity vs concave points
For linear methods (Logistic Regression) we may consider deleting correlated
features. For tree based models (Random Forest, Boosting), there is automatic
feature selection and we may not drop these correlated features.
""";
# bp.plot_corr?
bp.get_high_correlated_features_df(df).head()
feature1 | feature2 | corr | |
---|---|---|---|
31 | perimeter_mean | radius_mean | 0.997855 |
33 | perimeter_worst | radius_worst | 0.993708 |
35 | radius_mean | area_mean | 0.987357 |
37 | perimeter_mean | area_mean | 0.986507 |
39 | area_worst | radius_worst | 0.984015 |
# bp.show_methods(df_train.bp)
df.bp.corr_high(thr=0.98)
cols_high_corr = ['radius_mean', 'area_mean', 'area_worst', 'radius_worst', 'perimeter_mean', 'perimeter_worst']
cols_high_corr1 = ['perimeter_mean', 'perimeter_worst', 'radius_mean', 'perimeter_mean', 'area_worst']
cols_high_corr2 = ['radius_mean', 'radius_worst', 'area_mean', 'area_mean', 'radius_worst']
cols_high_corr_drop = ['area_worst', 'perimeter_mean', 'perimeter_worst']
feature1 | feature2 | corr | |
---|---|---|---|
0 | perimeter_mean | radius_mean | 0.997855 |
1 | perimeter_worst | radius_worst | 0.993708 |
2 | radius_mean | area_mean | 0.987357 |
3 | perimeter_mean | area_mean | 0.986507 |
4 | area_worst | radius_worst | 0.984015 |
df.head(2)
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 17.99 | 10.38 | 122.8 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.6 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 1 | 20.57 | 17.77 | 132.9 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.8 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
# select only the mean features
cols = df.filter(regex='_mean').columns.tolist()
df1 = df[[target]+ cols].rename(columns=lambda x: x.replace('_mean',''))
df1.head(2)
diagnosis | radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 17.99 | 10.38 | 122.8 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 |
1 | 1 | 20.57 | 17.77 | 132.9 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 |
# select only the mean features
cols = df.filter(regex='_mean').columns.tolist()
df1 = df[[target]+ cols].rename(columns=lambda x: x.replace('_mean',''))
bp.plotly_radar_plot(df1,target,
names=['Benign','Malignant'],
colors=['green','red'],
# show_data=True,
)
df1[target].sort_values().unique()
array([0, 1])
cols = df.filter(regex='_mean').columns.tolist() # mean features
df1 = df[[target]+ cols].rename(columns=lambda x: x.replace('_mean',''))
# sort values by target
df1 = df1.sort_values(target)
u = df1[target].sort_values().unique()
bool_0 = df1[target]==u[0]
bool_1 = df1[target]==u[1]
# drop target and keep only features
df2 = df1.drop(target,axis=1)
df20 = df2[bool_0]
df21 = df2[bool_1]
# min max scaling
df2_max_min = df2.max(axis=0) - df2.min(axis=0)
# values for spider diagram
# NOTE: scale with the whole-data min (df2.min), NOT the per-class min (df20.min etc.)
ZERO = (( df20 - df2.min() ) / df2_max_min).mean(axis=0)
ONE = (( df21 - df2.min() ) / df2_max_min).mean(axis=0)
ZERO
radius               0.244476
texture              0.277469
perimeter            0.236925
area                 0.135436
smoothness           0.359733
compactness          0.186199
concavity            0.107914
concave points       0.127820
symmetry             0.344374
fractal_dimension    0.271849
dtype: float64
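With ZERO and ONE computed, the radar chart itself needs no local helper; here is a minimal matplotlib sketch on polar axes (bp.plotly_radar_plot above presumably does something similar in plotly):

labels = ZERO.index.tolist()
angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
angles += angles[:1]                       # close the polygon
fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
for vals, name, color in [(ZERO, 'Benign', 'green'), (ONE, 'Malignant', 'red')]:
    v = vals.tolist() + vals.tolist()[:1]  # repeat first point to close the trace
    ax.plot(angles, v, color=color, label=name)
    ax.fill(angles, v, color=color, alpha=0.15)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels, fontsize=10)
ax.legend(loc='upper right')
plt.show()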