$\underline{\text{Data specification}}$
$\text{Number of samples}: \ n = 569$
$\text{Full dataset}: \ D = \{ X, Y \} = \{ x_i, y_i \}_{i=1}^{n}$
$\text{Input space}: \ \dim(X) = ( \underset{\text{case:}}{n \times 10}) \times \underset{\text{a, b, c}}{3}$
$\text{Output space}: \ \dim(Y) = n$
Here case a is the mean, case b is the SE (standard error $\frac{\sigma}{\sqrt{n}}$), and case c is the worst, defined as the mean of the three largest values.
$\underline{\text{Dependent variable / Target}} \ \big( \ y_i \in \{B, \, M \} \ \big)$
$\underline{\text{Independent variable / Feature}} \,\ \big( \ x_i \in \mathbb{R}^{10} \ \big)$
To sum up, the dataset can be written as $X = [X^{(a)}_{\text{Mean}}, \, X^{(b)}_{\text{SE}}, \, X^{(c)}_{\text{Worst}}] \in \mathbb{R}^{(n \times 10) \times 3}$ :
Feature | Mean | SE | Worst |
---|---|---|---|
1.Radius | $\quad \vdots$ | $\ \vdots$ | $\quad \vdots$ |
$\quad \cdots$ | $\quad \vdots$ | $\ \vdots$ | $\quad \vdots$ |
10.Fractal dim. | $\quad \vdots$ | $\ \vdots$ | $\quad \vdots$ |
This means each sample has 10 features, each summarized by 3 statistics, giving 30 numeric columns in total. The sketch below illustrates the three statistics for one hypothetical feature measured over several cell nuclei in a single image (the raw per-nucleus values are made up; the published dataset ships only the three summaries):
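# hypothetical per-nucleus radius measurements for ONE image (made-up values)
import numpy as np
radii = np.array([10.2, 11.5, 9.8, 12.1, 10.9, 13.0, 11.2])
mean_val  = radii.mean()                              # case a: mean
se_val    = radii.std(ddof=1) / np.sqrt(len(radii))   # case b: sigma / sqrt(n)
worst_val = np.sort(radii)[-3:].mean()                # case c: mean of 3 largest values
print(f'mean={mean_val:.3f}  se={se_val:.3f}  worst={worst_val:.3f}')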
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
SEED = 100
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
sns.set()
%matplotlib inline
# modelling
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# boosting
import xgboost
from xgboost import XGBClassifier
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2021-02-13

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

pandas  1.1.4
sklearn 0.23.1
numpy   1.19.5
seaborn 0.11.0
xgboost 1.2.0
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
import bp
df_eval = pd.DataFrame({'Model': [],
                        'Description': [],
                        'Accuracy': [],
                        'Precision': [],
                        'Recall': [],
                        'F1': [],
                        'AUC': [],
                        })
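This frame starts empty and is meant to collect one row per fitted model. A minimal sketch of the intended usage (the metric values below are placeholders, not real results):

# hypothetical example: append one model's metrics (placeholder numbers)
df_eval.loc[len(df_eval)] = ['xgboost', 'default params', 0.95, 0.94, 0.93, 0.94, 0.98]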
def show_methods(obj, ncols=4, contains=None):
    """List an object's public attributes/methods as a dataframe."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    # split the names into ncols chunks and lay them out column-wise
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
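For example, to list all pandas attributes whose names contain 'read':

show_methods(pd, ncols=4, contains='read')  # read_csv, read_json, read_sql, ...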
df = pd.read_csv('../data/raw/data.csv')
print(df.shape)
df.head()
(569, 33)
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
target = 'diagnosis'
col_id = 'id'
cols_drop = ['id','Unnamed: 32' ]
df = df.drop(cols_drop, axis=1)
df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})
features = df.filter(regex='_mean').columns.tolist()
features = [i.replace('_mean','') for i in features]
df_stats = pd.DataFrame(columns=features)
df_stats
radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension |
---|---|---|---|---|---|---|---|---|---|
dfa = df.filter(regex='_mean')
dfb = df.filter(regex='_se')
dfc = df.filter(regex='_worst')
dfa.describe().loc[['mean','std','min','max']]
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | |
---|---|---|---|---|---|---|---|---|---|---|
mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 |
std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 |
min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 |
max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 |
df_stats.loc['mean_a'] = df.filter(regex='_mean').mean(axis=0).values
df_stats
radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension | |
---|---|---|---|---|---|---|---|---|---|---|
mean_a | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.09636 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 |
for stat in ['mean','std','min','max']:
    for aspect, abc in zip(['mean','se','worst'], list('abc')):
        df_stats.loc[f'{stat}_{abc}'] = getattr(df.filter(regex=f'_{aspect}'), stat)(axis=0).values
print('a means Mean, b means SE, c means Worst')
df_stats
a means Mean, b means SE, c means Worst
radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension | |
---|---|---|---|---|---|---|---|---|---|---|
mean_a | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 |
mean_b | 0.405172 | 1.216853 | 2.866059 | 40.337079 | 0.007041 | 0.025478 | 0.031894 | 0.011796 | 0.020542 | 0.003795 |
mean_c | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
std_a | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 |
std_b | 0.277313 | 0.551648 | 2.021855 | 45.491006 | 0.003003 | 0.017908 | 0.030186 | 0.006170 | 0.008266 | 0.002646 |
std_c | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
min_a | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 |
min_b | 0.111500 | 0.360200 | 0.757000 | 6.802000 | 0.001713 | 0.002252 | 0.000000 | 0.000000 | 0.007882 | 0.000895 |
min_c | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
max_a | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 |
max_b | 2.873000 | 4.885000 | 21.980000 | 542.200000 | 0.031130 | 0.135400 | 0.396000 | 0.052790 | 0.078950 | 0.029840 |
max_c | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
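The getattr trick above is compact; for readers who prefer something more explicit, DataFrame.agg accepts the statistic name as a string and builds the same table (a sketch):

# equivalent construction with DataFrame.agg instead of getattr
for aspect, abc in zip(['mean', 'se', 'worst'], 'abc'):
    sub = df.filter(regex=f'_{aspect}')
    for stat in ['mean', 'std', 'min', 'max']:
        df_stats.loc[f'{stat}_{abc}'] = sub.agg(stat).values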
cols = df.filter(regex='mean').columns
df1 = df[cols]
df1.plot(kind='density', subplots=True, layout=(5,2), sharex=False,
         sharey=False, fontsize=12, figsize=(15,10))
plt.show()
df[target].value_counts(normalize=True)
0    0.627417
1    0.372583
Name: diagnosis, dtype: float64
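About 63% of the cases are benign and 37% malignant. This moderate imbalance is one reason StratifiedKFold was imported earlier: each fold then preserves the class ratio. A minimal sketch:

# each fold keeps roughly the same 0.63 / 0.37 class ratio
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = df.drop(target, axis=1), df[target]
for train_idx, test_idx in skf.split(X, y):
    print(y.iloc[test_idx].value_counts(normalize=True).round(2).to_dict())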
cols = df.filter(regex='mean').columns
fig,ax = plt.subplots(5,2, figsize=(15,10))
df.query('diagnosis==0')[cols].plot(kind='density', subplots=True, sharex=False,
                                    sharey=False, fontsize=12, ax=ax)
df.query('diagnosis==1')[cols].plot(kind='density', subplots=True, sharex=False,
                                    sharey=False, fontsize=12, ax=ax, style='-.')
plt.suptitle('Density Plot for Benign (solid) and Malignant (dashdot) Cases',fontsize=18)
plt.savefig('images/densityplot_mean_features.png',dpi=300)
plt.show()
"""
Observation:
The density plots for benign and malignant cases are well separated.
This means the features we use here are useful for machine learning.
""";
# bp.show_methods(bp, contains='corr') # my local functions
# select only the mean features
cols = df.filter(regex='_mean').columns.tolist()
df1 = df[cols].rename(columns=lambda x: x.replace('_mean',''))
bp.plot_corr_style(df1)
radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension | |
---|---|---|---|---|---|---|---|---|---|---|
radius | 1.00 | 0.32 | 1.00 | 0.99 | 0.17 | 0.51 | 0.68 | 0.82 | 0.15 | -0.31 |
texture | 0.32 | 1.00 | 0.33 | 0.32 | -0.02 | 0.24 | 0.30 | 0.29 | 0.07 | -0.08 |
perimeter | 1.00 | 0.33 | 1.00 | 0.99 | 0.21 | 0.56 | 0.72 | 0.85 | 0.18 | -0.26 |
area | 0.99 | 0.32 | 0.99 | 1.00 | 0.18 | 0.50 | 0.69 | 0.82 | 0.15 | -0.28 |
smoothness | 0.17 | -0.02 | 0.21 | 0.18 | 1.00 | 0.66 | 0.52 | 0.55 | 0.56 | 0.58 |
compactness | 0.51 | 0.24 | 0.56 | 0.50 | 0.66 | 1.00 | 0.88 | 0.83 | 0.60 | 0.57 |
concavity | 0.68 | 0.30 | 0.72 | 0.69 | 0.52 | 0.88 | 1.00 | 0.92 | 0.50 | 0.34 |
concave points | 0.82 | 0.29 | 0.85 | 0.82 | 0.55 | 0.83 | 0.92 | 1.00 | 0.46 | 0.17 |
symmetry | 0.15 | 0.07 | 0.18 | 0.15 | 0.56 | 0.60 | 0.50 | 0.46 | 1.00 | 0.48 |
fractal_dimension | -0.31 | -0.08 | -0.26 | -0.28 | 0.58 | 0.57 | 0.34 | 0.17 | 0.48 | 1.00 |
# select only the mean features
cols = df.filter(regex='_mean').columns.tolist()
df1 = df[cols].rename(columns=lambda x: x.replace('_mean',''))
bp.plot_corr(df1,xrot=90)
"""
Observation:
1. Highly correlated features
radius vs perieter
radius vs area
concavity vs concave points
For linear methods (Logistic Regression) we may consider deleting correlated
features. For tree based models (Random Forest, Boosting), there is automatic
feature selection and we may not drop these correlated features.
""";
# bp.plot_corr?
bp.get_high_correlated_features_df(df).head()
feature1 | feature2 | corr | |
---|---|---|---|
31 | perimeter_mean | radius_mean | 0.997855 |
33 | perimeter_worst | radius_worst | 0.993708 |
35 | radius_mean | area_mean | 0.987357 |
37 | perimeter_mean | area_mean | 0.986507 |
39 | area_worst | radius_worst | 0.984015 |
# bp.show_methods(df_train.bp)
df.bp.corr_high(thr=0.98)
cols_high_corr = ['radius_mean', 'area_mean', 'area_worst', 'radius_worst', 'perimeter_mean', 'perimeter_worst']
cols_high_corr1 = ['perimeter_mean', 'perimeter_worst', 'radius_mean', 'perimeter_mean', 'area_worst']
cols_high_corr2 = ['radius_mean', 'radius_worst', 'area_mean', 'area_mean', 'radius_worst']
cols_high_corr_drop = ['area_worst', 'perimeter_mean', 'perimeter_worst']
feature1 | feature2 | corr | |
---|---|---|---|
0 | perimeter_mean | radius_mean | 0.997855 |
1 | perimeter_worst | radius_worst | 0.993708 |
2 | radius_mean | area_mean | 0.987357 |
3 | perimeter_mean | area_mean | 0.986507 |
4 | area_worst | radius_worst | 0.984015 |
df.head(2)
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 17.99 | 10.38 | 122.8 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.6 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 1 | 20.57 | 17.77 | 132.9 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.8 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
# select only the mean features
cols = df.filter(regex='_mean').columns.tolist()
df1 = df[[target]+ cols].rename(columns=lambda x: x.replace('_mean',''))
df1.head(2)
diagnosis | radius | texture | perimeter | area | smoothness | compactness | concavity | concave points | symmetry | fractal_dimension | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 17.99 | 10.38 | 122.8 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 |
1 | 1 | 20.57 | 17.77 | 132.9 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 |
# select only the mean features
cols = df.filter(regex='_mean').columns.tolist()
df1 = df[[target]+ cols].rename(columns=lambda x: x.replace('_mean',''))
bp.plotly_radar_plot(df1,target,
names=['Benign','Malignant'],
colors=['green','red'],
# show_data=True,
)
df1[target].sort_values().unique()
array([0, 1])
cols = df.filter(regex='_mean').columns.tolist() # mean features
df1 = df[[target]+ cols].rename(columns=lambda x: x.replace('_mean',''))
# sort values by target
df1 = df1.sort_values(target)
u = df1[target].sort_values().unique()
bool_0 = df1[target]==u[0]
bool_1 = df1[target]==u[1]
# drop target and keep only features
df2 = df1.drop(target,axis=1)
df20 = df2[bool_0]
df21 = df2[bool_1]
# min max scaling
df2_max_min = df2.max(axis=0) - df2.min(axis=0)
# values for spider diagram
# NOTE: scale with the whole-data min (df2.min), NOT the per-class min (df20.min etc.)
ZERO = (( df20 - df2.min() ) / df2_max_min).mean(axis=0)
ONE = (( df21 - df2.min() ) / df2_max_min).mean(axis=0)
ZERO
radius               0.244476
texture              0.277469
perimeter            0.236925
area                 0.135436
smoothness           0.359733
compactness          0.186199
concavity            0.107914
concave points       0.127820
symmetry             0.344374
fractal_dimension    0.271849
dtype: float64
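With ZERO and ONE computed, the radar chart itself needs no local helper; here is a minimal matplotlib sketch on polar axes (bp.plotly_radar_plot above presumably does something similar in plotly):

labels = ZERO.index.tolist()
angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
angles += angles[:1]                       # close the polygon
fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
for vals, name, color in [(ZERO, 'Benign', 'green'), (ONE, 'Malignant', 'red')]:
    v = vals.tolist() + vals.tolist()[:1]  # repeat first point to close the trace
    ax.plot(angles, v, color=color, label=name)
    ax.fill(angles, v, color=color, alpha=0.15)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels, fontsize=10)
ax.legend(loc='upper right')
plt.show()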