import numpy as np
import pandas as pd
import os,sys,pathlib

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import vaex
from tqdm import tqdm

%matplotlib inline
%load_ext watermark
%watermark -iv

pandas    : 1.3.1
numpy     : 1.21.0
seaborn   : 0.11.1
sys       : 3.9.5 (default, May 18 2021, 12:31:01) 
[Clang 10.0.0 ]
vaex      : 4.4.0
matplotlib: 3.4.2


!ls ~/data

diabetes_project_dataset.csv  diabetes_project_dataset.hdf5


# %%time
# ifile = os.path.expanduser("~/data/diabetes_project_dataset.csv")
# vdf = vaex.from_csv(ifile) # csv takes longer time, convert to hdf5

# Wall time: 10min 20s


# vdf.export_hdf5(ifile.replace(".csv",".hdf5"), progress=True)


%%time
# Open the HDF5 copy of the dataset; the CSV load attempted earlier was much slower.
ifile = os.path.expanduser("~/data/diabetes_project_dataset.hdf5")
vdf = vaex.open(ifile)

CPU times: user 57.2 s, sys: 980 ms, total: 58.1 s
Wall time: 58.5 s


# Keep only the first 50 column names so the rest of the notebook works on a small subset.
cols = list(vdf.columns)[:50]
# Display how many names were kept, plus the first two and the last two.
len(cols), cols[:2], cols[-2:]

(50, ['SampleID', 'existing_diabetes'], ['mtb_1940724', 'mtb_18238'])


%%time
vdf = vdf[cols]

CPU times: user 24.1 s, sys: 78.5 ms, total: 24.1 s
Wall time: 24.4 s


vdf.head()


%%time
# Keep only the rows where existing_diabetes == 0, then discard that flag column.
# DataFrame.drop in vaex only removes columns, so there is no axis argument.
no_existing_diabetes = vdf['existing_diabetes'] == 0.0
vdf = vdf[no_existing_diabetes].drop('existing_diabetes')

CPU times: user 3.58 ms, sys: 609 µs, total: 4.19 ms
Wall time: 4.04 ms


len(vdf)

7697


vdf['incident_diabetes'].unique()

[0.0, 1.0]


%%time
# Discard rows whose target value is missing. vaex's dropna has no `subset`
# argument; the columns to check are passed as `column_names`.
vdf = vdf.dropna(column_names=['incident_diabetes'])

len(vdf)

CPU times: user 5.74 ms, sys: 1.92 ms, total: 7.66 ms
Wall time: 8.02 ms

7697


%%time
vdf['incident_diabetes'].isna().sum()

CPU times: user 11.8 ms, sys: 2.87 ms, total: 14.6 ms
Wall time: 15.4 ms

array(0)


%%time
# Name of the label column, reused by the later cells.
target = 'incident_diabetes'

# Pull the label column into pandas for quick inspection and plotting.
ser = vdf[target].to_pandas_series()

CPU times: user 4.75 ms, sys: 1.74 ms, total: 6.49 ms
Wall time: 8.48 ms


ser.value_counts().plot.bar()

<AxesSubplot:>


ser.value_counts(normalize=True)

0.0    0.908536
1.0    0.091464
dtype: float64


# Undersample the majority class (incident_diabetes == 0) so both classes
# end up with the same number of rows.
vdf_ones = vdf[vdf.incident_diabetes == 1.0]
vdf_zeros = vdf[vdf.incident_diabetes == 0.0]

# Seed the draw: without random_state every run picks a different set of
# negatives, so nothing downstream can be reproduced.
vdf_zeros_small = vdf_zeros.sample(n=len(vdf_ones), random_state=42)


len(vdf)

7697


%%time
# Stack the positive rows and the sampled negative rows into one balanced frame.
# Note the result is ordered: all positives first, then all negatives.
vdf = vaex.concat([vdf_ones,vdf_zeros_small])

len(vdf)

CPU times: user 6.92 ms, sys: 1.53 ms, total: 8.45 ms
Wall time: 10.8 ms

1408


%%time
# train_test_split does not shuffle (it only warns "Make sure the DataFrame is
# shuffled"), and the concatenated frame holds all positives before all
# negatives, so shuffle first. A fixed seed keeps the split reproducible.
vdf = vdf.sample(frac=1, random_state=42)
vdf_train, vdf_test = vdf.ml.train_test_split(test_size=0.2)

CPU times: user 108 ms, sys: 6.36 ms, total: 115 ms
Wall time: 137 ms

/Users/poudel/opt/miniconda3/envs/dsk/lib/python3.9/site-packages/vaex/ml/__init__.py:31: UserWarning: Make sure the DataFrame is shuffled
  warnings.warn('Make sure the DataFrame is shuffled')


vdf_train['LDL'].isna().sum()

array(0)


vdf_train['LDL'].isna().sum() / len(vdf_train)

0.0


vdf_train.columns

<vaex.dataframe.ColumnProxy at 0x7f960bb3e280>


list(vdf_train.columns)[:5]

['SampleID',
 '__existing_diabetes',
 'incident_diabetes',
 'diabetes_time',
 'age']


# Use get_column_names() here: list(vdf_train.columns) can also return hidden
# '__xxx' entries left behind by dropped features.
cols = [name for name in vdf_train.get_column_names() if name != target]
print(target)
print(cols[:20])

incident_diabetes
['SampleID', 'diabetes_time', 'age', 'male', 'BMI', 'HDL', 'LDL', 'trig', 'SBP', 'DBP', 'hypertension', 'fasting', 'fasting_glucose', 'fasting_insulin', 'HbA1c', 'current_smoker', 'ex_smoker', 'exercise', 'healthy_vegetables', 'junk_food']


# Collect any hidden ('__'-prefixed) names that slipped into cols.
cols_dropped = [name for name in cols if name.startswith('__')]
cols_dropped

[]


# Training candidates: every column that is not in the hidden/dropped list.
cols_train = [name for name in cols if name not in cols_dropped]

# Show, alphabetically, the names that do not start with 'mtb'.
print(sorted(name for name in cols_train if not name.startswith('mtb')))

['BMI', 'DBP', 'HDL', 'HbA1c', 'LDL', 'SBP', 'SampleID', 'age', 'current_smoker', 'diabetes_time', 'ex_smoker', 'exercise', 'fasting', 'fasting_glucose', 'fasting_insulin', 'healthy_vegetables', 'hypertension', 'junk_food', 'male', 'total_fiber', 'trig']


# Columns that must never be used as model inputs:
#   - SampleID: looks like a running row identifier.
#   - target: the label itself.
#   - diabetes_time: NOTE(review) in the sample rows it is 14.82 for every
#     non-incident subject and smaller for incident ones, so it appears to be
#     follow-up / time-to-event and would leak the label. Confirm with the
#     data owner before re-adding it.
non_features = {'SampleID', 'diabetes_time', target}
cols_train = [i for i in cols_train if i not in non_features]


assert target not in cols_train
assert 'diabetes_time' not in cols_train


%%time
# Drop every column whose missing-value fraction in the training set exceeds
# 50%, removing it from both splits and from cols_train.
#
# The loop walks a snapshot of cols_train: removing items from the very list
# being iterated makes Python skip the element that follows each removal, so
# some columns would never be checked.
n_train = len(vdf_train)  # dropping columns does not change the row count

for c in tqdm(list(cols_train)):
    # 'nan' is skipped on purpose: it is not a usable column name here.
    if c == 'nan' or c not in vdf_train.columns:
        continue
    if (vdf_train[c].isna().sum() / n_train) > 0.5:
        vdf_train = vdf_train.drop(c)
        vdf_test = vdf_test.drop(c)
        print(c, "is dropped.")
        cols_train.remove(c)

 26%|██▌       | 12/47 [1:11:20<3:26:37, 354.21s/it]

fasting_glucose is dropped.

 34%|███▍      | 16/47 [1:34:51<3:02:23, 353.02s/it]


%%time
# Impute missing values with the (approximate) median.

# NOTE:
# NameError: Column or variable 'nan' does not exist.

for c in tqdm(cols_train):
    if c == 'nan' or c not in vdf_train.columns:
        continue
    # percentage=50.0 is the median. It is estimated on the training set only
    # and reused for the test set, so both splits get the same fill value and
    # no test-set statistic leaks into preprocessing.
    median = vdf_train.percentile_approx(expression=c, percentage=50.0)
    vdf_train[c] = vdf_train[c].fillna(value=median)
    vdf_test[c] = vdf_test[c].fillna(value=median)


%%time
# Fit a RobustScaler on the training split only, then apply the fitted scaler
# to both splits. Its outputs are named with the 'scaled_' prefix.
scaler = vaex.ml.RobustScaler(features=cols_train, prefix='scaled_')
scaler.fit(vdf_train)

vdf_train = scaler.transform(vdf_train)
vdf_test = scaler.transform(vdf_test)


vdf_train.head()


import lightgbm as lgb
from vaex.ml.sklearn import Predictor


# Model inputs: the columns whose names match the scaler's 'scaled_' prefix.
features = vdf_train.get_column_names(regex='scaled_')

features[:5]


# Guard against the label being among the inputs.
assert target not in features


target


# Wrap a default LightGBM classifier in vaex's sklearn Predictor and fit it on
# the training split using the scaled feature columns and the target column.
model = lgb.LGBMClassifier()
predictor = Predictor(model=model, features=features,target=target)

predictor.fit(vdf_train)


# Apply the fitted predictor to the test split; the returned frame replaces vdf_test.
# Presumably this adds the predictions as a column; confirm against the Predictor docs.
vdf_test = predictor.transform(vdf_test)


# NOTE(review): `booster` and `vaex_model` are not defined anywhere in the
# visible notebook, so this cell raises NameError if run as-is, and it would
# overwrite the fitted `model` / `predictor` from the cell above. It looks
# like leftover scratch code; confirm the intent or remove it.
model = booster
predictor = vaex_model

#	SampleID	existing_diabetes	incident_diabetes	diabetes_time	age	male	BMI	HDL	LDL	trig	SBP	DBP	hypertension	fasting	fasting_glucose	fasting_insulin	HbA1c	ex_smoker	exercise	healthy_vegetables	junk_food	total_fiber	mtb_1368087	mtb_1380093	mtb_1812369	mtb_1838668	mtb_1042362	mtb_1091716	mtb_1228672	mtb_1542487	mtb_1272352	mtb_1391826	mtb_1435571	mtb_1521753	mtb_1834574	mtb_1050860	mtb_606773	mtb_638620	mtb_752773	mtb_590255	mtb_709794	mtb_352255	mtb_509192	mtb_1230298	mtb_841524	mtb_1957718	mtb_1937123	mtb_1940724	mtb_18238
0	1	1	0	-37.51	58	0	49.5868	0.84	2.2	0.66	nan	nan	1	4	nan	nan	77	1	1	7	7	nan	12.3631	14.8389	12.9787	11.0748	16.2347	16.4359	13.0479	12.3211	19.7058	15.7466	8.75848	17.6786	8.7947	21.6053	22.0499	15.3083	31.6079	25.0239	15.201	15.4969	11.4193	15.9774	12.2631	9.60026	10.6193	13.915	nan
1	2	0	0	14.82	69	0	43.7841	1.6	3.88	1.85	178	79	0	5	6.91	19.6	37	0	3	15	7	41	12.5239	12.3965	8.9156	5.85953	15.8701	19.9278	11.2816	13.0256	19.7071	13.0689	9.87844	13.1757	6.62858	23.0018	22.6637	16.0921	32.8428	25.2153	15.9098	13.3169	14.5942	12.9384	12.5242	4.90242	6.96531	6.53369	8.05129
2	3	0	0	14.82	72	1	23.036	1.55	2.97	1.12	156	75	1	6	nan	nan	37	0	3	8	6	32	9.89171	10.8235	8.9352	7.12917	13.7634	14.0787	9.37904	9.09867	17.7833	11.6285	7.40871	12.8763	6.72248	23.6326	20.8345	10.9574	30.4404	22.09	13.4959	16.5171	17.8047	12.5066	15.6389	4.90258	7.09351	7.72304	8.74054
3	4	0	1	2.2	68	0	39.4217	1.2	2.8	2.33	154	80	0	4	8.83	33.4	nan	0	1	13	nan	35	13.1929	13.2317	9.49293	7.63789	20.8355	15.9021	8.60765	12.3049	19.097	14.8641	8.8135	14.0292	7.98722	28.0045	23.9002	16.6228	30.2247	24.069	11.0186	14.7709	16.507	9.38641	13.0643	5.41897	7.05177	7.63227	nan
4	5	0	0	14.82	60	0	27.8967	1.7	2.98	1.29	121	77	0	6	5.86	8.8	38	0	2	9	8	37	11.0952	12.698	10.4014	8.50409	16.3431	16.5679	12.855	10.8106	19.6521	13.4174	10.78	16.1106	5.7918	23.1755	23.1436	12.0698	31.6462	26.0449	12.9349	13.9453	16.9966	13.2027	16.2191	6.62892	8.44199	8.7143	9.21396
5	6	0	0	14.82	25	1	24.7957	1.25	3.1	0.84	139	78	0	15	nan	nan	27	1	2	11	6	34	12.4712	13.8264	9.34616	6.18691	14.9509	19.5395	11.1687	12.6883	20.9843	14.5117	13.4418	16.3214	7.15065	25.1835	23.1333	13.8249	33.0448	26.5483	14.1345	15.9318	16.7607	15.4673	16.7664	7.87914	9.4465	9.88857	7.23336
6	7	0	0	14.82	59	0	26.2646	1.77	2.8	0.83	139	64	0	4	6.72	8	34	0	2	16	10	45	12.1371	16.0913	8.47031	6.40539	18.1694	18.1004	13.176	13.2961	22.5114	13.3506	11.8203	13.5529	6.13071	22.7894	24.353	15.5699	34.8798	28.3285	16.3273	14.079	19.4038	13.4907	14.9222	8.08378	9.36566	9.76168	nan
7	8	0	1	6.12	59	0	38.2798	1.71	4.01	2.14	192	83	1	15	nan	nan	39	0	1	11	8	33	13.8504	15.1576	11.6465	11.9436	15.877	17.3455	12.0064	11.1483	19.8702	15.1974	12.3212	14.3699	7.20653	20.8749	24.3357	14.5281	32.2666	25.0502	15.3079	16.266	12.77	15.1011	15.8891	4.73719	5.23789	6.30588	nan
8	9	0	0	14.82	38	1	24.2785	1.28	2.63	1.32	118	63	0	4	nan	nan	33	0	3	9	6	28	10.014	14.0709	10.8854	9.01137	16.5478	15.9616	9.65282	9.84243	18.9291	14.368	8.16374	13.8891	8.96684	23.9993	23.9441	14.8654	31.7656	26.6439	16.5923	16.6835	16.52	12.581	12.4116	5.33062	7.76559	8.05536	nan
9	10	0	0	14.82	58	0	24.2703	1.47	3.8	1.52	167	76	0	5	nan	nan	34	0	2	12	9	37	11.7601	13.2393	9.49284	7.59128	15.1652	16.4188	11.2079	10.3951	18.6151	15.0759	8.88792	12.5889	6.06435	20.6019	25.5995	13.4135	33.019	25.136	16.061	13.5678	15.8864	14.6696	14.1436	7.10003	9.31566	9.80051	9.03318

Table of Contents

Description¶

Question 01¶

Question 02¶

Question 03¶

Import modules¶

Load the data¶

Data Cleaning¶

Select small data¶

Remove subjects with existing diabetes for this analysis.¶

Create binary target¶

Data Preparation for Modelling¶

Undersampling the data (imbalanced data)¶

train test split¶

Missing values¶

Robust Scaling¶

Modelling lightgbm with vaex¶