Hints:
Hints:
Hints:
import numpy as np
import pandas as pd
import os,sys,pathlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import vaex
from tqdm import tqdm
%matplotlib inline
%load_ext watermark
%watermark -iv
pandas : 1.3.1 numpy : 1.21.0 seaborn : 0.11.1 sys : 3.9.5 (default, May 18 2021, 12:31:01) [Clang 10.0.0 ] vaex : 4.4.0 matplotlib: 3.4.2
!ls ~/data
diabetes_project_dataset.csv diabetes_project_dataset.hdf5
# %%time
# ifile = os.path.expanduser("~/data/diabetes_project_dataset.csv")
# vdf = vaex.from_csv(ifile) # csv takes longer time, convert to hdf5
# Wall time: 10min 20s
# vdf.export_hdf5(ifile.replace(".csv",".hdf5"), progress=True)
%%time
# Open the HDF5 copy of the dataset; it was exported from the CSV in the
# commented-out cell above because parsing the CSV took ~10 minutes.
ifile = os.path.expanduser("~/data/diabetes_project_dataset.hdf5")
vdf = vaex.open(ifile) # open the already-converted HDF5 file (no CSV parsing here)
CPU times: user 57.2 s, sys: 980 ms, total: 58.1 s Wall time: 58.5 s
# Restrict the working set to the first 50 columns of the frame, then show
# how many were kept together with the first and last two names.
cols = list(vdf.columns)
cols = cols[:50]
(len(cols), cols[:2], cols[-2:])
(50, ['SampleID', 'existing_diabetes'], ['mtb_1940724', 'mtb_18238'])
%%time
vdf = vdf[cols]
CPU times: user 24.1 s, sys: 78.5 ms, total: 24.1 s Wall time: 24.4 s
vdf.head()
# | SampleID | existing_diabetes | incident_diabetes | diabetes_time | age | male | BMI | HDL | LDL | trig | SBP | DBP | hypertension | fasting | fasting_glucose | fasting_insulin | HbA1c | current_smoker | ex_smoker | exercise | healthy_vegetables | junk_food | total_fiber | mtb_1368087 | mtb_1380093 | mtb_1812369 | mtb_1838668 | mtb_1042362 | mtb_1091716 | mtb_1228672 | mtb_1542487 | mtb_1272352 | mtb_1391826 | mtb_1435571 | mtb_1521753 | mtb_1834574 | mtb_1050860 | mtb_606773 | mtb_638620 | mtb_752773 | mtb_590255 | mtb_709794 | mtb_352255 | mtb_509192 | mtb_1230298 | mtb_841524 | mtb_1957718 | mtb_1937123 | mtb_1940724 | mtb_18238 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | -37.51 | 58 | 0 | 49.5868 | 0.84 | 2.2 | 0.66 | nan | nan | 1 | 4 | nan | nan | 77 | 0 | 1 | 1 | 7 | 7 | nan | 12.3631 | 14.8389 | 12.9787 | 11.0748 | 16.2347 | 16.4359 | 13.0479 | 12.3211 | 19.7058 | 15.7466 | 8.75848 | 17.6786 | 8.7947 | 21.6053 | 22.0499 | 15.3083 | 31.6079 | 25.0239 | 15.201 | 15.4969 | 11.4193 | 15.9774 | 12.2631 | 9.60026 | 10.6193 | 13.915 | nan |
1 | 2 | 0 | 0 | 14.82 | 69 | 0 | 43.7841 | 1.6 | 3.88 | 1.85 | 178 | 79 | 0 | 5 | 6.91 | 19.6 | 37 | 0 | 0 | 3 | 15 | 7 | 41 | 12.5239 | 12.3965 | 8.9156 | 5.85953 | 15.8701 | 19.9278 | 11.2816 | 13.0256 | 19.7071 | 13.0689 | 9.87844 | 13.1757 | 6.62858 | 23.0018 | 22.6637 | 16.0921 | 32.8428 | 25.2153 | 15.9098 | 13.3169 | 14.5942 | 12.9384 | 12.5242 | 4.90242 | 6.96531 | 6.53369 | 8.05129 |
2 | 3 | 0 | 0 | 14.82 | 72 | 1 | 23.036 | 1.55 | 2.97 | 1.12 | 156 | 75 | 1 | 6 | nan | nan | 37 | 0 | 0 | 3 | 8 | 6 | 32 | 9.89171 | 10.8235 | 8.9352 | 7.12917 | 13.7634 | 14.0787 | 9.37904 | 9.09867 | 17.7833 | 11.6285 | 7.40871 | 12.8763 | 6.72248 | 23.6326 | 20.8345 | 10.9574 | 30.4404 | 22.09 | 13.4959 | 16.5171 | 17.8047 | 12.5066 | 15.6389 | 4.90258 | 7.09351 | 7.72304 | 8.74054 |
3 | 4 | 0 | 1 | 2.2 | 68 | 0 | 39.4217 | 1.2 | 2.8 | 2.33 | 154 | 80 | 0 | 4 | 8.83 | 33.4 | nan | 0 | 0 | 1 | 13 | nan | 35 | 13.1929 | 13.2317 | 9.49293 | 7.63789 | 20.8355 | 15.9021 | 8.60765 | 12.3049 | 19.097 | 14.8641 | 8.8135 | 14.0292 | 7.98722 | 28.0045 | 23.9002 | 16.6228 | 30.2247 | 24.069 | 11.0186 | 14.7709 | 16.507 | 9.38641 | 13.0643 | 5.41897 | 7.05177 | 7.63227 | nan |
4 | 5 | 0 | 0 | 14.82 | 60 | 0 | 27.8967 | 1.7 | 2.98 | 1.29 | 121 | 77 | 0 | 6 | 5.86 | 8.8 | 38 | 0 | 0 | 2 | 9 | 8 | 37 | 11.0952 | 12.698 | 10.4014 | 8.50409 | 16.3431 | 16.5679 | 12.855 | 10.8106 | 19.6521 | 13.4174 | 10.78 | 16.1106 | 5.7918 | 23.1755 | 23.1436 | 12.0698 | 31.6462 | 26.0449 | 12.9349 | 13.9453 | 16.9966 | 13.2027 | 16.2191 | 6.62892 | 8.44199 | 8.7143 | 9.21396 |
5 | 6 | 0 | 0 | 14.82 | 25 | 1 | 24.7957 | 1.25 | 3.1 | 0.84 | 139 | 78 | 0 | 15 | nan | nan | 27 | 0 | 1 | 2 | 11 | 6 | 34 | 12.4712 | 13.8264 | 9.34616 | 6.18691 | 14.9509 | 19.5395 | 11.1687 | 12.6883 | 20.9843 | 14.5117 | 13.4418 | 16.3214 | 7.15065 | 25.1835 | 23.1333 | 13.8249 | 33.0448 | 26.5483 | 14.1345 | 15.9318 | 16.7607 | 15.4673 | 16.7664 | 7.87914 | 9.4465 | 9.88857 | 7.23336 |
6 | 7 | 0 | 0 | 14.82 | 59 | 0 | 26.2646 | 1.77 | 2.8 | 0.83 | 139 | 64 | 0 | 4 | 6.72 | 8 | 34 | 0 | 0 | 2 | 16 | 10 | 45 | 12.1371 | 16.0913 | 8.47031 | 6.40539 | 18.1694 | 18.1004 | 13.176 | 13.2961 | 22.5114 | 13.3506 | 11.8203 | 13.5529 | 6.13071 | 22.7894 | 24.353 | 15.5699 | 34.8798 | 28.3285 | 16.3273 | 14.079 | 19.4038 | 13.4907 | 14.9222 | 8.08378 | 9.36566 | 9.76168 | nan |
7 | 8 | 0 | 1 | 6.12 | 59 | 0 | 38.2798 | 1.71 | 4.01 | 2.14 | 192 | 83 | 1 | 15 | nan | nan | 39 | 0 | 0 | 1 | 11 | 8 | 33 | 13.8504 | 15.1576 | 11.6465 | 11.9436 | 15.877 | 17.3455 | 12.0064 | 11.1483 | 19.8702 | 15.1974 | 12.3212 | 14.3699 | 7.20653 | 20.8749 | 24.3357 | 14.5281 | 32.2666 | 25.0502 | 15.3079 | 16.266 | 12.77 | 15.1011 | 15.8891 | 4.73719 | 5.23789 | 6.30588 | nan |
8 | 9 | 0 | 0 | 14.82 | 38 | 1 | 24.2785 | 1.28 | 2.63 | 1.32 | 118 | 63 | 0 | 4 | nan | nan | 33 | 0 | 0 | 3 | 9 | 6 | 28 | 10.014 | 14.0709 | 10.8854 | 9.01137 | 16.5478 | 15.9616 | 9.65282 | 9.84243 | 18.9291 | 14.368 | 8.16374 | 13.8891 | 8.96684 | 23.9993 | 23.9441 | 14.8654 | 31.7656 | 26.6439 | 16.5923 | 16.6835 | 16.52 | 12.581 | 12.4116 | 5.33062 | 7.76559 | 8.05536 | nan |
9 | 10 | 0 | 0 | 14.82 | 58 | 0 | 24.2703 | 1.47 | 3.8 | 1.52 | 167 | 76 | 0 | 5 | nan | nan | 34 | 0 | 0 | 2 | 12 | 9 | 37 | 11.7601 | 13.2393 | 9.49284 | 7.59128 | 15.1652 | 16.4188 | 11.2079 | 10.3951 | 18.6151 | 15.0759 | 8.88792 | 12.5889 | 6.06435 | 20.6019 | 25.5995 | 13.4135 | 33.019 | 25.136 | 16.061 | 13.5678 | 15.8864 | 14.6696 | 14.1436 | 7.10003 | 9.31566 | 9.80051 | 9.03318 |
%%time
# Keep only rows whose existing_diabetes flag is 0, then discard the flag
# column, which is constant after the filter.
vdf = vdf[vdf['existing_diabetes'] == 0.0]
# vaex's drop() only removes columns, so there is no axis argument to pass.
vdf = vdf.drop('existing_diabetes')
CPU times: user 3.58 ms, sys: 609 µs, total: 4.19 ms Wall time: 4.04 ms
len(vdf)
7697
vdf['incident_diabetes'].unique()
[0.0, 1.0]
%%time
# Remove rows whose target value is missing. vaex names this argument
# `column_names` (pandas calls it `subset`).
vdf = vdf.dropna(column_names=['incident_diabetes'])
len(vdf)
CPU times: user 5.74 ms, sys: 1.92 ms, total: 7.66 ms Wall time: 8.02 ms
7697
%%time
vdf['incident_diabetes'].isna().sum()
CPU times: user 11.8 ms, sys: 2.87 ms, total: 14.6 ms Wall time: 15.4 ms
array(0)
%%time
# Name of the label column, and a pandas copy of it for quick inspection
# (value counts / bar plot).
target = 'incident_diabetes'
ser = vdf[target].to_pandas_series()
CPU times: user 4.75 ms, sys: 1.74 ms, total: 6.49 ms Wall time: 8.48 ms
ser.value_counts().plot.bar()
<AxesSubplot:>
ser.value_counts(normalize=True)
0.0 0.908536 1.0 0.091464 dtype: float64
# Balance the classes by undersampling: keep every positive row (1.0) and
# draw an equally sized random subset of the negative rows (0.0).
vdf_ones = vdf[vdf.incident_diabetes == 1.0]
vdf_zeros = vdf[vdf.incident_diabetes == 0.0]
# Fixed seed so the same negatives are selected on every run; without it the
# balanced dataset (and every downstream metric) changes between executions.
vdf_zeros_small = vdf_zeros.sample(n=len(vdf_ones), random_state=42)
len(vdf)
7697
%%time
# Stack the positive rows and the undersampled negative rows into a single
# balanced frame; this replaces the full `vdf`.
vdf = vaex.concat([vdf_ones,vdf_zeros_small])
len(vdf)  # row count of the combined frame
CPU times: user 6.92 ms, sys: 1.53 ms, total: 8.45 ms Wall time: 10.8 ms
1408
%%time
# The concatenated frame is ordered (all positives first), and vaex's
# train_test_split does not shuffle, so shuffle explicitly before splitting.
# A fixed seed keeps the train/test membership identical across runs.
vdf = vdf.sample(frac=1, random_state=42)
vdf_train, vdf_test = vdf.ml.train_test_split(test_size=0.2)
CPU times: user 108 ms, sys: 6.36 ms, total: 115 ms Wall time: 137 ms
/Users/poudel/opt/miniconda3/envs/dsk/lib/python3.9/site-packages/vaex/ml/__init__.py:31: UserWarning: Make sure the DataFrame is shuffled warnings.warn('Make sure the DataFrame is shuffled')
vdf_train['LDL'].isna().sum()
array(0)
vdf_train['LDL'].isna().sum() / len(vdf_train)
0.0
vdf_train.columns
<vaex.dataframe.ColumnProxy at 0x7f960bb3e280>
list(vdf_train.columns)[:5]
['SampleID', '__existing_diabetes', 'incident_diabetes', 'diabetes_time', 'age']
# get_column_names() is used instead of list(vdf_train.columns) because the
# latter may still list dropped features under a hidden "__xxx" name.
cols = [name for name in vdf_train.get_column_names() if name != target]
print(target)
print(cols[:20])
incident_diabetes ['SampleID', 'diabetes_time', 'age', 'male', 'BMI', 'HDL', 'LDL', 'trig', 'SBP', 'DBP', 'hypertension', 'fasting', 'fasting_glucose', 'fasting_insulin', 'HbA1c', 'current_smoker', 'ex_smoker', 'exercise', 'healthy_vegetables', 'junk_food']
# Collect any hidden (double-underscore-prefixed) names that are still
# present in `cols`, and display them.
cols_dropped = list(filter(lambda name: name.startswith('__'), cols))
cols_dropped
[]
# Candidate training columns: everything in `cols` that is not a hidden
# (dropped) column. Print the non-"mtb" ones, sorted, for a quick look.
cols_train = [name for name in cols if name not in cols_dropped]
print(sorted(name for name in cols_train if not name.startswith('mtb')))
['BMI', 'DBP', 'HDL', 'HbA1c', 'LDL', 'SBP', 'SampleID', 'age', 'current_smoker', 'diabetes_time', 'ex_smoker', 'exercise', 'fasting', 'fasting_glucose', 'fasting_insulin', 'healthy_vegetables', 'hypertension', 'junk_food', 'male', 'total_fiber', 'trig']
# Remove the row identifier and the label from the feature list.
# NOTE(review): 'diabetes_time' stays in cols_train; it looks outcome-derived
# (possible target leakage) — confirm it is a legitimate feature.
cols_train = [name for name in cols_train
              if name != 'SampleID' and name != target]
assert target not in cols_train
%%time
# Drop every feature whose fraction of missing values in the TRAINING split
# exceeds 50%. The column is removed from both splits and from `cols_train`
# so that later steps only see the surviving features.
n_train_rows = len(vdf_train)  # loop-invariant, so computed once
# Iterate over a snapshot: `cols_train` is mutated inside the loop, and
# removing from the list being iterated makes the iterator skip the element
# that follows each removed one (that column would never be checked).
for c in tqdm(list(cols_train)):
    # Guards kept from the original: skip a column literally named 'nan' and
    # anything that is not present in the training frame.
    if c == 'nan' or c not in vdf_train.columns:
        continue
    missing_fraction = vdf_train[c].isna().sum() / n_train_rows
    if missing_fraction > 0.5:
        vdf_train = vdf_train.drop(c)
        vdf_test = vdf_test.drop(c)
        print(c, "is dropped.")
        cols_train.remove(c)
26%|██▌ | 12/47 [1:11:20<3:26:37, 354.21s/it]
fasting_glucose is dropped.
34%|███▍ | 16/47 [1:34:51<3:02:23, 353.02s/it]
%%time
# Impute missing values with the per-column median of the TRAINING split.
# The same training statistic is applied to the test split so that both
# splits are transformed identically and nothing is estimated from test data.
# NOTE:
# An earlier run raised
# NameError: Column or variable 'nan' does not exist.
for c in tqdm(cols_train):
    # Guards kept from the original: skip a column literally named 'nan' and
    # anything that is not present in the training frame.
    if c == 'nan' or c not in vdf_train.columns:
        continue
    # percentage=50.0 is the median; the previous value of 60.0 computed the
    # 60th percentile, contradicting the stated intent.
    median = vdf_train.percentile_approx(expression=c, percentage=50.0)
    # NOTE(review): presumably the NameError above came from passing a NaN
    # fill value (e.g. an all-missing column) — TODO confirm. Filling NaN
    # with NaN is a no-op anyway, so such columns are skipped.
    if np.all(np.isnan(median)):
        continue
    vdf_train[c] = vdf_train[c].fillna(value=median)
    vdf_test[c] = vdf_test[c].fillna(value=median)
%%time
# Fit the robust scaler on the training split only, then apply the fitted
# scaler to both splits; its outputs are named with the 'scaled_' prefix.
scaler = vaex.ml.RobustScaler(features=cols_train, prefix='scaled_')
vdf_train = scaler.fit_transform(vdf_train)
vdf_test = scaler.transform(vdf_test)
vdf_train.head()
import lightgbm as lgb
from vaex.ml.sklearn import Predictor

# Train on the scaled columns only (names selected by the 'scaled_' pattern).
features = vdf_train.get_column_names(regex='scaled_')
features[:5]
assert target not in features
target

# Wrap a default LightGBM classifier in vaex's sklearn Predictor so it can be
# fitted on, and applied to, vaex DataFrames directly.
model = lgb.LGBMClassifier()
predictor = Predictor(
    model=model,
    features=features,
    target=target,
)
predictor.fit(vdf_train)

# Apply the fitted predictor to the held-out split.
vdf_test = predictor.transform(vdf_test)
# Terminology: model = booster (the LightGBM classifier created above);
# predictor = vaex_model (the vaex Predictor that wraps it).