import numpy as np
import pandas as pd

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
%matplotlib inline

import os
import time

# random state
SEED = 0
RNG = np.random.RandomState(SEED)


# my personal library
from bhishan import bp
from bhishan.bp import show_method_attributes


# import big data module vaex
import vaex as vx
import vaex.ml
import vaex.ml.lightgbm
import lightgbm as lgb

vx.__version__

'1.0.0-beta.6'


from tqdm import tqdm, tqdm_notebook, tnrange
import ipywidgets as widgets


import bqplot as bq


# Empty table for collecting evaluation metrics, one column per metric.
df_eval = pd.DataFrame({
    column: []
    for column in (
        'model_name',
        'desc',
        'f1',
        'weightedPrecision',
        'weightedRecall',
        'accuracy',
        'areaUnderROC',
        'areaUnderPR',
    )
})

df_eval


%%bash
# unzip ../data/raw/creditcard.csv.zip -d ../data/raw/
ls ../data/raw

creditcard.csv
creditcard.csv.zip
creditcard.hdf5


ifile = "../data/raw/creditcard.csv"


%%bash -s "$ifile"
# `ifile` is a Python variable and is not visible inside the bash subprocess,
# so it is handed over on the magic line and read here as "$1". With the
# variable unset, `head` would get no file argument and read stdin instead.
head -3 "$1"


%%bash -s "$ifile"
# `ifile` is a Python variable and is not visible inside the bash subprocess,
# so it is handed over on the magic line and read here as "$1". With the
# variable unset, `head` reads the rest of this script from stdin, so the
# `tail` line below is printed as data instead of being executed.
# Print the first comma-separated field of the first and last five lines.
head -5 "$1" | cut -d ',' -f 1
tail -5 "$1" | cut -d ',' -f 1

tail -5 $ifile | cut -d '


bp.show_method_attributes(vx,4)

Object Type: <class 'module'>


# Read the CSV at `ifile` with vaex and display the resulting DataFrame.
dfv = vx.read_csv(ifile)
dfv


# show_method_attributes(dfv,5)


show_method_attributes(dfv,5,inside='plot')

Object Type: <class 'vaex.dataframe.DataFrameArrays'>


# dfv.shape # AttributeError


len(dfv)

284807


len(dfv.columns)

32


np.array(dfv.get_column_names())

array(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
       'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18',
       'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27',
       'V28', 'Amount', 'Class', 'index'], dtype='<U6')


np.array(list(dfv.columns))

array(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
       'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18',
       'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27',
       'V28', 'Amount', 'Class', 'index'], dtype='<U6')


dfv.info()


dfv.describe()


# Re-point `ifile` at the .hdf5 counterpart of the CSV path and export the
# DataFrame there; later cells open this file instead of the CSV.
# NOTE(review): virtual=True presumably also writes virtual columns — confirm
# against the vaex export documentation.
ifile = ifile.replace('.csv','.hdf5')
dfv.export_hdf5(ifile, virtual=True)


!ls ../data/raw

creditcard.csv     creditcard.csv.zip creditcard.hdf5


dfv = vx.open(ifile)


dfv.head(1)


np.array(list(dfv.columns))

array(['Amount', 'Class', 'Time', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14',
       'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22',
       'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6',
       'V7', 'V8', 'V9', 'index'], dtype='<U6')


# Add log(1 + x) transformed versions of the Amount and Time columns.
dfv['log1p_Amount'] = np.log1p(dfv['Amount'])
# Fixed: this column was previously derived from 'Amount', which made it an
# exact duplicate of log1p_Amount.
dfv['log1p_Time'] = np.log1p(dfv['Time'])


np.array(list(dfv.columns))

array(['Amount', 'Class', 'Time', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14',
       'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22',
       'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6',
       'V7', 'V8', 'V9', 'index'], dtype='<U6')


dfv[['log1p_Amount','log1p_Time']].head(2)


dfv.categorize(column='Class')


# Register a named selection 'class_0' covering the rows where Class == 0.
dfv.select(dfv['Class']==0, name='class_0')

# Mean of Amount restricted to that selection (one value per listed selection).
dfv.mean(dfv.Amount, selection=['class_0'])

array([88.29102242])


ax = dfv['Class'].value_counts().plot.bar()
bp.add_text_barplot(ax)


# Column names 'V1' through 'V28'.
pca_vars = list(map('V{}'.format, range(1, 29)))
dfv[pca_vars].describe()


# dfv.mean?


dfv.Amount.mean()

array(88.34961925)


# Column combinations (e.g. Time/V1) and, for each of them, the correlation
# and mutual information computed by vaex.
subspaces = dfv.combinations()
names = list(map("_".join, subspaces))
correlations = dfv.correlation(subspaces)
mutual_informations = dfv.mutual_information(subspaces)

# Collect the statistics in a pandas table, one row per combination.
df_corr = pd.DataFrame(
    dict(names=names, corr=correlations, mutual_info=mutual_informations)
)

df_corr.head()


dfv.plot("Amount", "V1",
         what=["mean(Amount)", "sum(V1)", "correlation(Amount, V1)"],
         title="Different statistics",
         figsize=(10,5));

plt.grid(False)


dfv.plot([["Amount", "V1"], ["Amount", "V2"]],
         what=['mean(Amount)'],
         title="V1 and V2",
         figsize=(10,4));

/Users/poudel/miniconda3/envs/xx/lib/python3.7/site-packages/vaex/viz/mpl.py:779: MatplotlibDeprecationWarning:

Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.


# dfv.plot("Amount", "V1",
#          selection="log1p(Amount)<1",
#          limits=[0,500]
#         );


dfv.scatter("Amount", "V1",
            selection="Amount < 100",
            alpha=0.5,
            c=['r','g'],
            c_expr='Class',
            length_check=False # if row > 50k we need this
           );


dfv.plot("Amount", "V1", f="log1p");


dfv.scatter("Amount", "V1", 
            selection="Amount < 100",
            c="green",
            alpha=0.5,
            length_check=False,
           );


# sns.catplot(x=dfv['Class'].values , kind='count' , palette=['r','g'],)
# This does not work
# TypeError: object of type 'NoneType' has no len()


sns.barplot(x=dfv['Class'].values, y=dfv['Amount'].values)

<matplotlib.axes._subplots.AxesSubplot at 0x132027950>


# Aggregate per Class value: total Amount and the means of V1 and V2.
dfv_per_class = dfv.groupby(by=dfv['Class']).agg({
    'Amount': 'sum',
    'V1': 'mean',
    'V2': 'mean'
})


dfv_per_class.Amount.values

array([25102462.03998364,    60127.97      ])


import vaex.ml


bp.show_method_attributes(vaex.ml)

Object Type: <class 'module'>


# dfv.sample?


## Remove Nans
dfv = dfv.dropna()


# shuffle the data
dfv = dfv.sample(frac=1,random_state=SEED)


# 80/20 positional split: the first 80% of rows train, the remainder tests.
split_at = int(len(dfv) * 0.8)
dfv_train, dfv_test = dfv[:split_at], dfv[split_at:]


np.array(dfv.get_column_names())

array(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
       'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18',
       'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27',
       'V28', 'Amount', 'Class', 'index', 'log1p_Amount', 'log1p_Time'],
      dtype='<U12')


# Model inputs: V1..V28 followed by the two log-transformed columns.
features = ['V{}'.format(i) for i in range(1, 29)]
features += ['log1p_Amount', 'log1p_Time']

# features


target = 'Class'


bp.show_method_attributes(dfv,4, inside='na')

Object Type: <class 'vaex.dataframe.DataFrameArrays'>


dfv_train['Class'].value_counts(progress=True)

[########################################]:  100.00% elapsed time  :        0s =  0.0m =  0.0h

0    227441
1       404
dtype: int64


# Booster parameters handed to the LightGBM model below.
params = dict(
    learning_rate=0.1,
    max_depth=5,
    colsample_bytree=0.8,
    subsample=0.8,
    reg_alpha=0,
    reg_lambda=1,
    min_child_weight=1,
    objective='binary',
    random_state=SEED,
    n_jobs=-1,
)


# The target column is an attribute of the model: fit() builds its training
# dataset from `self.target`, so it has to be supplied to the constructor.
# Passing it only to fit() left `self.target` empty and made vaex try to
# evaluate an empty expression (SyntaxError: unexpected EOF while parsing).
bst = vaex.ml.lightgbm.LightGBMModel(features=features,
                                     target=target,
                                     params=params,
                                     num_boost_round=100)


dfv_train.head(1)


# Train on the training split; the target is taken from the model itself.
bst.fit(dfv_train)

Traceback (most recent call last):

  File "/Users/poudel/miniconda3/envs/tf2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)

  File "<ipython-input-128-8ce16207dbeb>", line 1, in <module>
    bst.fit(dfv_train, target=target)

  File "/Users/poudel/miniconda3/envs/xx/lib/python3.7/site-packages/vaex/ml/lightgbm.py", line 117, in fit
    dtrain = VaexDataset(df, self.target, features=self.features)

  File "/Users/poudel/miniconda3/envs/xx/lib/python3.7/site-packages/vaex/ml/lightgbm.py", line 221, in __init__
    self.label_data = self.df.evaluate(label)

  File "/Users/poudel/miniconda3/envs/xx/lib/python3.7/site-packages/vaex/dataframe.py", line 4848, in evaluate
    value = scope.evaluate(expression)

  File "/Users/poudel/miniconda3/envs/xx/lib/python3.7/site-packages/vaex/scopes.py", line 92, in evaluate
    result = eval(expression, expression_namespace, self)

  File "<string>", line unknown
    
    ^
SyntaxError: unexpected EOF while parsing


# ypreds = bst.predict(dfv_train)


from IPython.display import display


# display(ypreds)


# create virtual column
# dfv_train = bst.transform(dfv_train)

# dfv_train.head(2)


import joblib
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score,precision_score


# # model evaluation
# average = 'binary'
# row_eval = [model_name,desc, 
#             accuracy_score(ytx, ypreds),
#             precision_score(ytx, ypreds, average=average),
#             recall_score(ytx, ypreds, average=average),
#             f1_score(ytx, ypreds, average=average),
#             roc_auc_score(ytx, ypreds),
#             ]

# df_eval.loc[len(df_eval)] = row_eval
# df_eval = df_eval.drop_duplicates()
# time_taken = time.time() - time_start
# print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
# display(df_eval)

	0	1	2	3
0	BinnerTime	format	kld	set_log_level_exception
1	GroupBy	from_arrays	loader	set_log_level_info
2	Grouper	from_arrow_table	logger	set_log_level_off
3	add_namespace	from_ascii	logging	set_log_level_warning
4	agg	from_astropy_table	multithreading	settings
5	aliases	from_csv	open	six
6	app	from_dict	open_many	stat
7	astro	from_items	parse_qs	string_column
8	column	from_json	pkg_resources	strings
9	concat	from_pandas	print_function	superagg
10	dataframe	from_samp	promise	superstrings
11	dataset	from_scalars	read_csv	superutils
12	dataset_mmap	functions	read_csv_and_convert	tasks
13	datasets	glob	reduce	urlparse
14	delayed	grids	register_dataframe_accessor	utils
15	entry	groupby	register_function	vaex
16	events	hash	scopes	vaexfast
17	example	hdf5	selections	version
18	execution	image	serialize	viz
19	expression	import_script	server	vrange
20	expresso	json	set_log_level_debug	zeldovich
21	file

#	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount	Class	index
0	0.0	-1.3598071336738	-0.0727811733098497	2.53634673796914	1.37815522427443	-0.33832076994251803	0.462387777762292	0.239598554061257	0.0986979012610507	0.363786969611213	0.0907941719789316	-0.551599533260813	-0.617800855762348	-0.991389847235408	-0.31116935369987897	1.46817697209427	-0.47040052525947795	0.20797124192924202	0.0257905801985591	0.403992960255733	0.251412098239705	-0.018306777944153	0.277837575558899	-0.110473910188767	0.0669280749146731	0.12853935827352803	-0.189114843888824	0.13355837674038698	-0.0210530534538215	149.62	0	0
1	0.0	1.1918571113148602	0.26615071205963	0.16648011335321	0.448154078460911	0.0600176492822243	-0.0823608088155687	-0.0788029833323113	0.0851016549148104	-0.255425128109186	-0.16697441400461402	1.6127266610547901	1.06523531137287	0.48909501589608	-0.143772296441519	0.635558093258208	0.463917041022171	-0.114804663102346	-0.18336127012399397	-0.14578304132525902	-0.0690831352230203	-0.225775248033138	-0.6386719527718511	0.10128802125323402	-0.33984647552912706	0.167170404418143	0.125894532368176	-0.00898309914322813	0.0147241691924927	2.69	0	1
2	1.0	-1.35835406159823	-1.3401630747360902	1.77320934263119	0.3797795930343279	-0.503198133318193	1.80049938079263	0.7914609564504219	0.24767578658899103	-1.5146543226058302	0.207642865216696	0.6245014594248951	0.06608368526883099	0.7172927314108309	-0.165945922763554	2.34586494901581	-2.8900831944423104	1.10996937869599	-0.12135931319588801	-2.26185709530414	0.524979725224404	0.247998153469754	0.771679401917229	0.9094122623477191	-0.689280956490685	-0.3276418337352511	-0.139096571514147	-0.0553527940384261	-0.0597518405929204	378.66	0	2
3	1.0	-0.9662717115720871	-0.185226008082898	1.79299333957872	-0.863291275036453	-0.0103088796030823	1.24720316752486	0.23760893977178	0.377435874652262	-1.38702406270197	-0.0549519224713749	-0.22648726383540102	0.178228225877303	0.507756869957169	-0.28792374549456	-0.631418117709045	-1.0596472454324999	-0.684092786345479	1.96577500349538	-1.2326219700892	-0.208037781160366	-0.108300452035545	0.0052735967825345295	-0.190320518742841	-1.1755753318632098	0.647376034602038	-0.22192884445840697	0.0627228487293033	0.0614576285006353	123.5	0	3
4	2.0	-1.1582330934952298	0.8777367548484508	1.548717846511	0.40303393395512105	-0.40719337731165295	0.0959214624684256	0.5929407453855451	-0.27053267719228197	0.8177393082352941	0.7530744319763539	-0.8228428779463629	0.53819555014995	1.3458515932154	-1.11966983471731	0.175121130008994	-0.451449182813529	-0.237033239362776	-0.0381947870352842	0.803486924960175	0.40854236039275804	-0.009430697132329191	0.7982784945897099	-0.13745807961906303	0.14126698382476902	-0.20600958761975602	0.502292224181569	0.219422229513348	0.21515314749920603	69.99	0	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
284,802	172786.0	-11.881117885432301	10.0717849710003	-9.83478345739033	-2.0666556845941297	-5.3644727809758495	-2.6068373309456	-4.91821543115252	7.305334020798	1.9144282734458	4.35617041320691	-1.59310526246153	2.7119407910571702	-0.68925560917964	4.62694202525016	-0.92445871482528	1.10764060095394	1.9916911070624297	0.5106323291306479	-0.6829196803569161	1.4758291346555001	0.21345410843735896	0.111863735978609	1.01447989719391	-0.509348453168509	1.4368069070214	0.250034279569581	0.943651171507532	0.8237309614865022	0.77	0	284802
284,803	172787.0	-0.7327886706589559	-0.0550804899173326	2.03502974528243	-0.7385885843874399	0.868229398914682	1.05841527222565	0.0243296959175797	0.294868698501783	0.584800017281683	-0.9759260633225079	-0.15018884710327599	0.9158019144035551	1.21475584849424	-0.6751429558095761	1.16493090944588	-0.7117573499788811	-0.025692855268572002	-1.22117885840624	-1.54555608554509	0.0596158998872689	0.214205341747019	0.924383584903381	0.0124630383316793	-1.01622566867336	-0.606623985854728	-0.395255065710324	0.0684724700405593	-0.0535273892010011	24.79	0	284803
284,804	172788.0	1.91956500980048	-0.301253845990644	-3.24963981406834	-0.55782812475002	2.6305151201154704	3.03126009781428	-0.296826527116156	0.708417184967134	0.432454047632915	-0.48478175575102894	0.41161373679432706	0.0631188625621446	-0.183698687930443	-0.510601843577723	1.32928351250595	0.140715981685477	0.313501786950651	0.39565247933416503	-0.5772518425011129	0.00139597028995166	0.23204503592539702	0.57822900992263	-0.0375008550221367	0.640133881346421	0.265745453243744	-0.0873705959041059	0.00445477213829229	-0.0265608285615222	67.88	0	284804
284,805	172788.0	-0.24044004968094698	0.530482513118839	0.702510230095103	0.689799168040973	-0.377961134444982	0.6237077221476801	-0.68617998628885	0.679145459790659	0.3920867124659721	-0.399125651432835	-1.9338488150571298	-0.962886142890271	-1.0420816559119102	0.4496244431660011	1.9625631206657699	-0.60857712704613	0.509928460110321	1.11398059049908	2.8978487733431297	0.127433515805355	0.265244916386865	0.8000487414981391	-0.16329794440665898	0.12320524374250802	-0.5691588641585971	0.546668462188323	0.108820734744839	0.10453282147879599	10.0	0	284805
284,806	172792.0	-0.53341252200504	-0.189733337002305	0.7033373669637789	-0.506271240328258	-0.0125456787599659	-0.6496166857137919	1.5770062543762902	-0.414650407552662	0.4861795052672371	-0.915426648905893	-1.0404583352236099	-0.0315130540252157	-0.188092900791737	-0.0843164698151014	0.0413334553360658	-0.302620086427415	-0.6603766451827839	0.16742993371973	-0.256116871098099	0.38294810487506603	0.26105733079097504	0.643078437820093	0.37677701416991705	0.008797379400242021	-0.4736487038988251	-0.8182671210411758	-0.00241530880001015	0.0136489143320671	217.0	0	284806

	0	1	2	3	4
0	healpix_plot	plot1d	plot2d_tensor	plot3d	plot_widget
1	plot	plot2d_contour	plot2d_vector	plot_bq

#	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount	Class	index
0	0.0	-1.3598071336738	-0.0727811733098497	2.53634673796914	1.37815522427443	-0.33832076994251803	0.462387777762292	0.239598554061257	0.0986979012610507	0.363786969611213	0.0907941719789316	-0.551599533260813	-0.617800855762348	-0.991389847235408	-0.31116935369987897	1.46817697209427	-0.47040052525947795	0.20797124192924202	0.0257905801985591	0.403992960255733	0.251412098239705	-0.018306777944153	0.277837575558899	-0.110473910188767	0.0669280749146731	0.12853935827352803	-0.189114843888824	0.13355837674038698	-0.0210530534538215	149.62	0	0
1	0.0	1.1918571113148602	0.26615071205963	0.16648011335321	0.448154078460911	0.0600176492822243	-0.0823608088155687	-0.0788029833323113	0.0851016549148104	-0.255425128109186	-0.16697441400461402	1.6127266610547901	1.06523531137287	0.48909501589608	-0.143772296441519	0.635558093258208	0.463917041022171	-0.114804663102346	-0.18336127012399397	-0.14578304132525902	-0.0690831352230203	-0.225775248033138	-0.6386719527718511	0.10128802125323402	-0.33984647552912706	0.167170404418143	0.125894532368176	-0.00898309914322813	0.0147241691924927	2.69	0	1
2	1.0	-1.35835406159823	-1.3401630747360902	1.77320934263119	0.3797795930343279	-0.503198133318193	1.80049938079263	0.7914609564504219	0.24767578658899103	-1.5146543226058302	0.207642865216696	0.6245014594248951	0.06608368526883099	0.7172927314108309	-0.165945922763554	2.34586494901581	-2.8900831944423104	1.10996937869599	-0.12135931319588801	-2.26185709530414	0.524979725224404	0.247998153469754	0.771679401917229	0.9094122623477191	-0.689280956490685	-0.3276418337352511	-0.139096571514147	-0.0553527940384261	-0.0597518405929204	378.66	0	2
3	1.0	-0.9662717115720871	-0.185226008082898	1.79299333957872	-0.863291275036453	-0.0103088796030823	1.24720316752486	0.23760893977178	0.377435874652262	-1.38702406270197	-0.0549519224713749	-0.22648726383540102	0.178228225877303	0.507756869957169	-0.28792374549456	-0.631418117709045	-1.0596472454324999	-0.684092786345479	1.96577500349538	-1.2326219700892	-0.208037781160366	-0.108300452035545	0.0052735967825345295	-0.190320518742841	-1.1755753318632098	0.647376034602038	-0.22192884445840697	0.0627228487293033	0.0614576285006353	123.5	0	3
4	2.0	-1.1582330934952298	0.8777367548484508	1.548717846511	0.40303393395512105	-0.40719337731165295	0.0959214624684256	0.5929407453855451	-0.27053267719228197	0.8177393082352941	0.7530744319763539	-0.8228428779463629	0.53819555014995	1.3458515932154	-1.11966983471731	0.175121130008994	-0.451449182813529	-0.237033239362776	-0.0381947870352842	0.803486924960175	0.40854236039275804	-0.009430697132329191	0.7982784945897099	-0.13745807961906303	0.14126698382476902	-0.20600958761975602	0.502292224181569	0.219422229513348	0.21515314749920603	69.99	0	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
284,802	172786.0	-11.881117885432301	10.0717849710003	-9.83478345739033	-2.0666556845941297	-5.3644727809758495	-2.6068373309456	-4.91821543115252	7.305334020798	1.9144282734458	4.35617041320691	-1.59310526246153	2.7119407910571702	-0.68925560917964	4.62694202525016	-0.92445871482528	1.10764060095394	1.9916911070624297	0.5106323291306479	-0.6829196803569161	1.4758291346555001	0.21345410843735896	0.111863735978609	1.01447989719391	-0.509348453168509	1.4368069070214	0.250034279569581	0.943651171507532	0.8237309614865022	0.77	0	284802
284,803	172787.0	-0.7327886706589559	-0.0550804899173326	2.03502974528243	-0.7385885843874399	0.868229398914682	1.05841527222565	0.0243296959175797	0.294868698501783	0.584800017281683	-0.9759260633225079	-0.15018884710327599	0.9158019144035551	1.21475584849424	-0.6751429558095761	1.16493090944588	-0.7117573499788811	-0.025692855268572002	-1.22117885840624	-1.54555608554509	0.0596158998872689	0.214205341747019	0.924383584903381	0.0124630383316793	-1.01622566867336	-0.606623985854728	-0.395255065710324	0.0684724700405593	-0.0535273892010011	24.79	0	284803
284,804	172788.0	1.91956500980048	-0.301253845990644	-3.24963981406834	-0.55782812475002	2.6305151201154704	3.03126009781428	-0.296826527116156	0.708417184967134	0.432454047632915	-0.48478175575102894	0.41161373679432706	0.0631188625621446	-0.183698687930443	-0.510601843577723	1.32928351250595	0.140715981685477	0.313501786950651	0.39565247933416503	-0.5772518425011129	0.00139597028995166	0.23204503592539702	0.57822900992263	-0.0375008550221367	0.640133881346421	0.265745453243744	-0.0873705959041059	0.00445477213829229	-0.0265608285615222	67.88	0	284804
284,805	172788.0	-0.24044004968094698	0.530482513118839	0.702510230095103	0.689799168040973	-0.377961134444982	0.6237077221476801	-0.68617998628885	0.679145459790659	0.3920867124659721	-0.399125651432835	-1.9338488150571298	-0.962886142890271	-1.0420816559119102	0.4496244431660011	1.9625631206657699	-0.60857712704613	0.509928460110321	1.11398059049908	2.8978487733431297	0.127433515805355	0.265244916386865	0.8000487414981391	-0.16329794440665898	0.12320524374250802	-0.5691588641585971	0.546668462188323	0.108820734744839	0.10453282147879599	10.0	0	284805
284,806	172792.0	-0.53341252200504	-0.189733337002305	0.7033373669637789	-0.506271240328258	-0.0125456787599659	-0.6496166857137919	1.5770062543762902	-0.414650407552662	0.4861795052672371	-0.915426648905893	-1.0404583352236099	-0.0315130540252157	-0.188092900791737	-0.0843164698151014	0.0413334553360658	-0.302620086427415	-0.6603766451827839	0.16742993371973	-0.256116871098099	0.38294810487506603	0.26105733079097504	0.643078437820093	0.37677701416991705	0.008797379400242021	-0.4736487038988251	-0.8182671210411758	-0.00241530880001015	0.0136489143320671	217.0	0	284806

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V22	V23	V24	V25	V26	V27	V28	Amount	Class	index
dtype	float64	float64	float64	float64	float64	float64	float64	float64	float64	float64	...	float64	float64	float64	float64	float64	float64	float64	float64	int64	int64
count	284807	284807	284807	284807	284807	284807	284807	284807	284807	284807	...	284807	284807	284807	284807	284807	284807	284807	284807	284807	284807
NA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
mean	94813.85957508067	3.919560084655042e-15	5.688174400270728e-16	-8.76907126289773e-15	2.782312291808533e-15	-1.5525630329923732e-15	2.010663493875542e-15	-1.694249132734738e-15	-1.9270277088072758e-16	-3.13702431282631e-15	...	7.959908529993057e-16	5.367589788427716e-16	4.4581115101841816e-15	1.453003365084085e-15	1.6991042900067526e-15	-3.660160614016803e-16	-1.206048852934382e-16	88.34961925087359	0.001727485630620034	142403.0
std	47488.1	1.95869	1.65131	1.51625	1.41587	1.38024	1.33227	1.23709	1.19435	1.09863	...	0.7257	0.624459	0.605646	0.521277	0.482226	0.403632	0.330083	250.12	0.0415271	82216.7
min	0	-56.4075	-72.7157	-48.3256	-5.68317	-113.743	-26.1605	-43.5572	-73.2167	-13.4341	...	-10.9331	-44.8077	-2.83663	-10.2954	-2.60455	-22.5657	-15.4301	0	0	0
max	172792	2.45493	22.0577	9.38256	16.8753	34.8017	73.3016	120.589	20.0072	15.595	...	10.5031	22.5284	4.58455	7.51959	3.51735	31.6122	33.8478	25691.2	1	284806

Table of Contents

Data Description¶

Business Problem¶

Imports¶

Useful Scripts¶

Load the data¶

pandas

Columns:

Data:

Data Processing¶

create virtual columns¶

Categorize features¶

EDA¶

Correlations¶

Scatter plots¶

Barplots¶

Modelling¶

Train Test Split¶

Modelling LightGBM using Vaex¶

Predictions¶

Model Performances¶

	names	corr	mutual_info
0	Time_V1	0.117396	0.371445
1	Time_V2	-0.010593	0.111086
2	Time_V3	-0.419618	0.283335
3	Time_V4	-0.105260	0.186229
4	Time_V5	0.173072	0.140498

	0	1	2	3	4	5	6
0	BayesianTargetEncoder	FrequencyEncoder	MaxAbsScaler	PCA	StandardScaler	generate	transformations
1	CycleTransformer	InnerNamespace	MinMaxScaler	Pipeline	WeightOfEvidenceEncoder	pipeline	vaex
2	DataFrameAccessorML	LabelEncoder	OneHotEncoder	RobustScaler	datasets	state	warnings

	0	1	2	3
0	column_names	fillna	ordinal_encode	signal_pick
1	combinations	get_column_names	rename_column	signal_selection_changed
2	dropna	length_original	signal_active_fraction_changed	signal_sequence_index_change
3	dropnan	name	signal_column_changed	signal_variable_changed

#	log1p_Amount	log1p_Time
0	5.01476	5.01476
1	1.30563	1.30563