In this project, we predict the probability that an auto insurance policy holder files a claim. This is a binary classification problem.
We have more than half a million records and 59 columns, including several pre-calculated (calc) features.
Feature naming conventions:
- binary features: suffix _bin
- categorical features: suffix _cat
- continuous or ordinal features: prefixes ind, reg, car, calc
- missing values: encoded as -1

Full forms of the prefixes:
- ind = individual
- reg = registration
- car = car
- calc = calculated

The target column signifies whether or not a claim was filed for that policy holder.
From the Lorenz-curve graph on Wikipedia, the Gini coefficient is G = A / (A + B), where A is the area between the line of equality and the Lorenz curve, and B is the area under the Lorenz curve. The Gini index varies between 0 and 1. In the classic income example there are only two binary options: rich and poor.
- x-axis: cumulative share of people
- y-axis: cumulative share of total income
- 0 = complete equality of wealth
- 1 = complete inequality of wealth
For this competition:
- 0 = random guessing
- 1 = maximum score (note that 2*1 - 1 = 1 when the maximum AUC is 1)

If we calculate the Gini from gini = 2*AUC - 1, it has range (-1, 1).

For AUC:
- worst binary classifier: AUC = 0.5
- perfect binary classifier: AUC = 1

If the AUC is below 0.5, simply invert the predictions (0 <==> 1); the ROC AUC score will then lie between 0.5 and 1.0.
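A minimal sketch of these two facts using sklearn's roc_auc_score (the toy arrays are illustrative, not from the dataset):

from sklearn.metrics import roc_auc_score
import numpy as np

y_true = np.array([0, 0, 1, 1])
y_prob = np.array([0.1, 0.4, 0.35, 0.8])

auc  = roc_auc_score(y_true, y_prob)  # 0.75 on this toy data
gini = 2 * auc - 1                    # 0.50, always in the range (-1, 1)
print(auc, gini)

# a classifier with AUC < 0.5 can be inverted: score with 1-p instead of p
print(1 - roc_auc_score(y_true, 1 - y_prob))  # recovers 0.75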
import os
import time
import gc
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
from pprint import pprint
%matplotlib inline
time_start_notebook = time.time()
SEED=100
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
from sklearn.model_selection import StratifiedKFold
%%capture
# Google colab (%%capture must be the first line of the cell;
# it suppresses the cell's printed output)
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ## data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ## image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ## output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/Porto/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    # extra modules:
    # boruta shuffles the values within each column and fits a
    # random forest (bagging) to make sure a feature's score is
    # not due to noise.
    # https://github.com/scikit-learn-contrib/boruta_py
    !pip install Boruta

    print('Environment: Google Colaboratory.')
else:
    # local fallback so the data/image/output dirs are defined outside Colab
    dat_dir, img_dir, out_dir = 'data/', 'images/', 'outputs/Porto/'
    for d in (img_dir, out_dir):
        os.makedirs(d, exist_ok=True)
# NOTE: If we update modules in Colab, we need to restart the runtime.
df_eval = pd.DataFrame({'Model': [],
'Description':[],
'Accuracy':[],
'Precision':[],
'Recall':[],
'F1':[],
'AUC':[],
'NormalizedGini': []
})
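This empty frame acts as a model-evaluation tracker. A hedged sketch of how a row might be appended later (the metric values are placeholders, not results from this notebook):

# placeholder metric values, for illustration only
df_eval = pd.concat([df_eval, pd.DataFrame([{
    'Model': 'LGBMClassifier', 'Description': 'baseline',
    'Accuracy': 0.0, 'Precision': 0.0, 'Recall': 0.0,
    'F1': 0.0, 'AUC': 0.0, 'NormalizedGini': 0.0}])],
    ignore_index=True)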
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/'
'Porto_seguro_safe_driver_prediction/train.csv.zip?raw=true',compression='zip')
print(df.shape)
# faster runtime
# df = df.sample(frac=0.01,random_state=SEED)
df.head()
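To check the naming conventions described above, a quick tally of the column groups (assumes df is the training frame just loaded):

cols = df.columns
print('binary (_bin)     :', sum(c.endswith('_bin') for c in cols))
print('categorical (_cat):', sum(c.endswith('_cat') for c in cols))
print('columns with -1   :', int((df == -1).any().sum()))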
target = 'target'
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
df.drop(target,axis=1),df[target],
test_size=0.2,random_state=SEED, stratify=df[target])
# backup and delete id
cols_drop = ['id']
train_id = df_Xtrain[cols_drop]
test_id = df_Xtest[cols_drop]
df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)
Xtrain = df_Xtrain.to_numpy()
ytrain = ser_ytrain.to_numpy().ravel()
Xtest = df_Xtest.to_numpy()
ytest = ser_ytest.to_numpy().ravel()
# sanity check: a finite sum means no NaNs (strings would raise a TypeError)
print(Xtrain.sum().sum())
pd.set_option('display.max_columns',250)
df_Xtrain.head()
# df_Xtrain.columns # make sure there are no id and index
Xtr = Xtrain
Xtx = Xtest
ytr = ytrain
ytx = ytest
print(Xtr.shape, Xtx.shape)
ser_ytest.value_counts(normalize=True)
def eval_gini(y_true, y_prob):
    """Normalized Gini coefficient (equals 2*AUC - 1), in O(n log n)."""
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]  # sort true labels by predicted prob
    ntrue = 0   # running count of positives seen (from the top score down)
    gini = 0
    delta = 0   # running count of negatives seen
    n = len(y_true)
    for i in range(n - 1, -1, -1):       # walk from highest to lowest score
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta              # negatives ranked above this positive
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini
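A quick sanity check of eval_gini against the 2*AUC - 1 identity (random toy data, not the competition data):

from sklearn.metrics import roc_auc_score
rng = np.random.RandomState(SEED)
yt = rng.randint(0, 2, 1000)
yp = rng.rand(1000)
print(eval_gini(yt, yp), 2 * roc_auc_score(yt, yp) - 1)  # should match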
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import joblib
boruta_pkl = out_dir + 'porto_boruta.pkl'
print(boruta_pkl)
# we need to choose good hyperparameters ourselves;
# e.g. n_estimators=100 with max_depth=10 did not select enough features.
if not os.path.isfile(boruta_pkl):
    rfc = RandomForestClassifier(n_estimators=200, n_jobs=-1,
                                 class_weight='balanced', max_depth=6)
    boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2,
                               random_state=SEED)
    time_start_boruta = time.time()
    boruta_selector.fit(Xtrain, ytrain)
    print('Boruta took {:.1f} min'.format((time.time() - time_start_boruta) / 60))
    joblib.dump(boruta_selector, boruta_pkl)
else:
    boruta_selector = joblib.load(boruta_pkl)
# number of selected features
print('Total :', df_Xtrain.shape[1])
print('Selected :', boruta_selector.n_features_)
print('Excluded :', df_Xtrain.shape[1] - boruta_selector.n_features_)
df_feat = pd.DataFrame({'feature': df_Xtrain.columns})
df_feat['rank'] = boruta_selector.ranking_
df_feat['support'] = boruta_selector.support_
df_feat['support_weak'] = boruta_selector.support_weak_
df_feat = df_feat.sort_values('rank', ascending=True).reset_index(drop=True)
df_feat.head()
cols_boruta_selected = df_Xtrain.columns[boruta_selector.support_]
print(cols_boruta_selected)
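If we wanted to proceed with only the Boruta-confirmed features, a minimal sketch (df_Xtrain_sel and df_Xtest_sel are hypothetical names, not used elsewhere in this notebook):

df_Xtrain_sel = df_Xtrain[cols_boruta_selected]
df_Xtest_sel  = df_Xtest[cols_boruta_selected]
print(df_Xtrain_sel.shape, df_Xtest_sel.shape)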
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
clf = LGBMClassifier(num_leaves=1024,
max_depth=6,
n_estimators=500,
subsample=.632,
colsample_bytree=.5,
n_jobs=-1)
n_splits = 2
n_runs = 5
# null importances: feature importances measured after the target has been
# shuffled, i.e. after the X-y relationship is destroyed
df_imp = np.zeros((len(df_Xtrain.columns), n_splits * n_runs))
idx = np.arange(len(ytrain))
time_start = time.time()
for run in range(n_runs):
    # Shuffle the target only (each run applies the same permutation on top
    # of the previous one, so the shuffles still differ across runs)
    np.random.seed(SEED)
    np.random.shuffle(idx)
    ser_perm_train = ser_ytrain.iloc[idx]

    # Create a new split
    folds = StratifiedKFold(n_splits, shuffle=True, random_state=SEED)
    oof = np.empty(len(df_Xtrain))
    for fold_, (idx_tr, idx_vd) in enumerate(
            folds.split(ser_perm_train, ser_perm_train)):
        print('Fold: ', fold_)
        df_Xtr, ser_perm_tr = df_Xtrain.iloc[idx_tr], ser_perm_train.iloc[idx_tr]
        df_Xvd, ser_perm_vd = df_Xtrain.iloc[idx_vd], ser_perm_train.iloc[idx_vd]

        # Train classifier on the permuted target
        clf.fit(df_Xtr, ser_perm_tr)

        # Keep feature importances for this fold and run
        df_imp[:, n_splits * run + fold_] = clf.feature_importances_

        # Update OOF for gini score display
        oof[idx_vd] = clf.predict_proba(df_Xvd)[:, 1]

    # time taken
    time_taken = time.time() - time_start
    h, m = divmod(time_taken, 60 * 60)
    print(' Time taken : {:.0f} hr '
          '{:.0f} min {:.0f} secs\n'.format(h, *divmod(m, 60)))
    print("Run %2d OOF score (target shuffled) : %.6f" %
          (run, eval_gini(ser_perm_train, oof)))
# benchmark importances: shuffle the rows of both X and y with the same
# permutation, so the X-y pairing stays intact
df_bench_imp = np.zeros((len(df_Xtrain.columns), n_splits * n_runs))

# default boosting_type='gbdt'; if we used 'rf' instead, we would have to
# set bagging_freq=1 and bagging_fraction < 1.0
# https://github.com/microsoft/LightGBM/issues/1333
clf = LGBMClassifier(num_leaves=1024,
                     max_depth=6,
                     n_estimators=500,
                     subsample=.632,
                     colsample_bytree=.5,
                     n_jobs=-1)
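For reference, a hedged sketch of the 'rf' configuration the comment above refers to (these parameter values are illustrative assumptions, not tuned for this dataset):

clf_rf = LGBMClassifier(boosting_type='rf',
                        bagging_freq=1,          # required for rf mode
                        bagging_fraction=0.632,  # must be < 1.0 for rf mode
                        feature_fraction=0.5,
                        n_estimators=200,
                        n_jobs=-1)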
time_start = time.time()
for run in range(n_runs):
    # Shuffle target AND dataset with the same permutation
    np.random.seed(SEED)
    np.random.shuffle(idx)
    ser_perm_ytrain = ser_ytrain.iloc[idx]
    df_perm_Xtrain = df_Xtrain.iloc[idx]

    # Create a new split
    folds = StratifiedKFold(n_splits, shuffle=True, random_state=SEED)
    oof = np.empty(len(df_Xtrain))
    for fold_, (idx_tr, idx_vd) in enumerate(
            folds.split(ser_perm_ytrain, ser_perm_ytrain)):
        print('Fold: ', fold_)
        df_Xtr, ser_perm_ytr = df_perm_Xtrain.iloc[idx_tr], ser_perm_ytrain.iloc[idx_tr]
        df_Xvd, ser_perm_yvd = df_perm_Xtrain.iloc[idx_vd], ser_perm_ytrain.iloc[idx_vd]

        # Train classifier (X-y pairing is intact, so these are real importances)
        clf.fit(df_Xtr, ser_perm_ytr)

        # Keep feature importances for this fold and run
        df_bench_imp[:, n_splits * run + fold_] = clf.feature_importances_

        # Update OOF for gini score display
        oof[idx_vd] = clf.predict_proba(df_Xvd)[:, 1]

    # time taken
    time_taken = time.time() - time_start
    h, m = divmod(time_taken, 60 * 60)
    print(' Time taken : {:.0f} hr '
          '{:.0f} min {:.0f} secs\n'.format(h, *divmod(m, 60)))
    print("Run %2d OOF score (target+data shuffled): %.6f" %
          (run, eval_gini(ser_perm_ytrain, oof)))
bench_mean = df_bench_imp.mean(axis=1)  # benchmark run (pairing intact)
perm_mean = df_imp.mean(axis=1)         # null run (target shuffled)
df_compare = pd.DataFrame({'feature': df_Xtrain.columns,
                           'feat_imp_T_shuff': perm_mean,    # null importances
                           'feat_imp_TD_shuff': bench_mean}) # benchmark importances
# ratio of benchmark to null importance: large values mean a feature's
# importance clearly exceeds what it gets on a shuffled target
df_compare['ratio'] = (df_compare['feat_imp_TD_shuff'] /
                       df_compare['feat_imp_T_shuff'])
df_compare.sort_values('ratio',ascending=False).head()
df_compare.sort_values('feat_imp_TD_shuff',ascending=False).head()
df_compare.sort_values('feat_imp_TD_shuff',ascending=False).tail(10)
df_compare.sort_values('feat_imp_T_shuff',ascending=False).tail(10)
# not all of the bottom-ranked features are calc features.
# in this dataset the calc features are known to be useless, yet the
# method still failed to recognize some of them.
#
# tree methods look for the combination of features that gives the best
# score; that does not necessarily mean they rely only on the truly
# informative features by default.
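A hedged sketch of how this comparison could be turned into a feature filter; the 1.5 cutoff is an arbitrary assumption, not a value from this notebook:

# features whose benchmark importance is not well above their null importance
cols_noise = df_compare.loc[df_compare['ratio'] < 1.5, 'feature'].tolist()
print(len(cols_noise), 'features look no better than noise:', cols_noise[:10])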
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))