Kernel Author:
Bhishan Poudel, Ph.D Astrophysics .

Data Description¶

In this project, we will predict the probability that an auto insurance policy holder files a claim. This is a binary classification problem.

We have more than half a million records and 59 features (including already calculated features).

binary features: _bin
categorical features: _cat
continuous or ordinal features: ind, reg, car, calc
missing values: -1

Fullforms
ind = individual
reg = registration
car = car
calc = calculated

The target column signifies whether or not a claim was filed for that policy holder.

Evaluation Metric¶

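From the Lorenz-curve graph on Wikipedia, the Gini index is G = A / (A+B), where A is the area between the line of equality and the Lorenz curve and B is the area under the Lorenz curve. The Gini index varies between 0 and 1. Here we have only binary options: rich and poor.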

x-axis= number of people (cumulative sum)
y-axis = total income (cumulative sum)

0 = complete equality of richness
1 = complete inequality of richness


This competition
0 = random guessing
1 = maximum score (also remember 2*1-1 = 1 when maximum auc is 1).

If we calculate Gini as gini = 2*auc - 1, it ranges from -1 to 1, because AUC ranges from 0 to 1. For AUC:

worst (no better than random guessing) binary classifier AUC = 0.5
perfect binary classifier AUC = 1

If AUC is less than 0.5, simply invert the predictions (0 <==> 1); we will then get a ROC AUC score between 0.5 and 1.0.

Imports¶

import os
import time
import gc
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
# Wall-clock reference taken when the notebook starts running.
time_start_notebook = time.time()

# One seed shared by every randomised step below.
SEED = 100

# Log the versions of the main libraries in use.
print([(mod.__name__, mod.__version__) for mod in (np, pd, sns, matplotlib)])

from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('matplotlib', '3.2.2')]

# Google colab

%%capture
# capture will not print in notebook

import os
import sys
# True when the 'google.colab' module has already been loaded into this
# interpreter, which is how the notebook recognises a Colab runtime.
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # Announce the detected environment.
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.

Useful Functions¶

# Empty results table: one row is appended per evaluated model variant.
df_eval = pd.DataFrame({
    column: []
    for column in ('Model', 'Description', 'Accuracy', 'Precision',
                   'Recall', 'F1', 'AUC', 'NormalizedGini')
})

Load the data¶

# The training set is read directly from the zipped CSV hosted on GitHub.
data_url = ('https://github.com/bhishanpdl/Datasets/blob/master/'
            'Porto_seguro_safe_driver_prediction/train.csv.zip?raw=true')
df = pd.read_csv(data_url, compression='zip')

# Report the table size, then preview the first rows.
print(df.shape)
df.head()

(595212, 59)

"""
Comment about file size:
The data is large, it has 595k records and 59 features.

ps = porto seguro
_bin = binary feature
_cat = categorical feature


continuous or ordinal: ind, reg, car, calc

""";

# Name of the label column in `df`; used below to separate labels from features.
target = 'target'

Data Processing¶

# Every column other than the label.
cols_all = df.columns.drop(target).to_list()

# Categorical columns have 'cat' in their name; the '*_count' columns that are
# derived from them further down are deliberately left out.
cols_cat = [name for name in cols_all
            if 'cat' in name and 'count' not in name]

# Numeric columns: names containing neither 'cat' nor 'calc'.
# At this point the list still contains 'id'.
cols_num = [name for name in cols_all
            if not ('cat' in name or 'calc' in name)]

print(cols_num)

['id', 'ps_ind_01', 'ps_ind_03', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15']

# Number of missing entries per row (missing values are coded as -1).
df['missing'] = (df == -1).sum(axis=1).astype(float)
cols_num.append('missing')

# Join all 'ind' columns of a row into one string key, e.g. "2_2_5_...".
cols_ind = [name for name in cols_all if 'ind' in name]
df['ind_concat'] = df[cols_ind].astype(str).agg('_'.join, axis=1)

# Frequency encoding: for each categorical column (and the joined key), store
# how often the row's value occurs in the whole data set.
cols_cat_count = []
for col in cols_cat + ['ind_concat']:
    count_col = f'{col}_count'
    freq = df[col].value_counts().to_dict()
    df[count_col] = df[col].map(lambda value: freq.get(value, 0))
    cols_cat_count.append(count_col)

# The joined key was only needed to compute its frequency.
df = df.drop(columns='ind_concat')

# One-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(df, columns=cols_cat, drop_first=True)

Train-test Split with Stratify¶

from sklearn.model_selection import train_test_split

# Stratified 80/20 split so both parts keep the same claim rate.
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(target, axis=1),
    df[target],
    test_size=0.2,
    random_state=SEED,
    stratify=df[target],
)

# Keep the ids aside, then remove them from the feature tables.
cols_drop = ['id']
train_id, test_id = df_Xtrain[cols_drop], df_Xtest[cols_drop]
df_Xtrain = df_Xtrain.drop(columns=cols_drop)
df_Xtest = df_Xtest.drop(columns=cols_drop)

# Plain NumPy views of the same data for estimators that expect arrays.
Xtrain, Xtest = df_Xtrain.to_numpy(), df_Xtest.to_numpy()
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()

# Sanity check: the total is a finite number only if there are no NaNs and
# no strings in the training matrix.
print(Xtrain.sum().sum())

2160094516738.3755

Training Data¶

# Show up to 250 columns so the whole feature table is visible in the preview.
pd.set_option('display.max_columns', 250)
df_Xtrain.head()

# df_Xtrain.columns # make sure there are no id and index

# Short aliases used by the modelling cells.
Xtr, Xtx = Xtrain, Xtest
ytr, ytx = ytrain, ytest

print(Xtr.shape, Xtx.shape)

(476169, 229) (119043, 229)

# Class balance of the held-out labels, as fractions rather than counts.
ser_ytest.value_counts(normalize=True)

0    0.963551
1    0.036449
Name: target, dtype: float64

Evaluation Metric¶

https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282

# @numba.jit fails and falls back to object mode.
from sklearn import metrics

def eval_gini(y_true, y_prob):
    """Return the Gini coefficient (``2 * AUC - 1``) of scores against binary labels.

    For every positive sample the function counts the negatives that are
    scored above it ("discordant" pairs). A negative with exactly the same
    score counts as half a pair, so the result does not depend on the order in
    which tied scores happen to be sorted. This matters when hard 0/1
    predictions are passed as ``y_prob``.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0 = negative, 1 = positive).
    y_prob : array-like
        Scores, probabilities or hard predictions; same length as ``y_true``.

    Returns
    -------
    float
        Gini coefficient: 1 for a perfect ranking, -1 for a perfectly inverted
        one. ``nan`` when the input is empty or holds a single class, because
        the coefficient is undefined there.
    """
    y_true = np.asarray(y_true).ravel()
    y_prob = np.asarray(y_prob).ravel()

    n = len(y_true)
    n_pos = y_true.sum()
    n_neg = n - n_pos
    if n == 0 or n_pos == 0 or n_neg == 0:
        # No (positive, negative) pair exists, so there is nothing to rank.
        return float('nan')

    # Bucket the samples by distinct score; buckets are in ascending score order.
    _, bucket = np.unique(y_prob, return_inverse=True)
    pos_per_bucket = np.bincount(bucket, weights=y_true)
    neg_per_bucket = np.bincount(bucket) - pos_per_bucket

    # Negatives scored strictly higher than each bucket.
    neg_above = n_neg - np.cumsum(neg_per_bucket)

    # Negatives inside the same bucket are ties and count as half a pair.
    discordant = np.sum(pos_per_bucket * (neg_above + 0.5 * neg_per_bucket))
    return 1 - 2 * discordant / (n_pos * n_neg)

def gini(y, pred):
    """Return ``2 * AUC - 1`` for scores ``pred`` against labels ``y``.

    The area is taken under the ROC curve built with label 1 as the
    positive class.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y, pred, pos_label=1)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    return 2 * area - 1

def gini_lgb(preds, dtrain):
    """Normalised Gini as a custom LightGBM evaluation metric.

    Two calling conventions are accepted:

    * native API (``feval``): ``gini_lgb(preds, dataset)``, where the second
      argument exposes ``get_label()``;
    * scikit-learn wrapper (``eval_metric``): ``gini_lgb(y_true, y_pred)``,
      where both arguments are plain arrays.

    The original version handled only the first form and failed as soon as
    the second argument had no ``get_label`` method.

    Returns
    -------
    tuple
        ``('gini', score, True)``; the last item marks the metric as
        higher-is-better.
    """
    if hasattr(dtrain, 'get_label'):
        # Native API: labels live inside the dataset object.
        labels, scores = dtrain.get_label(), preds
    else:
        # scikit-learn wrapper: arguments arrive as (y_true, y_pred).
        labels, scores = preds, dtrain

    y = list(labels)
    # Dividing by the Gini of a perfect ranking normalises the score.
    score = gini(y, scores) / gini(y, y)
    return 'gini', score, True

Modelling: LightGBM¶

import joblib
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score,precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

# Baseline classifier: only the random seed is passed explicitly.
clf_lgb = lgb.LGBMClassifier(random_state=SEED)
clf_lgb

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=100, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# Train the baseline model and take its hard 0/1 predictions on the test split.
clf_lgb.fit(Xtr, ytr)
ypreds = clf_lgb.predict(Xtx)

# Collect the metrics for the results table.
# NOTE(review): AUC and Gini are computed here from hard labels, not from
# predicted probabilities, so they are not comparable with a
# probability-based AUC. Confirm this is intended.
average = 'binary'
model_name = 'lgb'
desc = 'default'
auc_from_labels = roc_auc_score(ytx, ypreds)
row_eval = [
    model_name,
    desc,
    accuracy_score(ytx, ypreds),
    precision_score(ytx, ypreds, average=average),
    recall_score(ytx, ypreds, average=average),
    f1_score(ytx, ypreds, average=average),
    auc_from_labels,
    2 * auc_from_labels - 1,
]

# Append the row, drop exact repeats left by re-running the cell, and show it.
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
display(df_eval)

# Changing the decision threshold.
# Scan cut-offs 0.000 ... 0.999 and keep the one whose hard predictions give
# the highest AUC. np.where replaces the per-element Python comprehension; it
# applies the same rule (0 when prob <= threshold, else 1) in one array step.
# NOTE(review): the threshold is selected on the same test labels that are
# used for the reported metrics, so this row is optimistic.
yprobs = clf_lgb.predict_proba(Xtx)[:,1]
thresholds = np.arange(0, 1, 0.001)
auc_scores = [roc_auc_score(ytx, np.where(yprobs <= thr, 0, 1))
              for thr in thresholds]

idx = np.argmax(auc_scores)
best_thr = thresholds[idx]
ypreds = np.where(yprobs <= best_thr, 0, 1)

# Model evaluation with the tuned threshold.
average = 'binary'
model_name = 'lgb'
desc = 'default, threshold change'
row_eval = [model_name,desc, 
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            roc_auc_score(ytx, ypreds),
            2 * roc_auc_score(ytx, ypreds) - 1,
            ]

# Append the row, drop exact repeats left by re-running the cell, and show it.
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
display(df_eval)

Lightgbm with tuned parameters¶

# Hand-picked hyper-parameter values.
learning_rate = 0.1
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000

# Keyword arguments forwarded to the LightGBM classifier.
params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    max_bin=256,
    feature_fraction=feature_fraction,
    verbosity=0,
    drop_rate=0.1,
    is_unbalance=False,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    subsample=0.9,
)

# Stratified K-fold cross-validation WITH up-sampling of the positive class.
# Each fold trains a model, tunes a decision threshold on its validation part
# and adds its test-set probabilities to a running average.
time_start = time.time()

K = 5
skf = StratifiedKFold(n_splits=K, random_state=SEED, shuffle=True)

# Out-of-fold containers share the index of the training labels.
# Probabilities are floats, so that container starts as float zeros instead of
# relying on pandas to upcast an integer Series on assignment.
ser_vdprobs = ser_ytrain * 0.0
ser_vdpreds = ser_ytrain * 0
txprobs = np.zeros(len(ser_ytest), dtype=float)  # summed over folds, averaged later
increase = True
best_thr_lst = []

# The candidate cut-offs are the same for every fold, so build them once.
thresholds = np.arange(0, 1, 0.001)

for i, (idx_tr, idx_vd) in enumerate(skf.split(Xtrain, ytrain)):
    print( "\nFold ", i)

    # Data for this fold.
    df_Xtr = df_Xtrain.iloc[idx_tr,:].copy()
    ser_ytr = ser_ytrain.iloc[idx_tr].copy()
    df_Xvd = df_Xtrain.iloc[idx_vd,:].copy()
    ser_yvd = ser_ytrain.iloc[idx_vd].copy()

    clf = lgb.LGBMClassifier(random_state=SEED,**params)

    # Up-sample inside the fold, after the split, so a duplicated positive
    # can never appear in both the training and the validation part.
    # The validation part is left untouched to monitor overfitting.
    if increase:
        # Boolean mask of the positive examples.
        pos = pd.Series(ser_ytr == 1)
        # Append one extra copy of every positive example.
        df_Xtr = pd.concat([df_Xtr, df_Xtr.loc[pos]], axis=0)
        ser_ytr = pd.concat([ser_ytr, ser_ytr.loc[pos]], axis=0)
        # Shuffle features and labels with the same seeded permutation.
        idx = np.arange(len(df_Xtr))
        np.random.seed(SEED)
        np.random.shuffle(idx)
        df_Xtr = df_Xtr.iloc[idx]
        ser_ytr = ser_ytr.iloc[idx]

    # No early stopping is requested; eval_set is only evaluated with 'auc'.
    fit_model = clf.fit(df_Xtr, ser_ytr, 
            eval_set=[(df_Xtr, ser_ytr), (df_Xvd, ser_yvd)],
            eval_metric='auc', # gini_lgb gives error.
            early_stopping_rounds=None,
            verbose=False)

    # Validation probabilities for this fold.
    vdprobs = fit_model.predict_proba(df_Xvd)[:,1]
    print( "  Gini (from probs) : ", eval_gini(ser_yvd, vdprobs) )

    # Pick the threshold whose hard predictions give the best AUC on the
    # validation part (AUC is used instead of Gini for the search).
    # np.where applies "0 if prob <= thr else 1" to the whole array at once.
    auc_scores = [roc_auc_score(ser_yvd, np.where(vdprobs <= thr, 0, 1))
                  for thr in thresholds]
    idx = np.argmax(auc_scores)
    best_thr = thresholds[idx]
    best_auc = auc_scores[idx]
    best_thr_lst.append(best_thr)
    vdpreds = np.where(vdprobs <= best_thr, 0, 1)

    print(f'  Best threshold    : {best_thr:.3f}')
    print(f'  Best AUC          : {best_auc:.5f}')
    print( "  Gini (from preds) : ", eval_gini(ser_yvd, vdpreds) )

    # Store the out-of-fold results at the validation positions.
    ser_vdprobs.iloc[idx_vd] = vdprobs
    ser_vdpreds.iloc[idx_vd] = vdpreds

    # Accumulate test probabilities. The test frame is only read here, so it
    # is passed directly instead of being copied for every fold.
    txprobs += fit_model.predict_proba(df_Xtest)[:,1]

    # Free the per-fold frames.
    del df_Xtr, ser_ytr, df_Xvd, ser_yvd

    # Elapsed time since the start of the cell.
    time_taken = time.time() - time_start
    h,m = divmod(time_taken,60*60)
    print('  Time taken        : {:.0f} hr '\
        '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

txprobs /= K  # average of the K fold models
best_thr = np.mean(best_thr_lst)  # one threshold for the test set
txpreds = np.where(txprobs <= best_thr, 0, 1)

print()
print(f'Best thresholds for fold                : {best_thr_lst}')
print("Gini for full training set (from probs)  : ",
      eval_gini(ser_ytrain, ser_vdprobs))
print("Gini for full training set   (from preds): ",
      eval_gini(ser_ytrain, ser_vdpreds))

# Predictions used for the results table.
ypreds = txpreds

# Model evaluation on the held-out test split.
average = 'binary'
row_eval = ['lgb','skf cv, upsample, threshold change', 
            accuracy_score(ytest, ypreds),
            precision_score(ytest, ypreds, average=average),
            recall_score(ytest, ypreds, average=average),
            f1_score(ytest, ypreds, average=average),
            roc_auc_score(ytest, ypreds),
            2 * roc_auc_score(ytest, ypreds) -1,
           ]

df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
display(df_eval)

Fold  0
  Gini (from probs) :  0.26891993222441213
  Best threshold    : 0.068
  Best AUC          : 0.59816
  Gini (from preds) :  0.197157284285006
  Time taken        : 0 hr 1 min 2 secs

Fold  1
  Gini (from probs) :  0.28118799191507626
  Best threshold    : 0.072
  Best AUC          : 0.59746
  Gini (from preds) :  0.19650610093662768
  Time taken        : 0 hr 2 min 3 secs

Fold  2
  Gini (from probs) :  0.2772138105963996
  Best threshold    : 0.073
  Best AUC          : 0.59812
  Gini (from preds) :  0.19726425130980363
  Time taken        : 0 hr 3 min 5 secs

Fold  3
  Gini (from probs) :  0.2783405906236863
  Best threshold    : 0.075
  Best AUC          : 0.60125
  Gini (from preds) :  0.19187021224646972
  Time taken        : 0 hr 4 min 5 secs

Fold  4
  Gini (from probs) :  0.27643897160813047
  Best threshold    : 0.069
  Best AUC          : 0.60025
  Gini (from preds) :  0.1878881415516125
  Time taken        : 0 hr 5 min 9 secs

Best thresholds for fold                : [0.068, 0.07200000000000001, 0.073, 0.075, 0.069]
Gini for full training set (from probs)  :  0.2763446163276101
Gini for full training set   (from preds):  0.1934090454052645

# Stratified K-fold cross-validation WITHOUT up-sampling (increase = False).
# Each fold trains a model, tunes a decision threshold on its validation part
# and adds its test-set probabilities to a running average.
time_start = time.time()

K = 5
skf = StratifiedKFold(n_splits=K, random_state=SEED, shuffle=True)

# Out-of-fold containers share the index of the training labels.
# Probabilities are floats, so that container starts as float zeros instead of
# relying on pandas to upcast an integer Series on assignment.
ser_vdprobs = ser_ytrain * 0.0
ser_vdpreds = ser_ytrain * 0
txprobs = np.zeros(len(ser_ytest), dtype=float)  # summed over folds, averaged later
increase = False
best_thr_lst = []

# The candidate cut-offs are the same for every fold, so build them once.
thresholds = np.arange(0, 1, 0.001)

for i, (idx_tr, idx_vd) in enumerate(skf.split(Xtrain, ytrain)):
    print( "\nFold ", i)

    # Data for this fold.
    df_Xtr = df_Xtrain.iloc[idx_tr,:].copy()
    ser_ytr = ser_ytrain.iloc[idx_tr].copy()
    df_Xvd = df_Xtrain.iloc[idx_vd,:].copy()
    ser_yvd = ser_ytrain.iloc[idx_vd].copy()

    clf = lgb.LGBMClassifier(random_state=SEED,**params)

    # Optional up-sampling (disabled in this cell). When enabled it runs
    # inside the fold, after the split, so a duplicated positive can never
    # appear in both the training and the validation part.
    if increase:
        # Boolean mask of the positive examples.
        pos = pd.Series(ser_ytr == 1)
        # Append one extra copy of every positive example.
        df_Xtr = pd.concat([df_Xtr, df_Xtr.loc[pos]], axis=0)
        ser_ytr = pd.concat([ser_ytr, ser_ytr.loc[pos]], axis=0)
        # Shuffle features and labels with the same seeded permutation.
        idx = np.arange(len(df_Xtr))
        np.random.seed(SEED)
        np.random.shuffle(idx)
        df_Xtr = df_Xtr.iloc[idx]
        ser_ytr = ser_ytr.iloc[idx]

    # No early stopping is requested; eval_set is only evaluated with 'auc'.
    fit_model = clf.fit(df_Xtr, ser_ytr, 
            eval_set=[(df_Xtr, ser_ytr), (df_Xvd, ser_yvd)],
            eval_metric='auc', # gini_lgb gives error.
            early_stopping_rounds=None,
            verbose=False)

    # Validation probabilities for this fold.
    vdprobs = fit_model.predict_proba(df_Xvd)[:,1]
    print( "  Gini (from probs) : ", eval_gini(ser_yvd, vdprobs) )

    # Pick the threshold whose hard predictions give the best AUC on the
    # validation part (AUC is used instead of Gini for the search).
    # np.where applies "0 if prob <= thr else 1" to the whole array at once.
    auc_scores = [roc_auc_score(ser_yvd, np.where(vdprobs <= thr, 0, 1))
                  for thr in thresholds]
    idx = np.argmax(auc_scores)
    best_thr = thresholds[idx]
    best_auc = auc_scores[idx]
    best_thr_lst.append(best_thr)
    vdpreds = np.where(vdprobs <= best_thr, 0, 1)

    print(f'  Best threshold    : {best_thr:.3f}')
    print(f'  Best AUC          : {best_auc:.5f}')
    print( "  Gini (from preds) : ", eval_gini(ser_yvd, vdpreds) )

    # Store the out-of-fold results at the validation positions.
    ser_vdprobs.iloc[idx_vd] = vdprobs
    ser_vdpreds.iloc[idx_vd] = vdpreds

    # Accumulate test probabilities. The test frame is only read here, so it
    # is passed directly instead of being copied for every fold.
    txprobs += fit_model.predict_proba(df_Xtest)[:,1]

    # Free the per-fold frames.
    del df_Xtr, ser_ytr, df_Xvd, ser_yvd

    # Elapsed time since the start of the cell.
    time_taken = time.time() - time_start
    h,m = divmod(time_taken,60*60)
    print('  Time taken        : {:.0f} hr '\
        '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

txprobs /= K  # average of the K fold models
best_thr = np.mean(best_thr_lst)  # one threshold for the test set
txpreds = np.where(txprobs <= best_thr, 0, 1)

print()
print(f'Best thresholds for fold                : {best_thr_lst}')
print("Gini for full training set (from probs)  : ",
      eval_gini(ser_ytrain, ser_vdprobs))
print("Gini for full training set   (from preds): ",
      eval_gini(ser_ytrain, ser_vdpreds))

# Predictions used for the results table.
ypreds = txpreds

# Model evaluation on the held-out test split.
average = 'binary'
row_eval = ['lgb','skf cv, NO upsample, threshold change', 
            accuracy_score(ytest, ypreds),
            precision_score(ytest, ypreds, average=average),
            recall_score(ytest, ypreds, average=average),
            f1_score(ytest, ypreds, average=average),
            roc_auc_score(ytest, ypreds),
            2 * roc_auc_score(ytest, ypreds) -1,
           ]

df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
display(df_eval)

Fold  0
  Gini (from probs) :  0.26685514526443777
  Best threshold    : 0.037
  Best AUC          : 0.59549
  Gini (from preds) :  0.18615586863749845
  Time taken        : 0 hr 1 min 0 secs

Fold  1
  Gini (from probs) :  0.2824486832291745
  Best threshold    : 0.035
  Best AUC          : 0.59934
  Gini (from preds) :  0.20650834975584842
  Time taken        : 0 hr 2 min 1 secs

Fold  2
  Gini (from probs) :  0.27314526470779876
  Best threshold    : 0.033
  Best AUC          : 0.59634
  Gini (from preds) :  0.18780766932092763
  Time taken        : 0 hr 3 min 0 secs

Fold  3
  Gini (from probs) :  0.27811358317546275
  Best threshold    : 0.036
  Best AUC          : 0.60136
  Gini (from preds) :  0.19769684768429097
  Time taken        : 0 hr 3 min 59 secs

Fold  4
  Gini (from probs) :  0.2785345183336665
  Best threshold    : 0.034
  Best AUC          : 0.60164
  Gini (from preds) :  0.19344096801069643
  Time taken        : 0 hr 4 min 58 secs

Best thresholds for fold                : [0.037, 0.035, 0.033, 0.036000000000000004, 0.034]
Gini for full training set (from probs)  :  0.2757617667779544
Gini for full training set   (from preds):  0.19242741588993084

	ps_ind_01	ps_ind_03	ps_ind_06_bin	ps_ind_08_bin	ps_ind_09_bin	ps_ind_15	ps_ind_16_bin	ps_ind_18_bin	ps_reg_01	ps_reg_02	ps_reg_03	ps_car_11	ps_car_12	ps_car_13	ps_car_14	ps_car_15	ps_calc_01	ps_calc_02	ps_calc_03	ps_calc_04	ps_calc_05	ps_calc_06	ps_calc_07	ps_calc_08	ps_calc_09	ps_calc_10	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_20_bin	missing	ps_ind_02_cat_count	ps_ind_04_cat_count	ps_ind_05_cat_count	ps_car_01_cat_count	ps_car_02_cat_count	ps_car_03_cat_count	ps_car_04_cat_count	ps_car_05_cat_count	ps_car_06_cat_count	ps_car_07_cat_count	ps_car_08_cat_count	ps_car_09_cat_count	ps_car_10_cat_count	ps_car_11_cat_count	ind_concat_count	ps_ind_02_cat_1	ps_ind_04_cat_0	ps_ind_04_cat_1	ps_ind_05_cat_0	ps_ind_05_cat_2	ps_ind_05_cat_4	ps_car_01_cat_7	ps_car_01_cat_11	ps_car_02_cat_0	ps_car_02_cat_1	ps_car_06_cat_1	ps_car_06_cat_11	ps_car_06_cat_14	ps_car_07_cat_1	ps_car_08_cat_1	ps_car_09_cat_2	ps_car_10_cat_1	ps_car_11_cat_11	ps_car_11_cat_30	ps_car_11_cat_51	ps_car_11_cat_82	ps_car_11_cat_104
422636	0	6	1	0	0	12	1	0	0.9	0.2	0.422788	3	0.316228	0.704575	0.368511	3.316625	0.6	0.6	0.9	4	1	6	4	11	2	5	8	4	7	5	1	1	1	0	2.0	431859	346965	4184	207573	493990	411231	496581	266551	59253	553148	495264	353482	590179	10470	8	1	1	0	0	1	0	0	1	0	1	0	0	1	1	1	1	1	0	0	0	1	0
374646	1	5	0	1	0	3	0	1	0.6	0.5	0.844837	2	0.316228	0.709149	0.368782	3.605551	0.2	0.2	0.2	2	3	7	1	10	3	14	8	2	6	7	0	1	0	0	2.0	431859	346965	528009	179247	493990	411231	496581	266551	131527	553148	495264	353482	590179	6716	25	1	1	0	1	0	0	1	0	0	1	0	1	0	1	1	1	1	1	0	0	0	0
380900	5	4	0	1	0	7	1	0	0.8	0.3	1.114114	2	0.374166	0.837845	0.401746	3.605551	0.2	0.3	0.4	2	3	8	4	11	2	9	10	2	4	13	1	0	0	0	2.0	431859	346965	528009	207573	493990	411231	496581	266551	118386	553148	495264	353482	590179	6008	84	1	1	0	1	0	0	0	1	0	1	1	0	0	1	1	1	1	0	0	1	0	0
318036	5	8	0	0	1	6	1	0	0.4	0.6	0.841130	3	0.447214	0.817862	0.424617	3.000000	0.5	0.6	0.3	1	4	8	7	7	5	15	4	2	1	10	0	0	0	0	2.0	431859	248164	18344	207573	493990	411231	496581	266551	131527	553148	495264	353482	590179	85083	1	1	0	1	0	0	1	0	1	0	1	0	1	0	1	1	1	1	0	0	0	0	1
7042	0	3	1	0	0	0	1	0	0.6	0.4	0.809707	3	0.446990	0.859379	0.451110	2.828427	0.7	0.7	0.8	3	3	8	3	9	3	4	8	2	2	7	0	1	0	1	2.0	431859	346965	528009	207573	101217	411231	496581	266551	131527	553148	495264	353482	590179	2322	152	1	1	0	1	0	0	0	1	1	0	0	1	0	1	1	1	1	0	1	0	0	0

	Model	Description	Accuracy	Precision	Recall	F1	AUC	NormlaizedGini
0	lgb	default	0.963559	0.666667	0.000461	0.000921	0.500226	0.000452
1	lgb	default, threshold change	0.600556	0.053908	0.601752	0.098952	0.601131	0.202262

	Model	Description	Accuracy	Precision	Recall	F1	AUC	NormlaizedGini
0	lgb	default	0.963559	0.666667	0.000461	0.000921	0.500226	0.000452
1	lgb	default, threshold change	0.600556	0.053908	0.601752	0.098952	0.601131	0.202262
2	lgb	skf cv, upsample, threshold change	0.636804	0.056224	0.567873	0.102317	0.603642	0.207284

	Model	Description	Accuracy	Precision	Recall	F1	AUC	NormlaizedGini
0	lgb	default	0.963559	0.666667	0.000461	0.000921	0.500226	0.000452
1	lgb	default, threshold change	0.600556	0.053908	0.601752	0.098952	0.601131	0.202262
2	lgb	skf cv, upsample, threshold change	0.636804	0.056224	0.567873	0.102317	0.603642	0.207284
3	lgb	skf cv, NO upsample, threshold change	0.589837	0.053728	0.617193	0.098850	0.602998	0.205995

	id	ps_ind_01	ps_ind_02_cat	ps_ind_03	ps_ind_04_cat	ps_ind_06_bin	ps_ind_07_bin	ps_ind_08_bin	ps_ind_15	ps_ind_16_bin	ps_ind_17_bin	ps_ind_18_bin	ps_reg_01	ps_reg_02	ps_reg_03	ps_car_01_cat	ps_car_02_cat	ps_car_03_cat	ps_car_05_cat	ps_car_06_cat	ps_car_07_cat	ps_car_08_cat	ps_car_09_cat	ps_car_10_cat	ps_car_11_cat	ps_car_11	ps_car_12	ps_car_13	ps_car_14	ps_car_15	ps_calc_01	ps_calc_02	ps_calc_03	ps_calc_04	ps_calc_05	ps_calc_06	ps_calc_07	ps_calc_08	ps_calc_09	ps_calc_10	ps_calc_11	ps_calc_12	ps_calc_13	ps_calc_14	ps_calc_16_bin	ps_calc_17_bin	ps_calc_18_bin	ps_calc_19_bin	ps_calc_20_bin
0	7	2	2	5	1	0	1	0	11	0	1	0	0.7	0.2	0.718070	10	1	-1	1	4	1	0	0	1	12	2	0.400000	0.883679	0.370810	3.605551	0.6	0.5	0.2	3	1	10	1	10	1	5	9	1	5	8	1	1	0	0	1
1	9	1	1	7	0	0	0	1	3	0	0	1	0.8	0.4	0.766078	11	1	-1	-1	11	1	1	2	1	19	3	0.316228	0.618817	0.388716	2.449490	0.3	0.1	0.3	2	1	9	5	8	1	7	3	1	1	9	1	1	0	1	0
2	13	5	4	9	1	0	0	1	12	1	0	0	0.0	0.0	-1.000000	7	1	-1	-1	14	1	1	2	1	60	1	0.316228	0.641586	0.347275	3.316625	0.5	0.7	0.1	2	2	9	1	8	2	7	4	2	7	7	1	1	0	1	0
3	16	0	1	2	0	1	0	0	8	1	0	0	0.9	0.2	0.580948	7	1	0	1	11	1	1	3	1	104	1	0.374166	0.542949	0.294958	2.000000	0.6	0.9	0.1	2	4	7	1	8	4	2	2	2	4	9	0	0	0	0	0
4	17	0	2	0	1	1	0	0	9	1	0	0	0.7	0.6	0.840759	11	1	-1	-1	14	1	1	2	1	82	3	0.316070	0.565832	0.365103	2.000000	0.4	0.6	0.0	2	2	6	3	10	2	12	3	1	1	3	0	0	1	1	0