Table of Contents

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()

from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot') 

# random state
SEED=100
np.random.seed(SEED)

[(x.__name__,x.__version__) for x in [np,pd,sns]]
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
Out[1]:
[('numpy', '1.18.1'), ('pandas', '1.0.1'), ('seaborn', '0.9.0')]
In [2]:
pd.options.display.max_columns = None
In [3]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
In [4]:
import datetime
import functools
In [5]:
from sklearn.model_selection import train_test_split
In [6]:
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)

[(x.__name__,x.__version__) for x in [plotly]]
Out[6]:
[('plotly', '4.5.2')]

Load the data

In [7]:
!ls ../data/raw
LCDataDictionary.xlsx   loan_data_2007_2014.csv loan_data_2015.csv
In [8]:
!du -sh ../data/raw/loan_data_2007_2014.csv
229M	../data/raw/loan_data_2007_2014.csv
In [222]:
dat_raw = '../data/raw/'
dat_pro = '../data/processed/'
In [9]:
df = pd.read_csv(dat_raw + 'loan_data_2007_2014.csv',low_memory=False)

print(df.shape)
df.head(2).append(df.tail(2))
(466285, 75)
Out[9]:
Unnamed: 0 id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type annual_inc_joint dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_il_6m open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m
0 0 1077501 1296599 5000 5000 4975.0 36 months 10.65 162.87 B B2 NaN 10+ years RENT 24000.0 Verified Dec-11 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... Borrower added on 12/22/11 > I need to upgra... credit_card Computer 860xx AZ 27.65 0.0 Jan-85 1.0 NaN NaN 3.0 0.0 13648 83.7 9.0 f 0.00 0.00 5861.071414 5831.78 5000.00 861.07 0.0 0.00 0.00 Jan-15 171.62 NaN Jan-16 0.0 NaN 1 INDIVIDUAL NaN NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1 1077430 1314167 2500 2500 2500.0 60 months 15.27 59.83 C C4 Ryder < 1 year RENT 30000.0 Source Verified Dec-11 Charged Off n https://www.lendingclub.com/browse/loanDetail.... Borrower added on 12/22/11 > I plan to use t... car bike 309xx GA 1.00 0.0 Apr-99 5.0 NaN NaN 3.0 0.0 1687 9.4 4.0 f 0.00 0.00 1008.710000 1008.71 456.46 435.17 0.0 117.08 1.11 Apr-13 119.66 NaN Sep-13 0.0 NaN 1 INDIVIDUAL NaN NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
466283 466283 9604874 11457002 2000 2000 2000.0 36 months 7.90 62.59 A A4 Server Engineer Lead 3 years OWN 83000.0 Verified Jan-14 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... NaN credit_card Credit card refinancing 913xx CA 5.39 3.0 Feb-03 1.0 13.0 NaN 21.0 0.0 11404 21.5 27.0 w 0.00 0.00 2126.579838 2126.58 2000.00 126.58 0.0 0.00 0.00 Dec-14 1500.68 NaN Apr-15 0.0 NaN 1 INDIVIDUAL NaN NaN NaN 0.0 0.0 591610.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 53100.0 NaN NaN NaN
466284 466284 9199665 11061576 10000 10000 9975.0 36 months 19.20 367.58 D D3 NaN 10+ years MORTGAGE 46000.0 Verified Jan-14 Current n https://www.lendingclub.com/browse/loanDetail.... Borrower added on 12/04/13 > I will like a l... other Other 950xx CA 22.78 1.0 Feb-00 0.0 9.0 NaN 6.0 0.0 11325 70.8 22.0 f 3984.38 3974.41 8821.620000 8799.57 6015.62 2806.00 0.0 0.00 0.00 Jan-16 367.58 Feb-16 Jan-16 0.0 NaN 1 INDIVIDUAL NaN NaN NaN 0.0 0.0 57477.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16000.0 NaN NaN NaN
In [10]:
df_copy = df.copy(deep=True)
In [11]:
# df = df_copy.copy()

Memory Reduction

In [12]:
col_floats = df.select_dtypes(np.float64).columns
for col in col_floats:
    df[col] = df[col].astype(np.float32)
In [13]:
col_ints = df.select_dtypes(np.int64).columns
for col in col_ints:
    df[col] = df[col].astype(np.int32)

data type: datetetime

In [14]:
df.select_dtypes('object').head(1).T
Out[14]:
0
term 36 months
grade B
sub_grade B2
emp_title NaN
emp_length 10+ years
home_ownership RENT
verification_status Verified
issue_d Dec-11
loan_status Fully Paid
pymnt_plan n
url https://www.lendingclub.com/browse/loanDetail....
desc Borrower added on 12/22/11 > I need to upgra...
purpose credit_card
title Computer
zip_code 860xx
addr_state AZ
earliest_cr_line Jan-85
initial_list_status f
last_pymnt_d Jan-15
next_pymnt_d NaN
last_credit_pull_d Jan-16
application_type INDIVIDUAL
In [15]:
cols_date = ['issue_d',
             'earliest_cr_line',
             'last_pymnt_d',
             'last_credit_pull_d']

for col in cols_date:
    df[col+'_date'] = pd.to_datetime(df[col],
                                     format='%b-%y',
                                     errors='coerce')
In [16]:
df[cols_date].dtypes
Out[16]:
issue_d               object
earliest_cr_line      object
last_pymnt_d          object
last_credit_pull_d    object
dtype: object
In [17]:
df[cols_date].head(2)
Out[17]:
issue_d earliest_cr_line last_pymnt_d last_credit_pull_d
0 Dec-11 Jan-85 Jan-15 Jan-16
1 Dec-11 Apr-99 Apr-13 Sep-13
In [18]:
today = datetime.datetime.today()
today
Out[18]:
datetime.datetime(2020, 3, 31, 16, 6, 21, 636929)
In [19]:
today = datetime.datetime(2020, 3, 30)
today
Out[19]:
datetime.datetime(2020, 3, 30, 0, 0)
In [20]:
df['earliest_cr_line_date'].head(2)
Out[20]:
0   1985-01-01
1   1999-04-01
Name: earliest_cr_line_date, dtype: datetime64[ns]
In [21]:
df['earliest_cr_line_date'].isnull().sum()
Out[21]:
29
In [22]:
for col in cols_date:
    df['mths_since_'+col] = \
        round((today - df[col+'_date']) / 
              np.timedelta64(1,'M'))
In [23]:
cols_months_since = ['mths_since_' + i for i in cols_date]
df[cols_months_since].describe().round(2).T
Out[23]:
count mean std min 25% 50% 75% max
mths_since_issue_d 466285.0 79.26 14.34 64.0 69.0 75.0 85.0 154.0
mths_since_earliest_cr_line 466256.0 267.48 93.97 -584.0 211.0 253.0 313.0 615.0
mths_since_last_pymnt_d 465909.0 59.29 12.81 51.0 51.0 52.0 63.0 148.0
mths_since_last_credit_pull_d 466243.0 55.05 9.64 51.0 51.0 51.0 53.0 155.0

data type: object

In [24]:
df.select_dtypes('object').nunique().sort_values()
Out[24]:
application_type            1
initial_list_status         2
pymnt_plan                  2
term                        2
verification_status         3
home_ownership              6
grade                       7
loan_status                 9
emp_length                 11
purpose                    14
sub_grade                  35
addr_state                 50
issue_d                    91
last_pymnt_d               98
next_pymnt_d              100
last_credit_pull_d        103
earliest_cr_line          664
zip_code                  888
title                   63099
desc                   124436
emp_title              205475
url                    466285
dtype: int64
In [25]:
cols_cat_small = df.select_dtypes('object').nunique()[lambda x: x<12].index
In [26]:
for col in cols_cat_small:
    print(col)
    print(df[col].unique())
    print()
term
[' 36 months' ' 60 months']

grade
['B' 'C' 'A' 'E' 'F' 'D' 'G']

emp_length
['10+ years' '< 1 year' '1 year' '3 years' '8 years' '9 years' '4 years'
 '5 years' '6 years' '2 years' '7 years' nan]

home_ownership
['RENT' 'OWN' 'MORTGAGE' 'OTHER' 'NONE' 'ANY']

verification_status
['Verified' 'Source Verified' 'Not Verified']

loan_status
['Fully Paid' 'Charged Off' 'Current' 'Default' 'Late (31-120 days)'
 'In Grace Period' 'Late (16-30 days)'
 'Does not meet the credit policy. Status:Fully Paid'
 'Does not meet the credit policy. Status:Charged Off']

pymnt_plan
['n' 'y']

initial_list_status
['f' 'w']

application_type
['INDIVIDUAL']

Hidden Continuous features

In [27]:
df['term'].value_counts(dropna=False)
Out[27]:
 36 months    337953
 60 months    128332
Name: term, dtype: int64
In [28]:
df['term_int'] = df['term'].str.extract('(\d+)').astype(int)
df['term_int'].unique()
Out[28]:
array([36, 60])
In [29]:
df['emp_length'].value_counts(dropna=False)
Out[29]:
10+ years    150049
2 years       41373
3 years       36596
< 1 year      36265
5 years       30774
1 year        29622
4 years       28023
7 years       26180
6 years       26112
8 years       22395
NaN           21008
9 years       17888
Name: emp_length, dtype: int64
In [30]:
df['emp_length_int'] = df['emp_length']\
.str.replace('< 1years','0')\
.str.extract(r'(\d+)').astype(np.float32)

df['emp_length_int'].value_counts(dropna=False)
Out[30]:
10.0    150049
1.0      65887
2.0      41373
3.0      36596
5.0      30774
4.0      28023
7.0      26180
6.0      26112
8.0      22395
NaN      21008
9.0      17888
Name: emp_length_int, dtype: int64

Create New Columns

In [31]:
df['loan_income_ratio']= df['loan_amnt']/df['annual_inc']

df.iloc[:2,-2:]
Out[31]:
emp_length_int loan_income_ratio
0 10.0 0.208333
1 1.0 0.083333
In [32]:
df['loan_status'].value_counts()
Out[32]:
Current                                                224226
Fully Paid                                             184739
Charged Off                                             42475
Late (31-120 days)                                       6900
In Grace Period                                          3146
Does not meet the credit policy. Status:Fully Paid       1988
Late (16-30 days)                                        1218
Default                                                   832
Does not meet the credit policy. Status:Charged Off       761
Name: loan_status, dtype: int64
In [33]:
df['loan_status'].unique()
Out[33]:
array(['Fully Paid', 'Charged Off', 'Current', 'Default',
       'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)
In [34]:
cols_bad = ['Charged Off',
            'Default',
            'Does not meet the credit policy. Status:Charged Off',
            'Late (31-120 days)']

df['good_bad'] = np.where(df['loan_status'].isin(cols_bad),0,1)

df['good_bad'].value_counts()
Out[34]:
1    415317
0     50968
Name: good_bad, dtype: int64

Missing Values

In [35]:
pd.options.display.max_rows = 200
In [36]:
df.head(2)
Out[36]:
Unnamed: 0 id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type annual_inc_joint dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_il_6m open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m issue_d_date earliest_cr_line_date last_pymnt_d_date last_credit_pull_d_date mths_since_issue_d mths_since_earliest_cr_line mths_since_last_pymnt_d mths_since_last_credit_pull_d term_int emp_length_int loan_income_ratio good_bad
0 0 1077501 1296599 5000 5000 4975.0 36 months 10.65 162.869995 B B2 NaN 10+ years RENT 24000.0 Verified Dec-11 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... Borrower added on 12/22/11 > I need to upgra... credit_card Computer 860xx AZ 27.65 0.0 Jan-85 1.0 NaN NaN 3.0 0.0 13648 83.699997 9.0 f 0.0 0.0 5861.071289 5831.779785 5000.000000 861.070007 0.0 0.000000 0.00 Jan-15 171.619995 NaN Jan-16 0.0 NaN 1 INDIVIDUAL NaN NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2011-12-01 1985-01-01 2015-01-01 2016-01-01 100.0 423.0 63.0 51.0 36 10.0 0.208333 1
1 1 1077430 1314167 2500 2500 2500.0 60 months 15.27 59.830002 C C4 Ryder < 1 year RENT 30000.0 Source Verified Dec-11 Charged Off n https://www.lendingclub.com/browse/loanDetail.... Borrower added on 12/22/11 > I plan to use t... car bike 309xx GA 1.00 0.0 Apr-99 5.0 NaN NaN 3.0 0.0 1687 9.400000 4.0 f 0.0 0.0 1008.710022 1008.710022 456.459991 435.170013 0.0 117.080002 1.11 Apr-13 119.660004 NaN Sep-13 0.0 NaN 1 INDIVIDUAL NaN NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2011-12-01 1999-04-01 2013-04-01 2013-09-01 100.0 252.0 84.0 79.0 60 1.0 0.083333 0
In [37]:
missing_cols = df.isnull().sum()[lambda x: x>0].sort_values()\
.index.tolist()

df[missing_cols].head(2)
Out[37]:
loan_income_ratio annual_inc title earliest_cr_line_date total_acc pub_rec mths_since_earliest_cr_line open_acc earliest_cr_line delinq_2yrs inq_last_6mths acc_now_delinq mths_since_last_credit_pull_d last_credit_pull_d_date last_credit_pull_d collections_12_mths_ex_med revol_util mths_since_last_pymnt_d last_pymnt_d last_pymnt_d_date emp_length_int emp_length emp_title tot_coll_amt tot_cur_bal total_rev_hi_lim next_pymnt_d mths_since_last_delinq desc mths_since_last_major_derog mths_since_last_record annual_inc_joint inq_last_12m total_cu_tl inq_fi dti_joint max_bal_bc open_rv_24m open_rv_12m il_util total_bal_il mths_since_rcnt_il open_il_24m open_il_12m open_il_6m open_acc_6m verification_status_joint all_util
0 0.208333 24000.0 Computer 1985-01-01 9.0 0.0 423.0 3.0 Jan-85 0.0 1.0 0.0 51.0 2016-01-01 Jan-16 0.0 83.699997 63.0 Jan-15 2015-01-01 10.0 10+ years NaN NaN NaN NaN NaN NaN Borrower added on 12/22/11 > I need to upgra... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 0.083333 30000.0 bike 1999-04-01 4.0 0.0 252.0 3.0 Apr-99 0.0 5.0 0.0 79.0 2013-09-01 Sep-13 0.0 9.400000 84.0 Apr-13 2013-04-01 1.0 < 1 year Ryder NaN NaN NaN NaN NaN Borrower added on 12/22/11 > I plan to use t... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [38]:
# df[missing_cols].isnull().sum()
In [39]:
df.shape
Out[39]:
(466285, 87)
In [40]:
# drop columns if all of them are nans
df = df.dropna(how='all',axis=1)
df.isnull().sum()[lambda x: x>0]
Out[40]:
emp_title                         27588
emp_length                        21008
annual_inc                            4
desc                             340302
title                                20
delinq_2yrs                          29
earliest_cr_line                     29
inq_last_6mths                       29
mths_since_last_delinq           250351
mths_since_last_record           403647
open_acc                             29
pub_rec                              29
revol_util                          340
total_acc                            29
last_pymnt_d                        376
next_pymnt_d                     227214
last_credit_pull_d                   42
collections_12_mths_ex_med          145
mths_since_last_major_derog      367311
acc_now_delinq                       29
tot_coll_amt                      70276
tot_cur_bal                       70276
total_rev_hi_lim                  70276
earliest_cr_line_date                29
last_pymnt_d_date                   376
last_credit_pull_d_date              42
mths_since_earliest_cr_line          29
mths_since_last_pymnt_d             376
mths_since_last_credit_pull_d        42
emp_length_int                    21008
loan_income_ratio                     4
dtype: int64
In [41]:
missing_cols = df.isnull().sum()[lambda x: x>0].sort_values()\
.index.tolist()

df[missing_cols].head(2)
Out[41]:
loan_income_ratio annual_inc title acc_now_delinq total_acc pub_rec mths_since_earliest_cr_line open_acc earliest_cr_line delinq_2yrs inq_last_6mths earliest_cr_line_date mths_since_last_credit_pull_d last_credit_pull_d last_credit_pull_d_date collections_12_mths_ex_med revol_util mths_since_last_pymnt_d last_pymnt_d_date last_pymnt_d emp_length_int emp_length emp_title tot_cur_bal tot_coll_amt total_rev_hi_lim next_pymnt_d mths_since_last_delinq desc mths_since_last_major_derog mths_since_last_record
0 0.208333 24000.0 Computer 0.0 9.0 0.0 423.0 3.0 Jan-85 0.0 1.0 1985-01-01 51.0 Jan-16 2016-01-01 0.0 83.699997 63.0 2015-01-01 Jan-15 10.0 10+ years NaN NaN NaN NaN NaN NaN Borrower added on 12/22/11 > I need to upgra... NaN NaN
1 0.083333 30000.0 bike 0.0 4.0 0.0 252.0 3.0 Apr-99 0.0 5.0 1999-04-01 79.0 Sep-13 2013-09-01 0.0 9.400000 84.0 2013-04-01 Apr-13 1.0 < 1 year Ryder NaN NaN NaN NaN NaN Borrower added on 12/22/11 > I plan to use t... NaN NaN
In [42]:
columns = ['feature','n_missing','top_value',
            'proportion','nunique']
df_missing = pd.DataFrame(columns=columns)

for col in missing_cols:
    x = df[col].value_counts(normalize=True).round(2)
    unq = df[col].nunique()
    n_missing = df[col].isnull().sum()

    row = [col,n_missing, x.index[0],x.iloc[0],unq]
    df_missing.loc[len(df_missing)] = row

df_missing = df_missing.set_index('feature')
df_missing
Out[42]:
n_missing top_value proportion nunique
feature
loan_income_ratio 4 0.2 0.02 77022
annual_inc 4 60000 0.04 31901
title 20 Debt consolidation 0.35 63099
acc_now_delinq 29 0 1.00 6
total_acc 29 21 0.04 112
pub_rec 29 0 0.87 26
mths_since_earliest_cr_line 29 234 0.01 664
open_acc 29 9 0.09 62
earliest_cr_line 29 Oct-00 0.01 664
delinq_2yrs 29 0 0.82 24
inq_last_6mths 29 0 0.52 28
earliest_cr_line_date 29 2000-10-01 00:00:00 0.01 664
mths_since_last_credit_pull_d 42 51 0.70 103
last_credit_pull_d 42 Jan-16 0.70 103
last_credit_pull_d_date 42 2016-01-01 00:00:00 0.70 103
collections_12_mths_ex_med 145 0 0.99 9
revol_util 340 0 0.00 1269
mths_since_last_pymnt_d 376 51 0.39 98
last_pymnt_d_date 376 2016-01-01 00:00:00 0.39 98
last_pymnt_d 376 Jan-16 0.39 98
emp_length_int 21008 10 0.34 10
emp_length 21008 10+ years 0.34 11
emp_title 27588 Teacher 0.01 205475
tot_cur_bal 70276 0 0.00 220690
tot_coll_amt 70276 0 0.87 6321
total_rev_hi_lim 70276 15000 0.00 14612
next_pymnt_d 227214 Feb-16 0.87 100
mths_since_last_delinq 250351 9 0.02 145
desc 340302 0.00 124436
mths_since_last_major_derog 367311 45 0.02 162
mths_since_last_record 403647 0 0.02 123

fill nans with mean

In [43]:
df['annual_inc'].isna().sum()
Out[43]:
4
In [44]:
df['annual_inc'] = df['annual_inc'].fillna(df['annual_inc'].mean())

fill nans with another column

In [45]:
df['total_rev_hi_lim'] = df['total_rev_hi_lim'].fillna(df['funded_amnt'])

fill nans with zero

In [46]:
cols_zero = ['mths_since_earliest_cr_line','acc_now_delinq',
            'total_acc','pub_rec','open_acc','inq_last_6mths',
            'delinq_2yrs','emp_length_int']

for col in cols_zero:
    df[col] = df[col].fillna(0)

Dummy variables

In [47]:
df.select_dtypes('object').nunique().sort_values()
Out[47]:
application_type            1
initial_list_status         2
pymnt_plan                  2
term                        2
verification_status         3
home_ownership              6
grade                       7
loan_status                 9
emp_length                 11
purpose                    14
sub_grade                  35
addr_state                 50
issue_d                    91
last_pymnt_d               98
next_pymnt_d              100
last_credit_pull_d        103
earliest_cr_line          664
zip_code                  888
title                   63099
desc                   124436
emp_title              205475
url                    466285
dtype: int64
In [48]:
pd.get_dummies(df['grade'],prefix='grade',prefix_sep=':').head(2)
Out[48]:
grade:A grade:B grade:C grade:D grade:E grade:F grade:G
0 0 1 0 0 0 0 0
1 0 0 1 0 0 0 0
In [49]:
cols_dummy = ['grade','sub_grade','home_ownership',
             'verification_status','loan_status',
             'purpose','addr_state','initial_list_status']


df_dummies = [pd.get_dummies(df[col],
                             prefix=col,prefix_sep = ':')
             for col in cols_dummy]

df_dummies = pd.concat(df_dummies,axis=1)

print(df_dummies.shape)

df_dummies.head(2)
(466285, 126)
Out[49]:
grade:A grade:B grade:C grade:D grade:E grade:F grade:G sub_grade:A1 sub_grade:A2 sub_grade:A3 sub_grade:A4 sub_grade:A5 sub_grade:B1 sub_grade:B2 sub_grade:B3 sub_grade:B4 sub_grade:B5 sub_grade:C1 sub_grade:C2 sub_grade:C3 sub_grade:C4 sub_grade:C5 sub_grade:D1 sub_grade:D2 sub_grade:D3 sub_grade:D4 sub_grade:D5 sub_grade:E1 sub_grade:E2 sub_grade:E3 sub_grade:E4 sub_grade:E5 sub_grade:F1 sub_grade:F2 sub_grade:F3 sub_grade:F4 sub_grade:F5 sub_grade:G1 sub_grade:G2 sub_grade:G3 sub_grade:G4 sub_grade:G5 home_ownership:ANY home_ownership:MORTGAGE home_ownership:NONE home_ownership:OTHER home_ownership:OWN home_ownership:RENT verification_status:Not Verified verification_status:Source Verified verification_status:Verified loan_status:Charged Off loan_status:Current loan_status:Default loan_status:Does not meet the credit policy. Status:Charged Off loan_status:Does not meet the credit policy. Status:Fully Paid loan_status:Fully Paid loan_status:In Grace Period loan_status:Late (16-30 days) loan_status:Late (31-120 days) purpose:car purpose:credit_card purpose:debt_consolidation purpose:educational purpose:home_improvement purpose:house purpose:major_purchase purpose:medical purpose:moving purpose:other purpose:renewable_energy purpose:small_business purpose:vacation purpose:wedding addr_state:AK addr_state:AL addr_state:AR addr_state:AZ addr_state:CA addr_state:CO addr_state:CT addr_state:DC addr_state:DE addr_state:FL addr_state:GA addr_state:HI addr_state:IA addr_state:ID addr_state:IL addr_state:IN addr_state:KS addr_state:KY addr_state:LA addr_state:MA addr_state:MD addr_state:ME addr_state:MI addr_state:MN addr_state:MO addr_state:MS addr_state:MT addr_state:NC addr_state:NE addr_state:NH addr_state:NJ addr_state:NM addr_state:NV addr_state:NY addr_state:OH addr_state:OK addr_state:OR addr_state:PA addr_state:RI addr_state:SC addr_state:SD addr_state:TN addr_state:TX addr_state:UT addr_state:VA addr_state:VT addr_state:WA addr_state:WI addr_state:WV addr_state:WY initial_list_status:f initial_list_status:w
0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
In [50]:
df = pd.concat([df,df_dummies],axis=1)

print(df.shape)
df.head(2)
(466285, 196)
Out[50]:
Unnamed: 0 id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type acc_now_delinq tot_coll_amt tot_cur_bal total_rev_hi_lim issue_d_date earliest_cr_line_date last_pymnt_d_date last_credit_pull_d_date mths_since_issue_d mths_since_earliest_cr_line mths_since_last_pymnt_d mths_since_last_credit_pull_d term_int emp_length_int loan_income_ratio good_bad grade:A grade:B grade:C grade:D grade:E grade:F grade:G sub_grade:A1 sub_grade:A2 sub_grade:A3 sub_grade:A4 sub_grade:A5 sub_grade:B1 sub_grade:B2 sub_grade:B3 sub_grade:B4 sub_grade:B5 sub_grade:C1 sub_grade:C2 sub_grade:C3 sub_grade:C4 sub_grade:C5 sub_grade:D1 sub_grade:D2 sub_grade:D3 sub_grade:D4 sub_grade:D5 sub_grade:E1 sub_grade:E2 sub_grade:E3 sub_grade:E4 sub_grade:E5 sub_grade:F1 sub_grade:F2 sub_grade:F3 sub_grade:F4 sub_grade:F5 sub_grade:G1 sub_grade:G2 sub_grade:G3 sub_grade:G4 sub_grade:G5 home_ownership:ANY home_ownership:MORTGAGE home_ownership:NONE home_ownership:OTHER home_ownership:OWN home_ownership:RENT verification_status:Not Verified verification_status:Source Verified verification_status:Verified loan_status:Charged Off loan_status:Current loan_status:Default loan_status:Does not meet the credit policy. Status:Charged Off loan_status:Does not meet the credit policy. Status:Fully Paid loan_status:Fully Paid loan_status:In Grace Period loan_status:Late (16-30 days) loan_status:Late (31-120 days) purpose:car purpose:credit_card purpose:debt_consolidation purpose:educational purpose:home_improvement purpose:house purpose:major_purchase purpose:medical purpose:moving purpose:other purpose:renewable_energy purpose:small_business purpose:vacation purpose:wedding addr_state:AK addr_state:AL addr_state:AR addr_state:AZ addr_state:CA addr_state:CO addr_state:CT addr_state:DC addr_state:DE addr_state:FL addr_state:GA addr_state:HI addr_state:IA addr_state:ID addr_state:IL addr_state:IN addr_state:KS addr_state:KY addr_state:LA addr_state:MA addr_state:MD addr_state:ME addr_state:MI addr_state:MN addr_state:MO addr_state:MS addr_state:MT addr_state:NC addr_state:NE addr_state:NH addr_state:NJ addr_state:NM addr_state:NV addr_state:NY addr_state:OH addr_state:OK addr_state:OR addr_state:PA addr_state:RI addr_state:SC addr_state:SD addr_state:TN addr_state:TX addr_state:UT addr_state:VA addr_state:VT addr_state:WA addr_state:WI addr_state:WV addr_state:WY initial_list_status:f initial_list_status:w
0 0 1077501 1296599 5000 5000 4975.0 36 months 10.65 162.869995 B B2 NaN 10+ years RENT 24000.0 Verified Dec-11 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... Borrower added on 12/22/11 > I need to upgra... credit_card Computer 860xx AZ 27.65 0.0 Jan-85 1.0 NaN NaN 3.0 0.0 13648 83.699997 9.0 f 0.0 0.0 5861.071289 5831.779785 5000.000000 861.070007 0.0 0.000000 0.00 Jan-15 171.619995 NaN Jan-16 0.0 NaN 1 INDIVIDUAL 0.0 NaN NaN 5000.0 2011-12-01 1985-01-01 2015-01-01 2016-01-01 100.0 423.0 63.0 51.0 36 10.0 0.208333 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
1 1 1077430 1314167 2500 2500 2500.0 60 months 15.27 59.830002 C C4 Ryder < 1 year RENT 30000.0 Source Verified Dec-11 Charged Off n https://www.lendingclub.com/browse/loanDetail.... Borrower added on 12/22/11 > I plan to use t... car bike 309xx GA 1.00 0.0 Apr-99 5.0 NaN NaN 3.0 0.0 1687 9.400000 4.0 f 0.0 0.0 1008.710022 1008.710022 456.459991 435.170013 0.0 117.080002 1.11 Apr-13 119.660004 NaN Sep-13 0.0 NaN 1 INDIVIDUAL 0.0 NaN NaN 2500.0 2011-12-01 1999-04-01 2013-04-01 2013-09-01 100.0 252.0 84.0 79.0 60 1.0 0.083333 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
In [51]:
df.isnull().sum().sum()
Out[51]:
1779794

Split train test data

In [52]:
from sklearn.model_selection import train_test_split
In [53]:
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop('good_bad', axis = 1),
    df['good_bad'],
    train_size=0.8,
    random_state=SEED
    )
In [54]:
df_Xtrain.shape, df_Xtest.shape, ser_ytrain.shape, ser_ytest.shape
Out[54]:
((373028, 195), (93257, 195), (373028,), (93257,))

Prepare the data for train and test separately

In [224]:
X_prep = df_Xtrain
y_prep = ser_ytrain
In [225]:
# In second round, we will comment train part and
# uncomment this test part.
#
# Note: after doing test part, comment test part and
#       and keep train part uncommented, since we need to
#       create features based on train not test.
#


# X_prep = df_Xtest
# y_prep = ser_ytest

Weight of Evidence WoE and Information Value IV

In [226]:
def get_woe_iv(df1,
               ser_target,
               feature_cat,
               target_name='good_bad',
               sort='woe',
              return_all=False):
    """Calculate Weight of Evidence and Infomation Value.
    
    Parameters
    -----------
    df1: pandas.DataFrame
        Input dataframe having categorical feature.
    ser_target: pandas.Series
        Pandas Series of binary values. e.g y_prep
    feature_cat: str
        Name of categorical column
    target_name: str
        Name of target column.
    sort: str
        Sort the output dataframe by this column.
        Default is 'woe'. We use 'woe' for discrete
        columns.
        
        If the column values are continuous, then
        use the actual column name for sorting.
    return_all: boolean, optional
        Whether or not return all columns


    Returns:
    --------
    Returns dataframe with WOE, IV and other quantities.
    
    
    Example:
    --------
    df_woe = get_woe_iv(X_prep,'grade',y_prep)
    """
    df1 = pd.DataFrame({
        feature_cat: df1[feature_cat],
        target_name: ser_target
    })

    df1 = df1.groupby(feature_cat).agg(
        n_obs=(target_name,'count'),
        prop_good=(target_name,'mean'))

    df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()

    df1['n_good'] = df1['prop_good'] * df1['n_obs']
    df1['n_bad'] = (1-df1['prop_good']) * df1['n_obs']

    df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()
    df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()

    df1['woe'] = np.log(df1['prop_n_good']/df1['prop_n_bad'])

    df1['diff_prop_good'] = df1['prop_good'].diff().abs()
    df1['diff_woe'] = df1['woe'].diff().abs()
    df1['diff_n_obs'] = df1['n_obs'].diff(-1).abs()

    df1['iv'] = ((df1['prop_n_good']-df1['prop_n_bad']
                 ) * df1['woe']
                ).sum()

    # reorder columns
    if not return_all:
        df1 = df1[['n_obs', 'woe', 'diff_n_obs' ]]  

    if return_all:
        df1 = df1[['n_obs', 'woe', 'diff_n_obs',
                   'prop_n_obs','iv',
                   'n_good', 'n_bad',
                   'prop_n_good','prop_n_bad',
                   'prop_good',
                   'diff_prop_good', 'diff_woe' ]]

    # add one more column
    n_obs1 = df1['n_obs'].apply(lambda x: "{:,d}".format(x)).astype(str)
    

    df1['text_plotly'] = (
        'n_obs:  ' + n_obs1 + '<br>' +
        'WoE:    ' + df1['woe'].round(4).astype(str) + '<br>' 
                         )

    # sort values
    df1 = df1.sort_values(sort)
    df1 = df1.reset_index()

    return df1

df_woe = get_woe_iv(X_prep,y_prep,'grade')

df_woe.head().style\
.background_gradient(subset=['n_obs'],cmap='Blues')\
.background_gradient(subset=['woe'],cmap='Reds')\
.format({'n_obs':"{:,d}", 'diff_n_obs':"{:,.0f}"})
Out[226]:
grade n_obs woe diff_n_obs text_plotly
0 G 703 -1.258162 nan n_obs: 703
WoE: -1.2582
1 F 2,641 -0.922705 1,938 n_obs: 2,641
WoE: -0.9227
2 E 7,084 -0.692380 4,443 n_obs: 7,084
WoE: -0.6924
3 D 15,294 -0.396822 8,210 n_obs: 15,294
WoE: -0.3968
4 C 24,982 -0.054636 9,688 n_obs: 24,982
WoE: -0.0546
In [227]:
def plot_woe(df_woe,xrot=0,figsize=(18,12),fontsize=20,color='red'):
    x = df_woe.iloc[:,0].to_numpy().astype(str)
    y = df_woe['woe']
    fig,ax = plt.subplots(1,1,figsize=figsize)
    plt.plot(x,y,marker='o',linestyle='--',color='k')
    plt.xlabel(df_woe.columns[0],fontsize=fontsize)
    plt.ylabel('Weight of Evidence',fontsize=fontsize)
    plt.xticks(rotation=xrot,fontsize=fontsize,color=color)
    plt.yticks(fontsize=fontsize)
    
    plt.grid(color='k',linestyle='-.')
    
    for i,txt in enumerate(df_woe['n_obs']):
        txt = '   {:,.0f}'.format(txt)
        try: # sometimes I get error in y[i]
            plt.annotate(txt, (x[i],y[i]),
                     rotation=90,fontsize=18,color='blue')
        except:
            pass
In [228]:
def plot_woe_plotly(df_woe):
    col = df_woe.columns.tolist()[0]
    x = df_woe.iloc[:,0].astype(str)
    y = df_woe['woe']
    z = df_woe['text_plotly']

    data = go.Scatter(x=x,y=y,text=z,mode='lines+markers')
    fig = go.Figure(data=data)
    
    fig.update_layout(
        title=f"WoE Curve for **{col}**",
        xaxis_title=f"{col}",
        yaxis_title="Weight of Evidence",

    )

    # also display dataframe
    df_style = df_woe.style\
              .background_gradient(subset=['n_obs'],
                                   cmap='Blues')\
              .background_gradient(subset=['woe'],
                                   cmap='Reds')\
              .format({'n_obs':"{:,d}",
                       'diff_n_obs':"{:,.0f}"})

    display(df_style)
    fig.show()
In [229]:
def create_bins_dummies_cont(X_prep,col,bins):
    """Creates new dummy columns in-place in given dataframe.
    
    For this continuous bins,
    left numbers are inclusive.
    
    Example:
    ========
    bins = [0,9.961,12.025, 15.74, 20.281]
    col = 'int_rate'

    create_bins_dummies_cont(X_prep,col,bins)
    cols = [i for i in X_prep.columns if col in i]
    X_prep[cols].head(2)

    Outputs:
    ========
    int_rate:0_9.961
    int_rate:9.961_12.025
    int_rate:12.025_15.74
    int_rate:15.74_20.281
    int_rate:>=20.281
    """
    nbins = len(bins)
    ser = X_prep[col]

    for i in range(nbins):
        if i < (nbins-1):
            a,b = bins[i], bins[i+1]
            name = col + f':{a}_{b}'
            print(name)

            # condition
            cond = (ser>=a) & (ser<b)
            X_prep[name] = np.where(cond,1,0)

        elif i == (nbins-1):
            b = bins[i]
            name = col + ':>=' + str(b) 
            print(name)

            # condition
            cond = ser >= b
            X_prep[name] = np.where(cond,1,0) 
In [230]:
def create_bins_dummies_int(X_prep,col,bins):
    """Creates new dummy columns in-place in given dataframe.
    
    For this integer bins,
    both left and right numbers are included.
    
    Example:
    ---------
    col = 'mths_since_last_delinq'
    bins = [0,3,30,56]
    create_bins_dummies_integer(X_prep,col,bins) 

    Outputs:
    --------
    mths_since_last_delinq:0_3
    mths_since_last_delinq:4_30
    mths_since_last_delinq:31_56
    mths_since_last_delinq:>=57

    """
    nbins = len(bins)
    ser = X_prep[col]

    # assert last number of bin is integer
    assert isinstance(bins[-1],int)

    for i in range(nbins):
        if i == 0:
            a,b = bins[i], bins[i+1]
            name = col + f':{a}_{b}'
            print(name)

            # condition
            cond = (ser>=a) & (ser<=b)
            X_prep[name] = np.where(cond,1,0)

        elif i < (nbins-1):
            a,b = bins[i]+1, bins[i+1]
            name = col + f':{a}_{b}'
            print(name)

            # condition
            cond = (ser>=a) & (ser<=b)
            X_prep[name] = np.where(cond,1,0)

        elif i == (nbins-1):
            b = bins[i]+1
            name = col + ':>=' + str(b) 
            print(name)

            # condition
            cond = ser >= b
            X_prep[name] = np.where(cond,1,0)
In [231]:
def create_dummies_left_inclusive(X_prep,col,bins,labels):
    nlabels = len(labels)
    for i in range(nlabels):
        name = col + ':' + labels[i]
        ser = X_prep[col]
        if i == 0:
            cond = (ser <= bins[i])
            X_prep[name] = np.where(cond,1,0)
            print(name)

        elif (i>0) & (i<(nlabels-1)):
            cond = (ser >= bins[i-1]) & (ser < bins[i])
            X_prep[name] = np.where(cond,1,0)
            print(name)
    
        elif i == (nlabels-1):
            cond = (ser > bins[i-1])
            X_prep[name] = np.where(cond,1,0)
            print(name)

woe dummies: grade

In [232]:
col = 'grade'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  7
nulls =  0
Out[232]:
193817    B
174039    C
Name: grade, dtype: object
In [233]:
df[col].value_counts().sort_index()
Out[233]:
A     74867
B    136929
C    125293
D     76888
E     35757
F     13229
G      3322
Name: grade, dtype: int64
In [234]:
"""
as we expect, lower the grade, higher is the probabilty of default.

here, we will not group grades into grade:G_F and so on.
we already have one hot encoding of grade, so we are fine.

grade:A
grade:B
grade:C
grade:D
grade:E
grade:F

""";

woe dummies: home_ownership

In [235]:
col = 'home_ownership'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  5
nulls =  0
Out[235]:
193817    RENT
174039     OWN
Name: home_ownership, dtype: object
In [236]:
cols = [i for i in X_prep.columns if 'home_ownership' in i]
X_prep[cols].head(2)
Out[236]:
home_ownership home_ownership:ANY home_ownership:MORTGAGE home_ownership:NONE home_ownership:OTHER home_ownership:OWN home_ownership:RENT
193817 RENT 0 0 0 0 0 1
174039 OWN 0 0 0 0 1 0
In [237]:
df_woe = get_woe_iv(X_prep,y_prep,'home_ownership')
plot_woe_plotly(df_woe)
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/io/formats/style.py:1093: RuntimeWarning:

invalid value encountered in double_scalars

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/matplotlib/colors.py:527: RuntimeWarning:

invalid value encountered in less

home_ownership n_obs woe diff_n_obs text_plotly
0 OTHER 37 -0.241722 8,297 n_obs: 37
WoE: -0.2417
1 RENT 37,331 -0.158371 nan n_obs: 37,331
WoE: -0.1584
2 OWN 8,334 -0.010046 28,997 n_obs: 8,334
WoE: -0.01
3 MORTGAGE 47,544 0.141610 47,533 n_obs: 47,544
WoE: 0.1416
4 NONE 11 inf 26 n_obs: 11
WoE: inf
In [238]:
"""

home_ownership:RENT_OTHER_NONE_ANY
home_ownership:OWN
home_ownership:MORTGAGE

""";
In [239]:
X_prep['home_ownership:RENT_OTHER_NONE_ANY'] = \
    sum([X_prep['home_ownership:RENT'], 
         X_prep['home_ownership:OTHER'],                                      X_prep['home_ownership:NONE'],
         X_prep['home_ownership:ANY']])

woe dummies: addr_state

In [240]:
col = 'addr_state'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  49
nulls =  0
Out[240]:
193817    CA
174039    WA
Name: addr_state, dtype: object
In [241]:
cols = [i for i in X_prep.columns if 'addr_state' in i]
X_prep[cols].head(2)
Out[241]:
addr_state addr_state:AK addr_state:AL addr_state:AR addr_state:AZ addr_state:CA addr_state:CO addr_state:CT addr_state:DC addr_state:DE addr_state:FL addr_state:GA addr_state:HI addr_state:IA addr_state:ID addr_state:IL addr_state:IN addr_state:KS addr_state:KY addr_state:LA addr_state:MA addr_state:MD addr_state:ME addr_state:MI addr_state:MN addr_state:MO addr_state:MS addr_state:MT addr_state:NC addr_state:NE addr_state:NH addr_state:NJ addr_state:NM addr_state:NV addr_state:NY addr_state:OH addr_state:OK addr_state:OR addr_state:PA addr_state:RI addr_state:SC addr_state:SD addr_state:TN addr_state:TX addr_state:UT addr_state:VA addr_state:VT addr_state:WA addr_state:WI addr_state:WV addr_state:WY
193817 CA 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
174039 WA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
In [242]:
X_prep['addr_state'].unique()
Out[242]:
array(['CA', 'WA', 'CO', 'NY', 'PA', 'GA', 'TX', 'OK', 'KY', 'FL', 'MS',
       'NC', 'MD', 'MO', 'OH', 'LA', 'HI', 'SC', 'RI', 'MI', 'NJ', 'TN',
       'MA', 'IL', 'UT', 'WI', 'VA', 'NV', 'MT', 'AL', 'CT', 'AZ', 'IN',
       'DC', 'DE', 'MN', 'OR', 'NM', 'AR', 'KS', 'NH', 'WV', 'AK', 'VT',
       'SD', 'WY', 'ID', 'IA', 'NE'], dtype=object)
In [243]:
train_state_names = ['GA', 'PA', 'WI', 'CA', 'FL', 'CT', 'TX', 'MA', 'OR', 'HI', 'MO',
       'MS', 'NY', 'SC', 'WV', 'IL', 'NJ', 'AZ', 'MN', 'CO', 'TN', 'AL',
       'LA', 'OH', 'VA', 'NV', 'WA', 'NC', 'WY', 'OK', 'MI', 'DC', 'NM',
       'AK', 'MD', 'AR', 'KS', 'IN', 'UT', 'DE', 'NH', 'KY', 'RI', 'MT',
       'VT', 'SD', 'NE', 'ID', 'ME', 'IA']
In [244]:
X_prep['addr_state'].nunique()
Out[244]:
49
In [245]:
"""
We have 50 states and one DC, so, we should have 51 values.

one state is missing. and the missing state is ND.
""";
In [246]:
state_names51 = ['AK','AL','AR','AZ','CA','CO','CT',
                 'DC',
                 'DE','FL','GA','HI','IA','ID','IL',
                 'IN','KS','KY','LA','MA','MD','ME',
                 'MI','MN','MO','MS','MT','NC', 'ND',
                 'NE','NH','NJ','NM','NV','NY','OH',
                 'OK','OR','PA','RI','SC','SD','TN',
                 'TX','UT','VA','VT','WA','WI','WV',
                 'WY']

len(state_names51)
Out[246]:
51
In [247]:
missing_states = [i for i in state_names51 if i not in
                train_state_names]

missing_states
Out[247]:
['ND']
In [248]:
for col in missing_states:
    name = 'addr_state:' + col
    if name in X_prep.columns:
        pass
    else:
        X_prep[name] = 0
In [249]:
# df_woe = get_woe_iv(X_prep,y_prep,'addr_state')
# plot_woe_plotly(df_woe)
In [250]:
"""
** do not forget the missing states.

 We create the following categories:
 'ND' 'NE' 'IA' NV' 'FL' 'HI' 'AL'
 'NM' 'VA'
 'NY'
 'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
 'CA'
 'UT' 'KY' 'AZ' 'NJ'
 'AR' 'MI' 'PA' 'OH' 'MN'
 'RI' 'MA' 'DE' 'SD' 'IN'
 'GA' 'WA' 'OR'
 'WI' 'MT'
 'TX'
 'IL' 'CT'
 'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
 'WV' 'NH' 'WY' 'DC' 'ME' 'ID'

 'ND_NE_IA_NV_FL_HI_AL' will be the reference category.

""";
In [251]:
multiple = [
    'ND_NE_IA_NV_FL_HI_AL',
    'NM_VA',
    'OK_TN_MO_LA_MD_NC',
    'UT_KY_AZ_NJ',
    'AR_MI_PA_OH_MN',
    'RI_MA_DE_SD_IN',
    'GA_WA_OR',
    'WI_MT',
    'IL_CT',
    'KS_SC_CO_VT_AK_MS',
    'WV_NH_WY_DC_ME_ID'
                    
    ]

dummies = ['addr_state:'+i for i in multiple]
for d in dummies:
    print(d)
addr_state:ND_NE_IA_NV_FL_HI_AL
addr_state:NM_VA
addr_state:OK_TN_MO_LA_MD_NC
addr_state:UT_KY_AZ_NJ
addr_state:AR_MI_PA_OH_MN
addr_state:RI_MA_DE_SD_IN
addr_state:GA_WA_OR
addr_state:WI_MT
addr_state:IL_CT
addr_state:KS_SC_CO_VT_AK_MS
addr_state:WV_NH_WY_DC_ME_ID
In [252]:
"""
Store these dummies:

addr_state:ND_NE_IA_NV_FL_HI_AL
addr_state:NM_VA
addr_state:OK_TN_MO_LA_MD_NC
addr_state:UT_KY_AZ_NJ
addr_state:AR_MI_PA_OH_MN
addr_state:RI_MA_DE_SD_IN
addr_state:GA_WA_OR
addr_state:WI_MT
addr_state:IL_CT
addr_state:KS_SC_CO_VT_AK_MS
addr_state:WV_NH_WY_DC_ME_ID

""";
In [253]:
sep = '_'
one = 'ND_NE_IA_NV_FL_HI_AL'
lst_one = one.split(sep)
lst_one
Out[253]:
['ND', 'NE', 'IA', 'NV', 'FL', 'HI', 'AL']
In [254]:
colname = 'addr_state'
new_cat = colname + ':' + one
new_cat
Out[254]:
'addr_state:ND_NE_IA_NV_FL_HI_AL'
In [255]:
lst = [X_prep[colname + ':' + i] for i in lst_one]
In [256]:
sum(lst).head(2)
Out[256]:
193817    0
174039    0
dtype: int64
In [257]:
colname = 'addr_state'
sep = '_'
for one in multiple:
    lst_one = one.split(sep)
    new_cat = colname + ':' + one
    lst = [X_prep[colname + ':' + i] for i in lst_one]
    X_prep[new_cat] = sum(lst)
In [258]:
X_prep.iloc[:2,-2:]
Out[258]:
addr_state:KS_SC_CO_VT_AK_MS addr_state:WV_NH_WY_DC_ME_ID
193817 0 0
174039 0 0

woe dummies: verification_status

In [259]:
col = 'verification_status'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  3
nulls =  0
Out[259]:
193817        Verified
174039    Not Verified
Name: verification_status, dtype: object
In [260]:
X_prep[col].value_counts()
Out[260]:
Verified           33473
Source Verified    29955
Not Verified       29829
Name: verification_status, dtype: int64
In [261]:
cols = [i for i in X_prep.columns if 'verification_status' in i]
X_prep[cols].head(2)
Out[261]:
verification_status verification_status:Not Verified verification_status:Source Verified verification_status:Verified
193817 Verified 0 0 1
174039 Not Verified 1 0 0
In [262]:
for col in cols:
    print(col)
verification_status
verification_status:Not Verified
verification_status:Source Verified
verification_status:Verified
In [263]:
"""
verification_status:Not Verified
verification_status:Source Verified
verification_status:Verified

""";
In [264]:
# they all have significant n_obs.
# no need to group them.

woe dummies: purpose

In [265]:
col = 'purpose'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  14
nulls =  0
Out[265]:
193817    debt_consolidation
174039    debt_consolidation
Name: purpose, dtype: object
In [266]:
cols = [i for i in X_prep.columns if 'purpose' in i]
X_prep[cols].head(2)
Out[266]:
purpose purpose:car purpose:credit_card purpose:debt_consolidation purpose:educational purpose:home_improvement purpose:house purpose:major_purchase purpose:medical purpose:moving purpose:other purpose:renewable_energy purpose:small_business purpose:vacation purpose:wedding
193817 debt_consolidation 0 0 1 0 0 0 0 0 0 0 0 0 0 0
174039 debt_consolidation 0 0 1 0 0 0 0 0 0 0 0 0 0 0
In [267]:
# df_woe = get_woe_iv(X_prep,y_prep,'purpose')
# plot_woe_plotly(df_woe)
In [268]:
dummies = \
"""
small_business__educational__moving__renewable_energy
other
house__medical__wedding__vacation
debt_consolidation
home_improvement__major_purchase__car
credit_card
""".strip().split('\n')

dummies = ['purpose:'+ i for i in dummies]
for d in dummies:
    print(d)
purpose:small_business__educational__moving__renewable_energy
purpose:other
purpose:house__medical__wedding__vacation
purpose:debt_consolidation
purpose:home_improvement__major_purchase__car
purpose:credit_card
In [269]:
multiple =[
    'small_business__educational__moving__renewable_energy',
    'house__medical__wedding__vacation',
    'home_improvement__major_purchase__car'

]
In [270]:
colname = 'purpose'
sep = '__'

multiple =[
    'small_business__educational__moving__renewable_energy',
    'house__medical__wedding__vacation',
    'home_improvement__major_purchase__car'
]

for one in multiple:
    lst_one = one.split(sep)
    new_cat = colname + ':' + one
    lst = [X_prep[colname + ':' + i] for i in lst_one]
    X_prep[new_cat] = sum(lst)
In [271]:
X_prep.iloc[:2,-2:]
Out[271]:
purpose:house__medical__wedding__vacation purpose:home_improvement__major_purchase__car
193817 0 0
174039 0 0

woe dummies: initial_list_status

In [272]:
col = 'initial_list_status'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  2
nulls =  0
Out[272]:
193817    f
174039    f
Name: initial_list_status, dtype: object
In [273]:
cols = [i for i in X_prep.columns if 'initial_list_status' in i]
X_prep[cols].head(2)
Out[273]:
initial_list_status initial_list_status:f initial_list_status:w
193817 f 1 0
174039 f 1 0
In [274]:
# we already have binary encoding.
In [275]:
dummies = """
initial_list_status:f
initial_list_status:w

""";

woe dummies: term_int

In [276]:
col = 'term_int'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  2
nulls =  0
Out[276]:
193817    36
174039    36
Name: term_int, dtype: int64
In [277]:
X_prep['term:36'] = np.where((X_prep['term_int'] == 36), 1, 0)

X_prep['term:60'] = np.where((X_prep['term_int'] == 60), 1, 0)
In [278]:
dummies = \
"""
term:36
term:60
""";

woe dummies continuous: emp_length_int

In [279]:
col = 'emp_length_int'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  11
nulls =  0
Out[279]:
193817    5.0
174039    1.0
Name: emp_length_int, dtype: float32
In [280]:
X_prep[col].value_counts()
Out[280]:
10.0    30273
1.0     13123
2.0      8072
3.0      7245
5.0      6195
4.0      5620
6.0      5339
7.0      5118
8.0      4529
0.0      4131
9.0      3612
Name: emp_length_int, dtype: int64
In [281]:
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
Out[281]:
emp_length_int
193817 5.0
174039 1.0
In [282]:
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe_plotly(df_woe)
emp_length_int n_obs woe diff_n_obs text_plotly
0 0.000000 4,131 -0.224541 8,992 n_obs: 4,131
WoE: -0.2245
1 1.000000 13,123 -0.058005 5,051 n_obs: 13,123
WoE: -0.058
2 2.000000 8,072 0.027248 827 n_obs: 8,072
WoE: 0.0272
3 3.000000 7,245 -0.012977 1,625 n_obs: 7,245
WoE: -0.013
4 4.000000 5,620 0.046928 575 n_obs: 5,620
WoE: 0.0469
5 5.000000 6,195 -0.014760 856 n_obs: 6,195
WoE: -0.0148
6 6.000000 5,339 -0.084708 221 n_obs: 5,339
WoE: -0.0847
7 7.000000 5,118 -0.065842 589 n_obs: 5,118
WoE: -0.0658
8 8.000000 4,529 -0.020316 917 n_obs: 4,529
WoE: -0.0203
9 9.000000 3,612 -0.042647 26,661 n_obs: 3,612
WoE: -0.0426
10 10.000000 30,273 0.087623 nan n_obs: 30,273
WoE: 0.0876
In [283]:
"""
emp_length:0
emp_length:1
emp_length:2_3
emp_length:4_5
emp_length:6_7
emp_length:8_9
emp_length:10

""";
In [284]:
X_prep['emp_length:0'] = np.where(X_prep['emp_length_int'].isin([0]), 1, 0)

X_prep['emp_length:1'] = np.where(X_prep['emp_length_int'].isin([1]), 1, 0)

X_prep['emp_length:2_3'] = np.where(X_prep['emp_length_int'].isin([2,3]), 1, 0)

X_prep['emp_length:4_5'] = np.where(X_prep['emp_length_int'].isin([4,5]), 1, 0)

X_prep['emp_length:6_7'] = np.where(X_prep['emp_length_int'].isin([6,7]), 1, 0)

X_prep['emp_length:8_9'] = np.where(X_prep['emp_length_int'].isin([8,9]), 1, 0)

X_prep['emp_length:10'] = np.where(X_prep['emp_length_int'].isin([10]), 1, 0)

woe dummies continuous: mths_since_issue_d

In [285]:
col = 'mths_since_issue_d'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  91
nulls =  0
Out[285]:
193817    90.0
174039    87.0
Name: mths_since_issue_d, dtype: float64
In [286]:
X_prep[colf] = pd.cut(X_prep[col], 50)
In [287]:
X_prep[colf].head(2)
Out[287]:
193817    (89.2, 91.0]
174039    (85.6, 87.4]
Name: mths_since_issue_d_factor, dtype: category
Categories (50, interval[float64]): [(63.91, 65.8] < (65.8, 67.6] < (67.6, 69.4] < (69.4, 71.2] ... (146.8, 148.6] < (148.6, 150.4] < (150.4, 152.2] < (152.2, 154.0]]
In [288]:
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
# plot_woe_plotly(df_woe)
In [289]:
bins = [0,66,70,75,82,93,128]
col = 'mths_since_issue_d'

create_bins_dummies_int(X_prep,col,bins)

cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
mths_since_issue_d:0_66
mths_since_issue_d:67_70
mths_since_issue_d:71_75
mths_since_issue_d:76_82
mths_since_issue_d:83_93
mths_since_issue_d:94_128
mths_since_issue_d:>=129
Out[289]:
mths_since_issue_d mths_since_issue_d_factor mths_since_issue_d:0_66 mths_since_issue_d:67_70 mths_since_issue_d:71_75 mths_since_issue_d:76_82 mths_since_issue_d:83_93 mths_since_issue_d:94_128 mths_since_issue_d:>=129
193817 90.0 (89.2, 91.0] 0 0 0 0 1 0 0
174039 87.0 (85.6, 87.4] 0 0 0 0 1 0 0
In [290]:
# after 93, there are very few obervations
# we can group them all to one or make the two.

"""
mths_since_issue_d:0_66
mths_since_issue_d:67_70
mths_since_issue_d:71_75
mths_since_issue_d:76_82
mths_since_issue_d:83_93
mths_since_issue_d:94_128
mths_since_issue_d:>=129

""";

woe dummies continuous: int_rate

In [291]:
col = 'int_rate'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  477
nulls =  0
Out[291]:
193817    10.16
174039    15.80
Name: int_rate, dtype: float32
In [292]:
X_prep[colf] = pd.cut(X_prep[col], 50)
In [293]:
X_prep[colf].head(2)
Out[293]:
193817    (9.961, 10.374]
174039    (15.74, 16.153]
Name: int_rate_factor, dtype: category
Categories (50, interval[float64]): [(5.399, 5.833] < (5.833, 6.246] < (6.246, 6.658] < (6.658, 7.071] ... (24.409, 24.822] < (24.822, 25.234] < (25.234, 25.647] < (25.647, 26.06]]
In [294]:
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
In [295]:
bins = [0,9.961,12.025, 15.74, 20.281,27]

# we have monotonically decreasing curve, its easy to partition.
# first few point are zigzag, we suspect low n_obs.
# there is low n_obs, so bundle them up.
#
# look at graph, where woe drops sharply

"""
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281

""";
In [296]:
bins = [0,9.961,12.025, 15.74, 20.281]
col = 'int_rate'

create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281
Out[296]:
int_rate int_rate_factor int_rate:0_9.961 int_rate:9.961_12.025 int_rate:12.025_15.74 int_rate:15.74_20.281 int_rate:>=20.281
193817 10.16 (9.961, 10.374] 0 1 0 0 0
174039 15.80 (15.74, 16.153] 0 0 0 1 0

woe dummies continuous: funded_amnt **

In [297]:
col = 'funded_amnt'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  1245
nulls =  0
Out[297]:
193817    5500
174039    6000
Name: funded_amnt, dtype: int32
In [298]:
X_prep[colf] = pd.cut(X_prep[col], 50)
In [299]:
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
In [300]:
# woe is almost constant
# woe does not depend on dependent variable
# we can remove the variable from analysis.

woe dummies continuous: mths_since_earliest_cr_line

In [301]:
col = 'mths_since_earliest_cr_line'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  613
nulls =  0
Out[301]:
193817    156.0
174039    293.0
Name: mths_since_earliest_cr_line, dtype: float64
In [302]:
X_prep[colf] = pd.cut(X_prep[col], 50)

df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

In [303]:
bins = [0,207, 350]
col = 'mths_since_earliest_cr_line'

create_bins_dummies_int(X_prep,col,bins)

cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
mths_since_earliest_cr_line:0_207
mths_since_earliest_cr_line:208_350
mths_since_earliest_cr_line:>=351
Out[303]:
mths_since_earliest_cr_line mths_since_earliest_cr_line_factor mths_since_earliest_cr_line:0_207 mths_since_earliest_cr_line:208_350 mths_since_earliest_cr_line:>=351
193817 156.0 (135.4, 159.38] 1 0 0
174039 293.0 (279.28, 303.26] 0 1 0
In [304]:
"""
Ignore first few low numbers,
we see upward trend

mths_since_earliest_cr_line:0_207
mths_since_earliest_cr_line:208_350
mths_since_earliest_cr_line:>=351

""";

woe dummies continuous: delinq_2yrs **

In [305]:
col = 'delinq_2yrs'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  19
nulls =  0
Out[305]:
193817    0.0
174039    1.0
Name: delinq_2yrs, dtype: float32
In [306]:
X_prep[col].value_counts()
Out[306]:
0.0     76625
1.0     11186
2.0      3286
3.0      1150
4.0       457
5.0       249
6.0       137
7.0        70
8.0        33
9.0        21
10.0       14
12.0        8
11.0        7
13.0        4
15.0        3
22.0        2
17.0        2
14.0        2
18.0        1
Name: delinq_2yrs, dtype: int64
In [307]:
# we do not want factor here.
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)

plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

In [308]:
"""
we might be attemped to ignore the feature
but this might be useful feature

we see lots of zeros, make it one feature.

then make 1-3 and greater than 3.

delinq_2yrs:0
delinq_2yrs:1_3
delinq_2yrs:>=3
""";
In [309]:
X_prep['delinq_2yrs:0'] = \
  np.where((X_prep['delinq_2yrs'] ==0), 1, 0)

X_prep['delinq_2yrs:1-3'] = \
  np.where((X_prep['delinq_2yrs'] >= 1) & 
           (X_prep['delinq_2yrs'] <= 3), 1, 0)

X_prep['delinq_2yrs:>=3'] = \
  np.where((X_prep['delinq_2yrs'] > 3), 1, 0)

woe dummies continuous: inq_last_6mths

In [310]:
col = 'inq_last_6mths'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  16
nulls =  0
Out[310]:
193817    1.0
174039    0.0
Name: inq_last_6mths, dtype: float32
In [311]:
X_prep[col].value_counts()
Out[311]:
0.0     48486
1.0     25848
2.0     11517
3.0      5028
4.0      1489
5.0       551
6.0       247
7.0        42
8.0        23
9.0        12
10.0        5
11.0        3
12.0        2
15.0        2
14.0        1
24.0        1
Name: inq_last_6mths, dtype: int64
In [312]:
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)

plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning:

divide by zero encountered in log

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

In [313]:
"""
we see lots of zeros, make it one feature.

then make 1-3 and greater than 3.

inq_last_6mths:0
inq_last_6mths:1
inq_last_6mths:2_3
inq_last_6mths:>=3
""";
In [314]:
X_prep['inq_last_6mths:0'] = \
  np.where((X_prep['inq_last_6mths'] ==0), 1, 0)

X_prep['inq_last_6mths:1'] = \
  np.where((X_prep['inq_last_6mths'] ==1), 1, 0)

X_prep['inq_last_6mths:2_3'] = \
  np.where((X_prep['inq_last_6mths'] > 1) & 
           (X_prep['inq_last_6mths'] <= 3), 1, 0)

X_prep['inq_last_6mths:>3'] = \
  np.where((X_prep['inq_last_6mths'] > 3), 1, 0)

woe dummies continuous: open_acc

In [315]:
col = 'open_acc'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  55
nulls =  0
Out[315]:
193817    5.0
174039    7.0
Name: open_acc, dtype: float32
In [316]:
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)

plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning:

divide by zero encountered in log

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

In [317]:
X_prep[col].value_counts().sort_index()[lambda x: x>600]
Out[317]:
3.0     1125
4.0     2562
5.0     4199
6.0     6183
7.0     7520
8.0     8415
9.0     8766
10.0    8451
11.0    7867
12.0    6854
13.0    5983
14.0    5031
15.0    3997
16.0    3334
17.0    2656
18.0    2206
19.0    1754
20.0    1349
21.0    1048
22.0     790
23.0     658
Name: open_acc, dtype: int64
In [318]:
X_prep[col+':<=3'] = \
  np.where((X_prep[col] <=3), 1, 0)

X_prep[col+':4_5'] = \
  np.where((X_prep[col] > 3) & 
           (X_prep[col] <= 5), 1, 0)

X_prep[col+':6_13'] = \
  np.where((X_prep[col] > 5) & 
           (X_prep[col] <= 13), 1, 0)

X_prep[col+':13_17'] = \
  np.where((X_prep[col] > 13) & 
           (X_prep[col] <= 17), 1, 0)

X_prep[col+':>17'] = \
  np.where((X_prep[col] > 17), 1, 0)
In [319]:
"""
There are many small number and woe curve is flat.
look at n_obs and group them.

open_acc:<=3
open_acc:4_5
open_acc:6_13
open_acc:13_17
open_acc:>17

""";
In [320]:
# df_woe

woe dummies continuous: pub_rec

In [321]:
col = 'pub_rec'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  17
nulls =  0
Out[321]:
193817    0.0
174039    0.0
Name: pub_rec, dtype: float32
In [322]:
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)

plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

In [323]:
X_prep[col].value_counts().sort_index()
Out[323]:
0.0     81014
1.0     10635
2.0      1098
3.0       322
4.0        89
5.0        45
6.0        26
7.0        10
8.0         7
9.0         2
10.0        2
11.0        2
13.0        1
16.0        1
17.0        1
18.0        1
19.0        1
Name: pub_rec, dtype: int64
In [324]:
X_prep[col+':0'] = \
  np.where((X_prep[col] ==0), 1, 0)

X_prep[col+':1'] = \
  np.where((X_prep[col] ==1), 1, 0)

X_prep[col+':2'] = \
  np.where((X_prep[col] ==2), 1, 0)

X_prep[col+':3'] = \
  np.where((X_prep[col] ==3), 1, 0)


X_prep[col+':>3'] = \
  np.where((X_prep[col] > 3), 1, 0)
In [325]:
"""
pub_rec:0
pub_rec:1
pub_rec:2
pub_rec:3
pub_rec:>3

""";

woe dummies continuous: total_acc

In [326]:
col = 'total_acc'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  99
nulls =  0
Out[326]:
193817     6.0
174039    16.0
Name: total_acc, dtype: float32
In [327]:
X_prep[col].value_counts()[lambda x: x>9000]
Out[327]:
Series([], Name: total_acc, dtype: int64)
In [328]:
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

In [329]:
bins = [0,6.24,21.84,37.44,40.56]
col = 'total_acc'

create_bins_dummies_cont(X_prep,col,bins)

cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
total_acc:0_6.24
total_acc:6.24_21.84
total_acc:21.84_37.44
total_acc:37.44_40.56
total_acc:>=40.56
Out[329]:
total_acc total_acc_factor total_acc:0_6.24 total_acc:6.24_21.84 total_acc:21.84_37.44 total_acc:37.44_40.56 total_acc:>=40.56
193817 6.0 (4.68, 7.02] 1 0 0 0 0
174039 16.0 (14.04, 16.38] 0 1 0 0 0
In [330]:
"""
total_acc:0_6.24
total_acc:6.24_21.84
total_acc:21.84_37.44
total_acc:37.44_40.56
total_acc:>=40.56

""";

woe dummies continuous: acc_now_delinq

In [331]:
col = 'acc_now_delinq'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  5
nulls =  0
Out[331]:
193817    0.0
174039    0.0
Name: acc_now_delinq, dtype: float32
In [332]:
X_prep[col].value_counts()
Out[332]:
0.0    92922
1.0      323
2.0        9
3.0        2
4.0        1
Name: acc_now_delinq, dtype: int64
In [333]:
X_prep[col+':0'] = \
  np.where((X_prep[col] ==0), 1, 0)

X_prep[col+'>=1'] = \
  np.where((X_prep[col] > 3), 1, 0)
In [334]:
"""
acc_now_delinq:0
acc_now_delinq:>=1

""";

woe dummies continuous: total_rev_hi_lim

In [335]:
col = 'total_rev_hi_lim'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  4925
nulls =  0
Out[335]:
193817    9600.0
174039    7300.0
Name: total_rev_hi_lim, dtype: float32
In [336]:
X_prep[colf] = pd.cut(X_prep[col], 100)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

Out[336]:
total_rev_hi_lim_factor n_obs woe diff_n_obs text_plotly
0 (-1090.7, 10907.0] 21484 -0.173011 7121.0 n_obs: 21,484<br>WoE: -0.173<br>
1 (10907.0, 21814.0] 28605 -0.101744 10872.0 n_obs: 28,605<br>WoE: -0.1017<br>
2 (21814.0, 32721.0] 17733 0.011071 7346.0 n_obs: 17,733<br>WoE: 0.0111<br>
3 (32721.0, 43628.0] 10387 0.137990 4584.0 n_obs: 10,387<br>WoE: 0.138<br>
4 (43628.0, 54535.0] 5803 0.256852 2541.0 n_obs: 5,803<br>WoE: 0.2569<br>
In [337]:
df_woe = df_woe[df_woe.n_obs > 1000]
df_woe
Out[337]:
total_rev_hi_lim_factor n_obs woe diff_n_obs text_plotly
0 (-1090.7, 10907.0] 21484 -0.173011 7121.0 n_obs: 21,484<br>WoE: -0.173<br>
1 (10907.0, 21814.0] 28605 -0.101744 10872.0 n_obs: 28,605<br>WoE: -0.1017<br>
2 (21814.0, 32721.0] 17733 0.011071 7346.0 n_obs: 17,733<br>WoE: 0.0111<br>
3 (32721.0, 43628.0] 10387 0.137990 4584.0 n_obs: 10,387<br>WoE: 0.138<br>
4 (43628.0, 54535.0] 5803 0.256852 2541.0 n_obs: 5,803<br>WoE: 0.2569<br>
5 (54535.0, 65442.0] 3262 0.444045 1303.0 n_obs: 3,262<br>WoE: 0.444<br>
6 (65442.0, 76349.0] 1959 0.521514 761.0 n_obs: 1,959<br>WoE: 0.5215<br>
7 (76349.0, 87256.0] 1198 0.580151 398.0 n_obs: 1,198<br>WoE: 0.5802<br>
In [338]:
plot_woe(df_woe,xrot=90,color='k')
In [339]:
"""
total_rev_hi_lim:<=5k
total_rev_hi_lim:5k_10k
total_rev_hi_lim:10k_20k
total_rev_hi_lim:20k_30k
total_rev_hi_lim:30k_40k
total_rev_hi_lim:40k_50k
total_rev_hi_lim:50k_65k
total_rev_hi_lim:65k_80k
total_rev_hi_lim:>80k

""";
In [340]:
X_prep['total_rev_hi_lim:<=5k'] = \
np.where((X_prep['total_rev_hi_lim'] <= 5000), 1, 0)

X_prep['total_rev_hi_lim:5k_10k'] = \
np.where((X_prep['total_rev_hi_lim'] > 5000) & 
         (X_prep['total_rev_hi_lim'] <= 10000), 1, 0)

X_prep['total_rev_hi_lim:10k_20k'] = \
np.where((X_prep['total_rev_hi_lim'] > 10000) & 
         (X_prep['total_rev_hi_lim'] <= 20000), 1, 0)

X_prep['total_rev_hi_lim:20k_30k'] = \
np.where((X_prep['total_rev_hi_lim'] > 20000) & 
         (X_prep['total_rev_hi_lim'] <= 30000), 1, 0)

X_prep['total_rev_hi_lim:30k_40k'] = \
np.where((X_prep['total_rev_hi_lim'] > 30000) & 
         (X_prep['total_rev_hi_lim'] <= 40000), 1, 0)

X_prep['total_rev_hi_lim:40k_50k'] = \
np.where((X_prep['total_rev_hi_lim'] > 40000) & 
         (X_prep['total_rev_hi_lim'] <= 50000), 1, 0)

X_prep['total_rev_hi_lim:50k_65k'] = \
np.where((X_prep['total_rev_hi_lim'] > 50000) & 
         (X_prep['total_rev_hi_lim'] <= 65000), 1, 0)

X_prep['total_rev_hi_lim:65k_80k'] = \
np.where((X_prep['total_rev_hi_lim'] > 65000) & 
         (X_prep['total_rev_hi_lim'] <= 80000), 1, 0)

X_prep['total_rev_hi_lim:>80k'] = np.where((X_prep['total_rev_hi_lim'] > 80000), 1, 0)

woe dummies continuous: installment

In [341]:
col = 'installment'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  26843
nulls =  0
Out[341]:
193817    177.889999
174039    210.360001
Name: installment, dtype: float32
In [342]:
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)

df_woe.head()
Out[342]:
installment_factor n_obs woe diff_n_obs text_plotly
0 (14.518, 43.754] 454 0.159373 780.0 n_obs: 454<br>WoE: 0.1594<br>
1 (43.754, 71.599] 1234 0.130200 476.0 n_obs: 1,234<br>WoE: 0.1302<br>
2 (71.599, 99.443] 1710 0.067130 572.0 n_obs: 1,710<br>WoE: 0.0671<br>
3 (99.443, 127.288] 2282 -0.038149 144.0 n_obs: 2,282<br>WoE: -0.0381<br>
4 (127.288, 155.132] 2426 0.026251 1816.0 n_obs: 2,426<br>WoE: 0.0263<br>
In [352]:
df_woe = df_woe[df_woe.n_obs > 500]

df_woe.style\
.background_gradient(subset=['n_obs'],cmap='Blues')\
.background_gradient(subset=['woe'],cmap='Reds')\
.format({'n_obs':"{:,d}", 'diff_n_obs':"{:,.0f}"})
Out[352]:
installment_factor n_obs woe diff_n_obs text_plotly
1 (43.754, 71.599] 1,234 0.130200 476 n_obs: 1,234
WoE: 0.1302
2 (71.599, 99.443] 1,710 0.067130 572 n_obs: 1,710
WoE: 0.0671
3 (99.443, 127.288] 2,282 -0.038149 144 n_obs: 2,282
WoE: -0.0381
4 (127.288, 155.132] 2,426 0.026251 1,816 n_obs: 2,426
WoE: 0.0263
5 (155.132, 182.976] 4,242 0.033381 41 n_obs: 4,242
WoE: 0.0334
6 (182.976, 210.821] 4,283 0.126215 541 n_obs: 4,283
WoE: 0.1262
7 (210.821, 238.665] 3,742 0.096831 821 n_obs: 3,742
WoE: 0.0968
8 (238.665, 266.51] 4,563 0.129521 422 n_obs: 4,563
WoE: 0.1295
9 (266.51, 294.354] 4,985 -0.028918 187 n_obs: 4,985
WoE: -0.0289
10 (294.354, 322.198] 5,172 0.067806 1,278 n_obs: 5,172
WoE: 0.0678
11 (322.198, 350.043] 6,450 -0.025363 1,643 n_obs: 6,450
WoE: -0.0254
12 (350.043, 377.887] 4,807 -0.101960 340 n_obs: 4,807
WoE: -0.102
13 (377.887, 405.732] 4,467 0.126107 1,182 n_obs: 4,467
WoE: 0.1261
14 (405.732, 433.576] 3,285 -0.125048 120 n_obs: 3,285
WoE: -0.125
15 (433.576, 461.42] 3,165 -0.135761 706 n_obs: 3,165
WoE: -0.1358
16 (461.42, 489.265] 3,871 0.099491 81 n_obs: 3,871
WoE: 0.0995
17 (489.265, 517.109] 3,952 -0.078377 1,163 n_obs: 3,952
WoE: -0.0784
18 (517.109, 544.954] 2,789 -0.201587 101 n_obs: 2,789
WoE: -0.2016
19 (544.954, 572.798] 2,688 -0.016065 357 n_obs: 2,688
WoE: -0.0161
20 (572.798, 600.642] 2,331 -0.177846 33 n_obs: 2,331
WoE: -0.1778
21 (600.642, 628.487] 2,298 0.183212 142 n_obs: 2,298
WoE: 0.1832
22 (628.487, 656.331] 2,156 0.076731 123 n_obs: 2,156
WoE: 0.0767
23 (656.331, 684.176] 2,279 0.046221 687 n_obs: 2,279
WoE: 0.0462
24 (684.176, 712.02] 1,592 -0.208344 376 n_obs: 1,592
WoE: -0.2083
25 (712.02, 739.864] 1,216 0.068516 113 n_obs: 1,216
WoE: 0.0685
26 (739.864, 767.709] 1,329 0.167071 145 n_obs: 1,329
WoE: 0.1671
27 (767.709, 795.553] 1,184 0.075039 138 n_obs: 1,184
WoE: 0.075
28 (795.553, 823.398] 1,046 0.138044 90 n_obs: 1,046
WoE: 0.138
29 (823.398, 851.242] 956 0.026938 88 n_obs: 956
WoE: 0.0269
30 (851.242, 879.086] 1,044 0.051178 272 n_obs: 1,044
WoE: 0.0512
31 (879.086, 906.931] 772 -0.060162 204 n_obs: 772
WoE: -0.0602
32 (906.931, 934.775] 568 -0.360655 35 n_obs: 568
WoE: -0.3607
33 (934.775, 962.62] 603 -0.234802 126 n_obs: 603
WoE: -0.2348
In [360]:
plot_woe(df_woe,xrot=90,color='k')
In [361]:
bins = [0, 127,210,266,294,322,378,406,433,517,545,
       629,712,796,852,880,935,963]

col = 'installment'

create_bins_dummies_cont(X_prep,col,bins)

cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
installment:0_127
installment:127_210
installment:210_266
installment:266_294
installment:294_322
installment:322_378
installment:378_406
installment:406_433
installment:433_517
installment:517_545
installment:545_629
installment:629_712
installment:712_796
installment:796_852
installment:852_880
installment:880_935
installment:935_963
installment:>=963
Out[361]:
installment installment_factor installment:0_127 installment:127_210 installment:210_266 installment:266_294 installment:294_322 installment:322_378 installment:378_406 installment:406_433 installment:433_517 installment:517_545 installment:545_629 installment:629_712 installment:712_796 installment:796_852 installment:852_880 installment:880_935 installment:935_963 installment:>=963
193817 177.889999 (155.132, 182.976] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
174039 210.360001 (182.976, 210.821] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
In [362]:
"""
installment:0_127
installment:127_210
installment:210_266
installment:266_294
installment:294_322
installment:322_378
installment:378_406
installment:406_433
installment:433_517
installment:517_545
installment:545_629
installment:629_712
installment:712_796
installment:796_852
installment:852_880
installment:880_935
installment:935_963
installment:>=963

""";

woe dummies continuous: annual_inc

In [363]:
col = 'annual_inc'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  8952
nulls =  0
Out[363]:
193817    18572.849609
174039    20000.000000
Name: annual_inc, dtype: float32
In [364]:
X_prep[col].describe()
Out[364]:
count    9.325700e+04
mean     7.338044e+04
std      5.973804e+04
min      2.000000e+03
25%      4.500000e+04
50%      6.300000e+04
75%      8.826200e+04
max      7.446395e+06
Name: annual_inc, dtype: float64
In [365]:
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)

df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

Out[365]:
annual_inc_factor n_obs woe diff_n_obs text_plotly
0 (-5444.395, 150887.9] 89166 -0.018654 85606.0 n_obs: 89,166<br>WoE: -0.0187<br>
1 (150887.9, 299775.8] 3560 0.498201 3169.0 n_obs: 3,560<br>WoE: 0.4982<br>
2 (299775.8, 448663.7] 391 0.503297 317.0 n_obs: 391<br>WoE: 0.5033<br>
3 (448663.7, 597551.6] 74 0.526648 47.0 n_obs: 74<br>WoE: 0.5266<br>
4 (597551.6, 746439.5] 27 inf 12.0 n_obs: 27<br>WoE: inf<br>
In [366]:
# we see too many obs in first category. increase bins to 100
X_prep[colf] = pd.cut(X_prep[col], 100)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)

df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

Out[366]:
annual_inc_factor n_obs woe diff_n_obs text_plotly
0 (-5444.395, 76443.95] 61174 -0.133182 33182.0 n_obs: 61,174<br>WoE: -0.1332<br>
1 (76443.95, 150887.9] 27992 0.274730 25035.0 n_obs: 27,992<br>WoE: 0.2747<br>
2 (150887.9, 225331.85] 2957 0.468022 2354.0 n_obs: 2,957<br>WoE: 0.468<br>
3 (225331.85, 299775.8] 603 0.658820 299.0 n_obs: 603<br>WoE: 0.6588<br>
4 (299775.8, 374219.75] 304 0.555222 217.0 n_obs: 304<br>WoE: 0.5552<br>
In [367]:
"""
Looking at factors of annual income, we can make two
groups with 150k (or 140k) and analyze the group
earning more than 140k separately.

""";
In [368]:
X_prep_tmp = X_prep[X_prep[col] <= 140000]

X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
y_prep_tmp = y_prep[X_prep_tmp.index]

df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)

df_woe.head()
Out[368]:
annual_inc_factor n_obs woe diff_n_obs text_plotly
0 (1862.0, 4760.0] 4 -0.976810 9.0 n_obs: 4<br>WoE: -0.9768<br>
1 (4760.0, 7520.0] 13 -0.871449 58.0 n_obs: 13<br>WoE: -0.8714<br>
2 (7520.0, 10280.0] 71 -0.482791 84.0 n_obs: 71<br>WoE: -0.4828<br>
3 (10280.0, 13040.0] 155 -0.473706 126.0 n_obs: 155<br>WoE: -0.4737<br>
4 (13040.0, 15800.0] 281 -0.662220 129.0 n_obs: 281<br>WoE: -0.6622<br>
In [369]:
X_prep_tmp[col].hist()
Out[369]:
<matplotlib.axes._subplots.AxesSubplot at 0x1aa38c128>
In [370]:
plot_woe(df_woe,xrot=90,color='k')
In [371]:
"""
WoE is monotonically increasing.
We can break into 10 equal parts.

""";
In [372]:
col = 'annual_inc'
bins = [20_000,30_000,40_000,50_000,
        60_000,70_000,80_000,90_000,
        100_000,120_000,140_000]

# we need one more labels than bins
labels = ['<20k',
          '20k_30k','30k_40k','40k_50k',
          '50k_60k','60k_70k','70k_80k',
          '80k_90k','90k_100k','100k_120k','120k_140k',
          '>140k']

create_dummies_left_inclusive(X_prep,col,bins,labels)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
annual_inc:<20k
annual_inc:20k_30k
annual_inc:30k_40k
annual_inc:40k_50k
annual_inc:50k_60k
annual_inc:60k_70k
annual_inc:70k_80k
annual_inc:80k_90k
annual_inc:90k_100k
annual_inc:100k_120k
annual_inc:120k_140k
annual_inc:>140k
Out[372]:
annual_inc annual_inc_factor annual_inc:<20k annual_inc:20k_30k annual_inc:30k_40k annual_inc:40k_50k annual_inc:50k_60k annual_inc:60k_70k annual_inc:70k_80k annual_inc:80k_90k annual_inc:90k_100k annual_inc:100k_120k annual_inc:120k_140k annual_inc:>140k
193817 18572.849609 (-5444.395, 76443.95] 1 0 0 0 0 0 0 0 0 0 0 0
174039 20000.000000 (-5444.395, 76443.95] 1 1 0 0 0 0 0 0 0 0 0 0

woe dummies continuous: mths_since_last_delinq **

In [373]:
col = 'mths_since_last_delinq'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  113
nulls =  49936
Out[373]:
193817     NaN
174039    22.0
Name: mths_since_last_delinq, dtype: float32
In [374]:
X_prep[col].hist()
Out[374]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a8c06b70>
In [375]:
# create missing column
vals = np.where((X_prep[col].isnull()), 1, 0)
X_prep[col+':missing'] = vals

print(col+':missing')
mths_since_last_delinq:missing
In [376]:
"""
Do not forget missing:

mths_since_last_delinq:missing
mths_since_last_delinq:0_3
mths_since_last_delinq:4_30
mths_since_last_delinq:31_56
mths_since_last_delinq:>=57

""";
In [377]:
X_prep_tmp = X_prep[pd.notnull(X_prep[col])]
y_prep_tmp = y_prep[X_prep_tmp.index]

X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)

df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)

df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning:

divide by zero encountered in log

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/algorithms.py:1926: RuntimeWarning:

invalid value encountered in subtract

Out[377]:
mths_since_last_delinq_factor n_obs woe diff_n_obs text_plotly
0 (-0.152, 3.04] 1369 -0.268930 608.0 n_obs: 1,369<br>WoE: -0.2689<br>
1 (3.04, 6.08] 1977 -0.047493 515.0 n_obs: 1,977<br>WoE: -0.0475<br>
2 (6.08, 9.12] 2492 -0.105770 34.0 n_obs: 2,492<br>WoE: -0.1058<br>
3 (9.12, 12.16] 2458 -0.045329 19.0 n_obs: 2,458<br>WoE: -0.0453<br>
4 (12.16, 15.2] 2439 0.013693 108.0 n_obs: 2,439<br>WoE: 0.0137<br>
In [378]:
plot_woe(df_woe,xrot=90,color='k')
In [379]:
# month is an integer, make intervals integer.
# NOTE: create missing category
In [380]:
col = 'mths_since_last_delinq'
bins = [0,3,30,56]
create_bins_dummies_int(X_prep,col,bins)    
mths_since_last_delinq:0_3
mths_since_last_delinq:4_30
mths_since_last_delinq:31_56
mths_since_last_delinq:>=57
In [381]:
cols = [i for i in X_prep.columns if 'delinq' in i]
X_prep[cols].head(2)
Out[381]:
delinq_2yrs mths_since_last_delinq acc_now_delinq delinq_2yrs:0 delinq_2yrs:1-3 delinq_2yrs:>=3 acc_now_delinq:0 acc_now_delinq>=1 mths_since_last_delinq:missing mths_since_last_delinq:0_3 mths_since_last_delinq:4_30 mths_since_last_delinq:31_56 mths_since_last_delinq:>=57
193817 0.0 NaN 0.0 1 0 0 1 0 1 0 0 0 0
174039 1.0 22.0 0.0 0 1 0 1 0 0 0 1 0 0

woe dummies continuous: dti **

In [382]:
col = 'dti'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  3868
nulls =  0
Out[382]:
193817    13.370000
174039    19.559999
Name: dti, dtype: float32
In [383]:
X_prep[col].hist()
Out[383]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a071db00>
In [384]:
# we see very few observation after 35, we make one group
# we can create temp df with range 0 to 35 and bin into 50
# parts
In [385]:
X_prep_tmp = X_prep[X_prep[col]<35]
y_prep_tmp = y_prep[X_prep_tmp.index]
In [386]:
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)

df_woe.head()
Out[386]:
dti_factor n_obs woe diff_n_obs text_plotly
0 (-0.035, 0.7] 355 -0.141722 81.0 n_obs: 355<br>WoE: -0.1417<br>
1 (0.7, 1.4] 436 -0.007521 75.0 n_obs: 436<br>WoE: -0.0075<br>
2 (1.4, 2.099] 511 0.038375 134.0 n_obs: 511<br>WoE: 0.0384<br>
3 (2.099, 2.799] 645 0.517080 138.0 n_obs: 645<br>WoE: 0.5171<br>
4 (2.799, 3.499] 783 0.208073 209.0 n_obs: 783<br>WoE: 0.2081<br>
In [387]:
plot_woe(df_woe,xrot=90,color='k')
In [388]:
bins = [0,0.7,1.4,2.1,3.5,4.9,5.6,6.3,7,9,9.8,10.5,
       11.2,11.9,12.6,13.3,14,14.7,17.5,18.2,21,21.7,
       22.4,23.1,25.2,25.9,29.4,30.1,30.8,31.5,32.2,32.9,
       33.6]

create_bins_dummies_cont(X_prep,col,bins)  
dti:0_0.7
dti:0.7_1.4
dti:1.4_2.1
dti:2.1_3.5
dti:3.5_4.9
dti:4.9_5.6
dti:5.6_6.3
dti:6.3_7
dti:7_9
dti:9_9.8
dti:9.8_10.5
dti:10.5_11.2
dti:11.2_11.9
dti:11.9_12.6
dti:12.6_13.3
dti:13.3_14
dti:14_14.7
dti:14.7_17.5
dti:17.5_18.2
dti:18.2_21
dti:21_21.7
dti:21.7_22.4
dti:22.4_23.1
dti:23.1_25.2
dti:25.2_25.9
dti:25.9_29.4
dti:29.4_30.1
dti:30.1_30.8
dti:30.8_31.5
dti:31.5_32.2
dti:32.2_32.9
dti:32.9_33.6
dti:>=33.6

woe dummies continuous: mths_since_last_record ** missing

In [389]:
col = 'mths_since_last_record'
colf = col + '_factor'

print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())

X_prep[col].head(2)
unique =  123
nulls =  80742
Out[389]:
193817   NaN
174039   NaN
Name: mths_since_last_record, dtype: float32
In [390]:
X_prep[col].hist()
Out[390]:
<matplotlib.axes._subplots.AxesSubplot at 0x18bf28320>
In [391]:
# create missing column
vals = np.where((X_prep[col].isnull()), 1, 0)
X_prep[col+':missing'] = vals

print(col+':missing')
mths_since_last_record:missing
In [392]:
"""
mths_since_last_record:missing
mths_since_last_record:0_2
mths_since_last_record:3_22
mths_since_last_record:23_46
mths_since_last_record:47_68
mths_since_last_record:69_85
mths_since_last_record:>=86

""";
In [393]:
X_prep_tmp = X_prep[pd.notnull(X_prep[col])]
y_prep_tmp = y_prep[X_prep_tmp.index]

X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)

df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)

df_woe.head()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning:

divide by zero encountered in log

Out[393]:
mths_since_last_record_factor n_obs woe diff_n_obs text_plotly
0 (-0.129, 2.58] 287 -1.175688 257.0 n_obs: 287<br>WoE: -1.1757<br>
1 (2.58, 5.16] 30 -1.132177 2.0 n_obs: 30<br>WoE: -1.1322<br>
2 (5.16, 7.74] 32 1.290209 24.0 n_obs: 32<br>WoE: 1.2902<br>
3 (7.74, 10.32] 56 0.727902 13.0 n_obs: 56<br>WoE: 0.7279<br>
4 (10.32, 12.9] 43 0.133490 32.0 n_obs: 43<br>WoE: 0.1335<br>
In [394]:
plot_woe(df_woe,xrot=90,color='k')
In [395]:
# look at large falling line and create groups

bins = [0,2,22,46,68,85]
create_bins_dummies_int(X_prep,col,bins)
mths_since_last_record:0_2
mths_since_last_record:3_22
mths_since_last_record:23_46
mths_since_last_record:47_68
mths_since_last_record:69_85
mths_since_last_record:>=86

Save the train data and repeat for test

In [396]:
print(X_prep.shape)
X_prep.head(2)
(93257, 351)
Out[396]:
Unnamed: 0 id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type acc_now_delinq tot_coll_amt tot_cur_bal total_rev_hi_lim issue_d_date earliest_cr_line_date last_pymnt_d_date last_credit_pull_d_date mths_since_issue_d mths_since_earliest_cr_line mths_since_last_pymnt_d mths_since_last_credit_pull_d term_int emp_length_int loan_income_ratio grade:A grade:B grade:C grade:D grade:E grade:F grade:G sub_grade:A1 sub_grade:A2 sub_grade:A3 sub_grade:A4 sub_grade:A5 sub_grade:B1 sub_grade:B2 sub_grade:B3 sub_grade:B4 sub_grade:B5 sub_grade:C1 sub_grade:C2 sub_grade:C3 sub_grade:C4 sub_grade:C5 sub_grade:D1 sub_grade:D2 sub_grade:D3 sub_grade:D4 sub_grade:D5 sub_grade:E1 sub_grade:E2 sub_grade:E3 sub_grade:E4 sub_grade:E5 sub_grade:F1 sub_grade:F2 sub_grade:F3 sub_grade:F4 sub_grade:F5 sub_grade:G1 sub_grade:G2 sub_grade:G3 sub_grade:G4 sub_grade:G5 home_ownership:ANY home_ownership:MORTGAGE home_ownership:NONE home_ownership:OTHER home_ownership:OWN home_ownership:RENT verification_status:Not Verified verification_status:Source Verified verification_status:Verified loan_status:Charged Off loan_status:Current loan_status:Default loan_status:Does not meet the credit policy. Status:Charged Off loan_status:Does not meet the credit policy. Status:Fully Paid loan_status:Fully Paid loan_status:In Grace Period loan_status:Late (16-30 days) loan_status:Late (31-120 days) purpose:car purpose:credit_card purpose:debt_consolidation purpose:educational purpose:home_improvement purpose:house purpose:major_purchase purpose:medical purpose:moving purpose:other purpose:renewable_energy purpose:small_business purpose:vacation purpose:wedding addr_state:AK addr_state:AL addr_state:AR addr_state:AZ addr_state:CA addr_state:CO addr_state:CT addr_state:DC addr_state:DE addr_state:FL addr_state:GA addr_state:HI addr_state:IA addr_state:ID addr_state:IL addr_state:IN addr_state:KS addr_state:KY addr_state:LA addr_state:MA addr_state:MD addr_state:ME addr_state:MI addr_state:MN addr_state:MO addr_state:MS addr_state:MT addr_state:NC addr_state:NE addr_state:NH addr_state:NJ addr_state:NM addr_state:NV addr_state:NY addr_state:OH addr_state:OK addr_state:OR addr_state:PA addr_state:RI addr_state:SC addr_state:SD addr_state:TN addr_state:TX addr_state:UT addr_state:VA addr_state:VT addr_state:WA addr_state:WI addr_state:WV addr_state:WY initial_list_status:f initial_list_status:w home_ownership:RENT_OTHER_NONE_ANY addr_state:ND addr_state:ND_NE_IA_NV_FL_HI_AL addr_state:NM_VA addr_state:OK_TN_MO_LA_MD_NC addr_state:UT_KY_AZ_NJ addr_state:AR_MI_PA_OH_MN addr_state:RI_MA_DE_SD_IN addr_state:GA_WA_OR addr_state:WI_MT addr_state:IL_CT addr_state:KS_SC_CO_VT_AK_MS addr_state:WV_NH_WY_DC_ME_ID purpose:small_business__educational__moving__renewable_energy purpose:house__medical__wedding__vacation purpose:home_improvement__major_purchase__car term:36 term:60 emp_length:0 emp_length:1 emp_length:2_3 emp_length:4_5 emp_length:6_7 emp_length:8_9 emp_length:10 mths_since_issue_d_factor mths_since_issue_d:0_66 mths_since_issue_d:67_70 mths_since_issue_d:71_75 mths_since_issue_d:76_82 mths_since_issue_d:83_93 mths_since_issue_d:94_128 mths_since_issue_d:>=129 int_rate_factor int_rate:0_9.961 int_rate:9.961_12.025 int_rate:12.025_15.74 int_rate:15.74_20.281 int_rate:>=20.281 funded_amnt_factor mths_since_earliest_cr_line_factor mths_since_earliest_cr_line:0_207 mths_since_earliest_cr_line:208_350 mths_since_earliest_cr_line:>=351 delinq_2yrs:0 delinq_2yrs:1-3 delinq_2yrs:>=3 inq_last_6mths:0 inq_last_6mths:1 inq_last_6mths:2_3 inq_last_6mths:>3 open_acc:<=3 open_acc:4_5 open_acc:6_13 open_acc:13_17 open_acc:>17 pub_rec:0 pub_rec:1 pub_rec:2 pub_rec:3 pub_rec:>3 total_acc_factor total_acc:0_6.24 total_acc:6.24_21.84 total_acc:21.84_37.44 total_acc:37.44_40.56 total_acc:>=40.56 acc_now_delinq:0 acc_now_delinq>=1 total_rev_hi_lim_factor total_rev_hi_lim:<=5k total_rev_hi_lim:5k_10k total_rev_hi_lim:10k_20k total_rev_hi_lim:20k_30k total_rev_hi_lim:30k_40k total_rev_hi_lim:40k_50k total_rev_hi_lim:50k_65k total_rev_hi_lim:65k_80k total_rev_hi_lim:>80k installment_factor installment:0_127 installment:127_210 installment:210_266 installment:266_294 installment:294_322 installment:322_378 installment:378_406 installment:406_433 installment:433_517 installment:517_545 installment:545_629 installment:629_712 installment:712_796 installment:796_852 installment:852_880 installment:880_935 installment:935_963 installment:>=963 annual_inc_factor annual_inc:<20k annual_inc:20k_30k annual_inc:30k_40k annual_inc:40k_50k annual_inc:50k_60k annual_inc:60k_70k annual_inc:70k_80k annual_inc:80k_90k annual_inc:90k_100k annual_inc:100k_120k annual_inc:120k_140k annual_inc:>140k mths_since_last_delinq:missing mths_since_last_delinq:0_3 mths_since_last_delinq:4_30 mths_since_last_delinq:31_56 mths_since_last_delinq:>=57 dti:0_0.7 dti:0.7_1.4 dti:1.4_2.1 dti:2.1_3.5 dti:3.5_4.9 dti:4.9_5.6 dti:5.6_6.3 dti:6.3_7 dti:7_9 dti:9_9.8 dti:9.8_10.5 dti:10.5_11.2 dti:11.2_11.9 dti:11.9_12.6 dti:12.6_13.3 dti:13.3_14 dti:14_14.7 dti:14.7_17.5 dti:17.5_18.2 dti:18.2_21 dti:21_21.7 dti:21.7_22.4 dti:22.4_23.1 dti:23.1_25.2 dti:25.2_25.9 dti:25.9_29.4 dti:29.4_30.1 dti:30.1_30.8 dti:30.8_31.5 dti:31.5_32.2 dti:32.2_32.9 dti:32.9_33.6 dti:>=33.6 mths_since_last_record:missing mths_since_last_record:0_2 mths_since_last_record:3_22 mths_since_last_record:23_46 mths_since_last_record:47_68 mths_since_last_record:69_85 mths_since_last_record:>=86
193817 193817 1581078 1850197 5500 5500 5500.0 36 months 10.16 177.889999 B B1 UPS 5 years RENT 18572.849609 Verified Oct-12 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... NaN debt_consolidation Debt consolidation 906xx CA 13.370000 0.0 Apr-07 1.0 NaN NaN 5.0 0.0 4892 51.0 6.0 f 0.0 0.0 6402.621094 6402.620117 5500.0 902.619995 0.0 0.0 0.0 Oct-15 181.750000 NaN Jan-16 0.0 NaN 1 INDIVIDUAL 0.0 0.0 6632.0 9600.0 2012-10-01 2007-04-01 2015-10-01 2016-01-01 90.0 156.0 54.0 51.0 36 5.0 0.296131 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 (89.2, 91.0] 0 0 0 0 1 0 0 (9.961, 10.374] 0 1 0 0 0 (5330.0, 6020.0] (135.4, 159.38] 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 (4.68, 7.02] 1 0 0 0 0 1 0 (-1090.7, 10907.0] 0 1 0 0 0 0 0 0 0 (155.132, 182.976] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 (-5444.395, 76443.95] 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
174039 174039 3055729 3728105 6000 6000 6000.0 36 months 15.80 210.360001 C C3 peshastin market < 1 year OWN 20000.000000 Not Verified Jan-13 Fully Paid n https://www.lendingclub.com/browse/loanDetail.... NaN debt_consolidation credit card pay off 988xx WA 19.559999 1.0 Nov-95 0.0 22.0 NaN 7.0 0.0 5291 72.5 16.0 f 0.0 0.0 7468.176270 7468.180176 6000.0 1468.180054 0.0 0.0 0.0 Apr-15 2004.670044 NaN May-15 0.0 29.0 1 INDIVIDUAL 0.0 0.0 11685.0 7300.0 2013-01-01 1995-11-01 2015-04-01 2015-05-01 87.0 293.0 60.0 59.0 36 1.0 0.300000 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 (85.6, 87.4] 0 0 0 0 1 0 0 (15.74, 16.153] 0 0 0 1 0 (5330.0, 6020.0] (279.28, 303.26] 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 (14.04, 16.38] 0 1 0 0 0 1 0 (-1090.7, 10907.0] 0 1 0 0 0 0 0 0 0 (182.976, 210.821] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 (-5444.395, 76443.95] 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
In [397]:
# 1. after saving, comment these two lines
# 2. go up in notebook where we have defined X_prep
#    and make it test and run all the cells below that.

X_prep.to_csv(dat_pro + 'Xtrain.csv')
y_prep.to_csv(dat_pro + 'ytrain.csv')
In [398]:
# after running this, comment it and uncomment train part
# we always create from train, not from test.

# X_prep.to_csv(dat_pro + 'Xtest.csv')
# y_prep.to_csv(dat_pro + 'ytest.csv')