import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# random state
SEED=100
np.random.seed(SEED)
[(x.__name__,x.__version__) for x in [np,pd,sns]]
pd.options.display.max_columns = None
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import datetime
import functools
from sklearn.model_selection import train_test_split
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
[(x.__name__,x.__version__) for x in [plotly]]
!ls ../data/raw
!du -sh ../data/raw/loan_data_2007_2014.csv
dat_raw = '../data/raw/'
dat_pro = '../data/processed/'
df = pd.read_csv(dat_raw + 'loan_data_2007_2014.csv',low_memory=False)
print(df.shape)
pd.concat([df.head(2), df.tail(2)])
df_copy = df.copy(deep=True)
# df = df_copy.copy()
col_floats = df.select_dtypes(np.float64).columns
for col in col_floats:
df[col] = df[col].astype(np.float32)
col_ints = df.select_dtypes(np.int64).columns
for col in col_ints:
df[col] = df[col].astype(np.int32)
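# A quick hedged sanity check of the downcast: no 64-bit numeric columns
# should remain, and memory_usage(deep=True) should show roughly half the
# memory for the numeric block (exact savings depend on the column mix).
assert df.select_dtypes(np.float64).shape[1] == 0
assert df.select_dtypes(np.int64).shape[1] == 0
print(f'{df.memory_usage(deep=True).sum() / 1e6:,.1f} MB after downcast')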
df.select_dtypes('object').head(1).T
cols_date = ['issue_d',
'earliest_cr_line',
'last_pymnt_d',
'last_credit_pull_d']
for col in cols_date:
df[col+'_date'] = pd.to_datetime(df[col],
format='%b-%y',
errors='coerce')
df[cols_date].dtypes
df[cols_date].head(2)
today = datetime.datetime.today()
today
today = datetime.datetime(2020, 3, 30)  # freeze the reference date for reproducibility
today
df['earliest_cr_line_date'].head(2)
df['earliest_cr_line_date'].isnull().sum()
for col in cols_date:
df['mths_since_'+col] = \
round((today - df[col+'_date']) /
np.timedelta64(1,'M'))
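# Sanity check of the month arithmetic on one assumed date: dividing a
# timedelta by np.timedelta64(1,'M') uses an average month length, so the
# result is approximate. 2015-01-01 to 2020-03-30 is about 63 months.
round((today - pd.Timestamp('2015-01-01')) / np.timedelta64(1, 'M'))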
cols_months_since = ['mths_since_' + i for i in cols_date]
df[cols_months_since].describe().round(2).T
df.select_dtypes('object').nunique().sort_values()
cols_cat_small = df.select_dtypes('object').nunique()[lambda x: x<12].index
for col in cols_cat_small:
print(col)
print(df[col].unique())
print()
df['term'].value_counts(dropna=False)
df['term_int'] = df['term'].str.extract(r'(\d+)').astype(int)
df['term_int'].unique()
df['emp_length'].value_counts(dropna=False)
df['emp_length_int'] = df['emp_length']\
    .str.replace('< 1 year','0',regex=False)\
    .str.extract(r'(\d+)').astype(np.float32)
df['emp_length_int'].value_counts(dropna=False)
df['loan_income_ratio']= df['loan_amnt']/df['annual_inc']
df.iloc[:2,-2:]
df['loan_status'].value_counts()
df['loan_status'].unique()
bad_statuses = ['Charged Off',
                'Default',
                'Does not meet the credit policy. Status:Charged Off',
                'Late (31-120 days)']
df['good_bad'] = np.where(df['loan_status'].isin(bad_statuses),0,1)
df['good_bad'].value_counts()
pd.options.display.max_rows = 200
df.head(2)
missing_cols = df.isnull().sum()[lambda x: x>0].sort_values()\
.index.tolist()
df[missing_cols].head(2)
# df[missing_cols].isnull().sum()
df.shape
# drop columns that are entirely NaN
df = df.dropna(how='all',axis=1)
df.isnull().sum()[lambda x: x>0]
missing_cols = df.isnull().sum()[lambda x: x>0].sort_values()\
.index.tolist()
df[missing_cols].head(2)
columns = ['feature','n_missing','top_value',
'proportion','nunique']
df_missing = pd.DataFrame(columns=columns)
for col in missing_cols:
x = df[col].value_counts(normalize=True).round(2)
unq = df[col].nunique()
n_missing = df[col].isnull().sum()
row = [col,n_missing, x.index[0],x.iloc[0],unq]
df_missing.loc[len(df_missing)] = row
df_missing = df_missing.set_index('feature')
df_missing
df['annual_inc'].isna().sum()
df['annual_inc'] = df['annual_inc'].fillna(df['annual_inc'].mean())
df['total_rev_hi_lim'] = df['total_rev_hi_lim'].fillna(df['funded_amnt'])
cols_zero = ['mths_since_earliest_cr_line','acc_now_delinq',
'total_acc','pub_rec','open_acc','inq_last_6mths',
'delinq_2yrs','emp_length_int']
for col in cols_zero:
df[col] = df[col].fillna(0)
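# hedged check: the imputed columns should now be free of NaNs
df[['annual_inc', 'total_rev_hi_lim'] + cols_zero].isnull().sum().sum()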
df.select_dtypes('object').nunique().sort_values()
pd.get_dummies(df['grade'],prefix='grade',prefix_sep=':').head(2)
cols_dummy = ['grade','sub_grade','home_ownership',
'verification_status','loan_status',
'purpose','addr_state','initial_list_status']
df_dummies = [pd.get_dummies(df[col],
prefix=col,prefix_sep = ':')
for col in cols_dummy]
df_dummies = pd.concat(df_dummies,axis=1)
print(df_dummies.shape)
df_dummies.head(2)
df = pd.concat([df,df_dummies],axis=1)
print(df.shape)
df.head(2)
df.isnull().sum().sum()
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
df.drop('good_bad', axis = 1),
df['good_bad'],
train_size=0.8,
random_state=SEED
)
df_Xtrain.shape, df_Xtest.shape, ser_ytrain.shape, ser_ytest.shape
X_prep = df_Xtrain
y_prep = ser_ytrain
# In the second round, we will comment out the train part and
# uncomment this test part.
#
# Note: after doing the test part, comment the test part back out
# and keep the train part uncommented, since we need to
# create features based on train, not test.
#
# X_prep = df_Xtest
# y_prep = ser_ytest
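# A hedged alternative to this comment/uncomment dance: a single PHASE
# flag (hypothetical, not part of the original workflow) that selects
# the split in one place.
# PHASE = 'train'   # flip to 'test' for the second pass
# X_prep, y_prep = ((df_Xtrain, ser_ytrain) if PHASE == 'train'
#                   else (df_Xtest, ser_ytest))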
def get_woe_iv(df1,
ser_target,
feature_cat,
target_name='good_bad',
sort='woe',
return_all=False):
"""Calculate Weight of Evidence and Infomation Value.
Parameters
-----------
df1: pandas.DataFrame
Input dataframe having categorical feature.
    ser_target: pandas.Series
        Pandas Series of binary target values, e.g. y_prep.
feature_cat: str
Name of categorical column
target_name: str
Name of target column.
sort: str
Sort the output dataframe by this column.
Default is 'woe'. We use 'woe' for discrete
columns.
If the column values are continuous, then
use the actual column name for sorting.
    return_all: boolean, optional
        Whether or not to return all columns.
Returns:
--------
Returns dataframe with WOE, IV and other quantities.
    Example:
    --------
    df_woe = get_woe_iv(X_prep, y_prep, 'grade')
    """
df1 = pd.DataFrame({
feature_cat: df1[feature_cat],
target_name: ser_target
})
df1 = df1.groupby(feature_cat).agg(
n_obs=(target_name,'count'),
prop_good=(target_name,'mean'))
df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()
df1['n_good'] = df1['prop_good'] * df1['n_obs']
df1['n_bad'] = (1-df1['prop_good']) * df1['n_obs']
df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()
df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()
df1['woe'] = np.log(df1['prop_n_good']/df1['prop_n_bad'])
df1['diff_prop_good'] = df1['prop_good'].diff().abs()
df1['diff_woe'] = df1['woe'].diff().abs()
df1['diff_n_obs'] = df1['n_obs'].diff(-1).abs()
df1['iv'] = ((df1['prop_n_good']-df1['prop_n_bad']
) * df1['woe']
).sum()
    # reorder columns
    if return_all:
        df1 = df1[['n_obs', 'woe', 'diff_n_obs',
                   'prop_n_obs','iv',
                   'n_good', 'n_bad',
                   'prop_n_good','prop_n_bad',
                   'prop_good',
                   'diff_prop_good', 'diff_woe' ]]
    else:
        df1 = df1[['n_obs', 'woe', 'diff_n_obs' ]]
# add one more column
n_obs1 = df1['n_obs'].apply(lambda x: "{:,d}".format(x)).astype(str)
df1['text_plotly'] = (
'n_obs: ' + n_obs1 + '<br>' +
'WoE: ' + df1['woe'].round(4).astype(str) + '<br>'
)
# sort values
df1 = df1.sort_values(sort)
df1 = df1.reset_index()
return df1
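# Minimal sketch verifying the WoE formula on assumed toy data (not the
# loan file): category 'a' holds 3 of the 4 goods and 1 of the 2 bads,
# so woe_a = ln((3/4)/(1/2)) ~ 0.4055, and woe_b = ln((1/4)/(1/2)) ~ -0.6931.
toy_X = pd.DataFrame({'cat': list('aaaabb')})
toy_y = pd.Series([1, 1, 1, 0, 1, 0])
get_woe_iv(toy_X, toy_y, 'cat')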
df_woe = get_woe_iv(X_prep,y_prep,'grade')
df_woe.head().style\
.background_gradient(subset=['n_obs'],cmap='Blues')\
.background_gradient(subset=['woe'],cmap='Reds')\
.format({'n_obs':"{:,d}", 'diff_n_obs':"{:,.0f}"})
def plot_woe(df_woe,xrot=0,figsize=(18,12),fontsize=20,color='red'):
x = df_woe.iloc[:,0].to_numpy().astype(str)
y = df_woe['woe']
fig,ax = plt.subplots(1,1,figsize=figsize)
plt.plot(x,y,marker='o',linestyle='--',color='k')
plt.xlabel(df_woe.columns[0],fontsize=fontsize)
plt.ylabel('Weight of Evidence',fontsize=fontsize)
plt.xticks(rotation=xrot,fontsize=fontsize,color=color)
plt.yticks(fontsize=fontsize)
plt.grid(color='k',linestyle='-.')
    for i,txt in enumerate(df_woe['n_obs']):
        txt = ' {:,.0f}'.format(txt)
        # use positional indexing: label-based y[i] fails when df_woe
        # has been filtered and its index has gaps
        plt.annotate(txt, (x[i], y.iloc[i]),
                     rotation=90,fontsize=18,color='blue')
def plot_woe_plotly(df_woe):
col = df_woe.columns.tolist()[0]
x = df_woe.iloc[:,0].astype(str)
y = df_woe['woe']
z = df_woe['text_plotly']
data = go.Scatter(x=x,y=y,text=z,mode='lines+markers')
fig = go.Figure(data=data)
fig.update_layout(
title=f"WoE Curve for **{col}**",
xaxis_title=f"{col}",
yaxis_title="Weight of Evidence",
)
# also display dataframe
df_style = df_woe.style\
.background_gradient(subset=['n_obs'],
cmap='Blues')\
.background_gradient(subset=['woe'],
cmap='Reds')\
.format({'n_obs':"{:,d}",
'diff_n_obs':"{:,.0f}"})
display(df_style)
fig.show()
def create_bins_dummies_cont(X_prep,col,bins):
"""Creates new dummy columns in-place in given dataframe.
For this continuous bins,
left numbers are inclusive.
Example:
========
bins = [0,9.961,12.025, 15.74, 20.281]
col = 'int_rate'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
Outputs:
========
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281
"""
nbins = len(bins)
ser = X_prep[col]
for i in range(nbins):
if i < (nbins-1):
a,b = bins[i], bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<b)
X_prep[name] = np.where(cond,1,0)
elif i == (nbins-1):
b = bins[i]
name = col + ':>=' + str(b)
print(name)
# condition
cond = ser >= b
X_prep[name] = np.where(cond,1,0)
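# Usage note (hedged): because the intervals are contiguous and the last
# dummy is open-ended, every row should land in exactly one bin, provided
# bins[0] is at or below the series minimum. A check one can run after
# calling create_bins_dummies_cont:
# cols = [c for c in X_prep.columns if c.startswith(col + ':')]
# assert (X_prep[cols].sum(axis=1) == 1).all()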
def create_bins_dummies_int(X_prep,col,bins):
"""Creates new dummy columns in-place in given dataframe.
For this integer bins,
both left and right numbers are included.
Example:
---------
col = 'mths_since_last_delinq'
bins = [0,3,30,56]
    create_bins_dummies_int(X_prep,col,bins)
Outputs:
--------
mths_since_last_delinq:0_3
mths_since_last_delinq:4_30
mths_since_last_delinq:31_56
mths_since_last_delinq:>=57
"""
nbins = len(bins)
ser = X_prep[col]
# assert last number of bin is integer
assert isinstance(bins[-1],int)
for i in range(nbins):
if i == 0:
a,b = bins[i], bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<=b)
X_prep[name] = np.where(cond,1,0)
elif i < (nbins-1):
a,b = bins[i]+1, bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<=b)
X_prep[name] = np.where(cond,1,0)
elif i == (nbins-1):
b = bins[i]+1
name = col + ':>=' + str(b)
print(name)
# condition
cond = ser >= b
X_prep[name] = np.where(cond,1,0)
def create_dummies_left_inclusive(X_prep,col,bins,labels):
    """Creates new dummy columns in-place in given dataframe.
    Needs one more label than bins: the first label catches
    values below bins[0], the middle labels are left-inclusive
    intervals [bins[i-1], bins[i]), and the last label catches
    values >= bins[-1], so boundary values are never dropped
    or double counted.
    """
    nlabels = len(labels)
    ser = X_prep[col]
    for i in range(nlabels):
        name = col + ':' + labels[i]
        if i == 0:
            cond = (ser < bins[i])
        elif i < (nlabels-1):
            cond = (ser >= bins[i-1]) & (ser < bins[i])
        else:
            cond = (ser >= bins[i-1])
        X_prep[name] = np.where(cond,1,0)
        print(name)
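# Minimal demo on assumed toy data: with bins [10, 20] and labels
# ['<10', '10_20', '>=20'], the value 10 should land only in '10_20'
# and 20 only in '>=20'.
toy = pd.DataFrame({'v': [5, 10, 15, 20, 25]})
create_dummies_left_inclusive(toy, 'v', [10, 20], ['<10', '10_20', '>=20'])
toy.filter(like='v:')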
col = 'grade'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
df[col].value_counts().sort_index()
"""
as we expect, lower the grade, higher is the probabilty of default.
here, we will not group grades into grade:G_F and so on.
we already have one hot encoding of grade, so we are fine.
grade:A
grade:B
grade:C
grade:D
grade:E
grade:F
""";
col = 'home_ownership'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
cols = [i for i in X_prep.columns if 'home_ownership' in i]
X_prep[cols].head(2)
df_woe = get_woe_iv(X_prep,y_prep,'home_ownership')
plot_woe_plotly(df_woe)
"""
home_ownership:RENT_OTHER_NONE_ANY
home_ownership:OWN
home_ownership:MORTGAGE
""";
X_prep['home_ownership:RENT_OTHER_NONE_ANY'] = \
sum([X_prep['home_ownership:RENT'],
X_prep['home_ownership:OTHER'], X_prep['home_ownership:NONE'],
X_prep['home_ownership:ANY']])
col = 'addr_state'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
cols = [i for i in X_prep.columns if 'addr_state' in i]
X_prep[cols].head(2)
X_prep['addr_state'].unique()
train_state_names = ['GA', 'PA', 'WI', 'CA', 'FL', 'CT', 'TX', 'MA', 'OR', 'HI', 'MO',
'MS', 'NY', 'SC', 'WV', 'IL', 'NJ', 'AZ', 'MN', 'CO', 'TN', 'AL',
'LA', 'OH', 'VA', 'NV', 'WA', 'NC', 'WY', 'OK', 'MI', 'DC', 'NM',
'AK', 'MD', 'AR', 'KS', 'IN', 'UT', 'DE', 'NH', 'KY', 'RI', 'MT',
'VT', 'SD', 'NE', 'ID', 'ME', 'IA']
X_prep['addr_state'].nunique()
"""
We have 50 states and one DC, so, we should have 51 values.
one state is missing. and the missing state is ND.
""";
state_names51 = ['AK','AL','AR','AZ','CA','CO','CT',
'DC',
'DE','FL','GA','HI','IA','ID','IL',
'IN','KS','KY','LA','MA','MD','ME',
'MI','MN','MO','MS','MT','NC', 'ND',
'NE','NH','NJ','NM','NV','NY','OH',
'OK','OR','PA','RI','SC','SD','TN',
'TX','UT','VA','VT','WA','WI','WV',
'WY']
len(state_names51)
missing_states = [i for i in state_names51 if i not in
train_state_names]
missing_states
for col in missing_states:
    name = 'addr_state:' + col
    if name not in X_prep.columns:
        X_prep[name] = 0
# df_woe = get_woe_iv(X_prep,y_prep,'addr_state')
# plot_woe_plotly(df_woe)
"""
** do not forget the missing states.
We create the following categories:
'ND' 'NE' 'IA' NV' 'FL' 'HI' 'AL'
'NM' 'VA'
'NY'
'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
'CA'
'UT' 'KY' 'AZ' 'NJ'
'AR' 'MI' 'PA' 'OH' 'MN'
'RI' 'MA' 'DE' 'SD' 'IN'
'GA' 'WA' 'OR'
'WI' 'MT'
'TX'
'IL' 'CT'
'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
'WV' 'NH' 'WY' 'DC' 'ME' 'ID'
'ND_NE_IA_NV_FL_HI_AL' will be the reference category.
""";
multiple = [
'ND_NE_IA_NV_FL_HI_AL',
'NM_VA',
'OK_TN_MO_LA_MD_NC',
'UT_KY_AZ_NJ',
'AR_MI_PA_OH_MN',
'RI_MA_DE_SD_IN',
'GA_WA_OR',
'WI_MT',
'IL_CT',
'KS_SC_CO_VT_AK_MS',
'WV_NH_WY_DC_ME_ID'
]
dummies = ['addr_state:'+i for i in multiple]
for d in dummies:
print(d)
"""
Store these dummies:
addr_state:ND_NE_IA_NV_FL_HI_AL
addr_state:NM_VA
addr_state:OK_TN_MO_LA_MD_NC
addr_state:UT_KY_AZ_NJ
addr_state:AR_MI_PA_OH_MN
addr_state:RI_MA_DE_SD_IN
addr_state:GA_WA_OR
addr_state:WI_MT
addr_state:IL_CT
addr_state:KS_SC_CO_VT_AK_MS
addr_state:WV_NH_WY_DC_ME_ID
""";
sep = '_'
one = 'ND_NE_IA_NV_FL_HI_AL'
lst_one = one.split(sep)
lst_one
colname = 'addr_state'
new_cat = colname + ':' + one
new_cat
lst = [X_prep[colname + ':' + i] for i in lst_one]
sum(lst).head(2)
colname = 'addr_state'
sep = '_'
for one in multiple:
lst_one = one.split(sep)
new_cat = colname + ':' + one
lst = [X_prep[colname + ':' + i] for i in lst_one]
X_prep[new_cat] = sum(lst)
X_prep.iloc[:2,-2:]
col = 'verification_status'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
cols = [i for i in X_prep.columns if 'verification_status' in i]
X_prep[cols].head(2)
for col in cols:
print(col)
"""
verification_status:Not Verified
verification_status:Source Verified
verification_status:Verified
""";
# all three categories have significant n_obs;
# no need to group them.
col = 'purpose'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
cols = [i for i in X_prep.columns if 'purpose' in i]
X_prep[cols].head(2)
# df_woe = get_woe_iv(X_prep,y_prep,'purpose')
# plot_woe_plotly(df_woe)
dummies = \
"""
small_business__educational__moving__renewable_energy
other
house__medical__wedding__vacation
debt_consolidation
home_improvement__major_purchase__car
credit_card
""".strip().split('\n')
dummies = ['purpose:'+ i for i in dummies]
for d in dummies:
print(d)
colname = 'purpose'
sep = '__'
multiple =[
'small_business__educational__moving__renewable_energy',
'house__medical__wedding__vacation',
'home_improvement__major_purchase__car'
]
for one in multiple:
lst_one = one.split(sep)
new_cat = colname + ':' + one
lst = [X_prep[colname + ':' + i] for i in lst_one]
X_prep[new_cat] = sum(lst)
X_prep.iloc[:2,-2:]
col = 'initial_list_status'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
cols = [i for i in X_prep.columns if 'initial_list_status' in i]
X_prep[cols].head(2)
# we already have binary encoding.
dummies = """
initial_list_status:f
initial_list_status:w
""";
col = 'term_int'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep['term:36'] = np.where((X_prep['term_int'] == 36), 1, 0)
X_prep['term:60'] = np.where((X_prep['term_int'] == 60), 1, 0)
dummies = \
"""
term:36
term:60
""";
col = 'emp_length_int'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe_plotly(df_woe)
"""
emp_length:0
emp_length:1
emp_length:2_3
emp_length:4_5
emp_length:6_7
emp_length:8_9
emp_length:10
""";
X_prep['emp_length:0'] = np.where(X_prep['emp_length_int'].isin([0]), 1, 0)
X_prep['emp_length:1'] = np.where(X_prep['emp_length_int'].isin([1]), 1, 0)
X_prep['emp_length:2_3'] = np.where(X_prep['emp_length_int'].isin([2,3]), 1, 0)
X_prep['emp_length:4_5'] = np.where(X_prep['emp_length_int'].isin([4,5]), 1, 0)
X_prep['emp_length:6_7'] = np.where(X_prep['emp_length_int'].isin([6,7]), 1, 0)
X_prep['emp_length:8_9'] = np.where(X_prep['emp_length_int'].isin([8,9]), 1, 0)
X_prep['emp_length:10'] = np.where(X_prep['emp_length_int'].isin([10]), 1, 0)
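# A compact, hedged equivalent of the seven assignments above
# (overwrites the same columns with the same values):
emp_groups = {'0': [0], '1': [1], '2_3': [2, 3], '4_5': [4, 5],
              '6_7': [6, 7], '8_9': [8, 9], '10': [10]}
for lab, vals in emp_groups.items():
    X_prep['emp_length:' + lab] = X_prep['emp_length_int'].isin(vals).astype(int)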
col = 'mths_since_issue_d'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
X_prep[colf].head(2)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
# plot_woe_plotly(df_woe)
bins = [0,66,70,75,82,93,128]
col = 'mths_since_issue_d'
create_bins_dummies_int(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
# after 93, there are very few observations;
# we can group them into one bucket or split them into two.
"""
mths_since_issue_d:0_66
mths_since_issue_d:67_70
mths_since_issue_d:71_75
mths_since_issue_d:76_82
mths_since_issue_d:83_93
mths_since_issue_d:94_128
mths_since_issue_d:>=129
""";
col = 'int_rate'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
X_prep[colf].head(2)
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
# candidate bins; the trailing 27 is unnecessary since the last
# bucket is already open-ended (>=20.281), so we drop it below:
# bins = [0, 9.961, 12.025, 15.74, 20.281, 27]
#
# the curve is monotonically decreasing, so it is easy to partition.
# the first few points zigzag; n_obs is low there, so bundle them up.
#
# look at the graph for where the woe drops sharply.
"""
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281
""";
bins = [0,9.961,12.025, 15.74, 20.281]
col = 'int_rate'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
col = 'funded_amnt'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
# woe is almost constant, i.e. funded_amnt barely separates
# good loans from bad ones, so it carries little information
# about the target and we can drop it from the analysis.
col = 'mths_since_earliest_cr_line'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
bins = [0,207, 350]
col = 'mths_since_earliest_cr_line'
create_bins_dummies_int(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
"""
Ignore first few low numbers,
we see upward trend
mths_since_earliest_cr_line:0_207
mths_since_earliest_cr_line:208_350
mths_since_earliest_cr_line:>=351
""";
col = 'delinq_2yrs'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
# no factor (binned) column needed here; the values are already discrete.
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
"""
we might be attemped to ignore the feature
but this might be useful feature
we see lots of zeros, make it one feature.
then make 1-3 and greater than 3.
delinq_2yrs:0
delinq_2yrs:1_3
delinq_2yrs:>=3
""";
X_prep['delinq_2yrs:0'] = \
np.where((X_prep['delinq_2yrs'] ==0), 1, 0)
X_prep['delinq_2yrs:1_3'] = \
np.where((X_prep['delinq_2yrs'] >= 1) &
(X_prep['delinq_2yrs'] <= 3), 1, 0)
X_prep['delinq_2yrs:>3'] = \
np.where((X_prep['delinq_2yrs'] > 3), 1, 0)
col = 'inq_last_6mths'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
"""
we see lots of zeros, make it one feature.
then make 1-3 and greater than 3.
inq_last_6mths:0
inq_last_6mths:1
inq_last_6mths:2_3
inq_last_6mths:>=3
""";
X_prep['inq_last_6mths:0'] = \
np.where((X_prep['inq_last_6mths'] ==0), 1, 0)
X_prep['inq_last_6mths:1'] = \
np.where((X_prep['inq_last_6mths'] ==1), 1, 0)
X_prep['inq_last_6mths:2_3'] = \
np.where((X_prep['inq_last_6mths'] > 1) &
(X_prep['inq_last_6mths'] <= 3), 1, 0)
X_prep['inq_last_6mths:>3'] = \
np.where((X_prep['inq_last_6mths'] > 3), 1, 0)
col = 'open_acc'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
X_prep[col].value_counts().sort_index()[lambda x: x>600]
X_prep[col+':<=3'] = \
np.where((X_prep[col] <=3), 1, 0)
X_prep[col+':4_5'] = \
np.where((X_prep[col] > 3) &
(X_prep[col] <= 5), 1, 0)
X_prep[col+':6_13'] = \
np.where((X_prep[col] > 5) &
(X_prep[col] <= 13), 1, 0)
X_prep[col+':14_17'] = \
np.where((X_prep[col] > 13) &
(X_prep[col] <= 17), 1, 0)
X_prep[col+':>17'] = \
np.where((X_prep[col] > 17), 1, 0)
"""
There are many small number and woe curve is flat.
look at n_obs and group them.
open_acc:<=3
open_acc:4_5
open_acc:6_13
open_acc:13_17
open_acc:>17
""";
# df_woe
col = 'pub_rec'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
X_prep[col].value_counts().sort_index()
X_prep[col+':0'] = \
np.where((X_prep[col] ==0), 1, 0)
X_prep[col+':1'] = \
np.where((X_prep[col] ==1), 1, 0)
X_prep[col+':2'] = \
np.where((X_prep[col] ==2), 1, 0)
X_prep[col+':3'] = \
np.where((X_prep[col] ==3), 1, 0)
X_prep[col+':>3'] = \
np.where((X_prep[col] > 3), 1, 0)
"""
pub_rec:0
pub_rec:1
pub_rec:2
pub_rec:3
pub_rec:>3
""";
col = 'total_acc'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()[lambda x: x>9000]
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
bins = [0,6.24,21.84,37.44,40.56]
col = 'total_acc'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
"""
total_acc:0_6.24
total_acc:6.24_21.84
total_acc:21.84_37.44
total_acc:37.44_40.56
total_acc:>=40.56
""";
col = 'acc_now_delinq'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
X_prep[col+':0'] = \
np.where((X_prep[col] ==0), 1, 0)
X_prep[col+':>=1'] = \
    np.where((X_prep[col] >= 1), 1, 0)
"""
acc_now_delinq:0
acc_now_delinq:>=1
""";
col = 'total_rev_hi_lim'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 100)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
df_woe = df_woe[df_woe.n_obs > 1000]
df_woe
plot_woe(df_woe,xrot=90,color='k')
"""
total_rev_hi_lim:<=5k
total_rev_hi_lim:5k_10k
total_rev_hi_lim:10k_20k
total_rev_hi_lim:20k_30k
total_rev_hi_lim:30k_40k
total_rev_hi_lim:40k_50k
total_rev_hi_lim:50k_65k
total_rev_hi_lim:65k_80k
total_rev_hi_lim:>80k
""";
X_prep['total_rev_hi_lim:<=5k'] = \
np.where((X_prep['total_rev_hi_lim'] <= 5000), 1, 0)
X_prep['total_rev_hi_lim:5k_10k'] = \
np.where((X_prep['total_rev_hi_lim'] > 5000) &
(X_prep['total_rev_hi_lim'] <= 10000), 1, 0)
X_prep['total_rev_hi_lim:10k_20k'] = \
np.where((X_prep['total_rev_hi_lim'] > 10000) &
(X_prep['total_rev_hi_lim'] <= 20000), 1, 0)
X_prep['total_rev_hi_lim:20k_30k'] = \
np.where((X_prep['total_rev_hi_lim'] > 20000) &
(X_prep['total_rev_hi_lim'] <= 30000), 1, 0)
X_prep['total_rev_hi_lim:30k_40k'] = \
np.where((X_prep['total_rev_hi_lim'] > 30000) &
(X_prep['total_rev_hi_lim'] <= 40000), 1, 0)
X_prep['total_rev_hi_lim:40k_50k'] = \
np.where((X_prep['total_rev_hi_lim'] > 40000) &
(X_prep['total_rev_hi_lim'] <= 50000), 1, 0)
X_prep['total_rev_hi_lim:50k_65k'] = \
np.where((X_prep['total_rev_hi_lim'] > 50000) &
(X_prep['total_rev_hi_lim'] <= 65000), 1, 0)
X_prep['total_rev_hi_lim:65k_80k'] = \
np.where((X_prep['total_rev_hi_lim'] > 65000) &
(X_prep['total_rev_hi_lim'] <= 80000), 1, 0)
X_prep['total_rev_hi_lim:>80k'] = np.where((X_prep['total_rev_hi_lim'] > 80000), 1, 0)
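# The repeated np.where banding above can also be generated with pd.cut;
# a hedged sketch (pd.cut's default right-closed intervals match the
# > a & <= b conditions used above):
edges = [-np.inf, 5000, 10000, 20000, 30000, 40000,
         50000, 65000, 80000, np.inf]
labels = ['<=5k', '5k_10k', '10k_20k', '20k_30k', '30k_40k',
          '40k_50k', '50k_65k', '65k_80k', '>80k']
banded = pd.cut(X_prep['total_rev_hi_lim'], bins=edges, labels=labels)
pd.get_dummies(banded, prefix='total_rev_hi_lim', prefix_sep=':').head(2)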
col = 'installment'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
df_woe = df_woe[df_woe.n_obs > 500]
df_woe.style\
.background_gradient(subset=['n_obs'],cmap='Blues')\
.background_gradient(subset=['woe'],cmap='Reds')\
.format({'n_obs':"{:,d}", 'diff_n_obs':"{:,.0f}"})
plot_woe(df_woe,xrot=90,color='k')
bins = [0, 127,210,266,294,322,378,406,433,517,545,
629,712,796,852,880,935,963]
col = 'installment'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
"""
installment:0_127
installment:127_210
installment:210_266
installment:266_294
installment:294_322
installment:322_378
installment:378_406
installment:406_433
installment:433_517
installment:517_545
installment:545_629
installment:629_712
installment:712_796
installment:796_852
installment:852_880
installment:880_935
installment:935_963
installment:>=963
""";
col = 'annual_inc'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].describe()
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
# too many observations fall in the first category; increase the bins to 100
X_prep[colf] = pd.cut(X_prep[col], 100)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
"""
Looking at factors of annual income, we can make two
groups with 150k (or 140k) and analyze the group
earning more than 140k separately.
""";
X_prep_tmp = X_prep[X_prep[col] <= 140000].copy()  # copy to avoid SettingWithCopyWarning
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
y_prep_tmp = y_prep[X_prep_tmp.index]
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
X_prep_tmp[col].hist()
plot_woe(df_woe,xrot=90,color='k')
"""
WoE is monotonically increasing.
We can break into 10 equal parts.
""";
col = 'annual_inc'
bins = [20_000,30_000,40_000,50_000,
60_000,70_000,80_000,90_000,
100_000,120_000,140_000]
# we need one more label than bins
labels = ['<20k',
'20k_30k','30k_40k','40k_50k',
'50k_60k','60k_70k','70k_80k',
'80k_90k','90k_100k','100k_120k','120k_140k',
'>140k']
create_dummies_left_inclusive(X_prep,col,bins,labels)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
col = 'mths_since_last_delinq'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].hist()
# create a missing-value indicator column
vals = np.where((X_prep[col].isnull()), 1, 0)
X_prep[col+':missing'] = vals
print(col+':missing')
"""
Do not forget missing:
mths_since_last_delinq:missing
mths_since_last_delinq:0_3
mths_since_last_delinq:4_30
mths_since_last_delinq:31_56
mths_since_last_delinq:>=57
""";
X_prep_tmp = X_prep[pd.notnull(X_prep[col])].copy()
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
plot_woe(df_woe,xrot=90,color='k')
# months are integers, so make the intervals integer.
# NOTE: do not forget the missing category created above.
col = 'mths_since_last_delinq'
bins = [0,3,30,56]
create_bins_dummies_int(X_prep,col,bins)
cols = [i for i in X_prep.columns if 'delinq' in i]
X_prep[cols].head(2)
col = 'dti'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].hist()
# we see very few observations after 35; we make them one group.
# we can create a temp df restricted to 0-35 and bin it into
# 50 parts.
X_prep_tmp = X_prep[X_prep[col]<35].copy()
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
plot_woe(df_woe,xrot=90,color='k')
bins = [0,0.7,1.4,2.1,3.5,4.9,5.6,6.3,7,9,9.8,10.5,
11.2,11.9,12.6,13.3,14,14.7,17.5,18.2,21,21.7,
22.4,23.1,25.2,25.9,29.4,30.1,30.8,31.5,32.2,32.9,
33.6]
create_bins_dummies_cont(X_prep,col,bins)
col = 'mths_since_last_record'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].hist()
# create a missing-value indicator column
vals = np.where((X_prep[col].isnull()), 1, 0)
X_prep[col+':missing'] = vals
print(col+':missing')
"""
mths_since_last_record:missing
mths_since_last_record:0_2
mths_since_last_record:3_22
mths_since_last_record:23_46
mths_since_last_record:47_68
mths_since_last_record:69_85
mths_since_last_record:>=86
""";
X_prep_tmp = X_prep[pd.notnull(X_prep[col])].copy()
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
plot_woe(df_woe,xrot=90,color='k')
# look for large drops in the woe curve and create groups there
bins = [0,2,22,46,68,85]
create_bins_dummies_int(X_prep,col,bins)
print(X_prep.shape)
X_prep.head(2)
# 1. after saving, comment out these two lines.
# 2. go up in the notebook to where X_prep is defined,
#    switch it to the test split, and run all the cells below that.
X_prep.to_csv(dat_pro + 'Xtrain.csv')
y_prep.to_csv(dat_pro + 'ytrain.csv')
# after running this, comment it out and uncomment the train part.
# we always create features from train, never from test.
# X_prep.to_csv(dat_pro + 'Xtest.csv')
# y_prep.to_csv(dat_pro + 'ytest.csv')
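# Hedged read-back sketch for the modelling notebook (paths as above;
# index_col=0 restores the saved row index):
# Xtr = pd.read_csv(dat_pro + 'Xtrain.csv', index_col=0, low_memory=False)
# ytr = pd.read_csv(dat_pro + 'ytrain.csv', index_col=0).squeeze('columns')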