import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# random state
SEED=100
np.random.seed(SEED)
[(x.__name__,x.__version__) for x in [np,pd,sns]]
pd.options.display.max_columns = None
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import datetime
import functools
from sklearn.model_selection import train_test_split
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
[(x.__name__,x.__version__) for x in [plotly]]
!ls ../data/raw
!du -sh ../data/raw/loan_data_2007_2014.csv
dat_raw = '../data/raw/'
dat_pro = '../data/processed/'
df = pd.read_csv(dat_raw + 'loan_data_2007_2014.csv',low_memory=False)
print(df.shape)
pd.concat([df.head(2), df.tail(2)])
df_copy = df.copy(deep=True)
# df = df_copy.copy()
col_floats = df.select_dtypes(np.float64).columns
for col in col_floats:
df[col] = df[col].astype(np.float32)
col_ints = df.select_dtypes(np.int64).columns
for col in col_ints:
df[col] = df[col].astype(np.int32)
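# A quick hedged sanity check of the downcast: no 64-bit numeric columns
# should remain, and memory_usage(deep=True) should show roughly half the
# memory for the numeric block (exact savings depend on the column mix).
assert df.select_dtypes(np.float64).shape[1] == 0
assert df.select_dtypes(np.int64).shape[1] == 0
print(f'{df.memory_usage(deep=True).sum() / 1e6:,.1f} MB after downcast')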
df.select_dtypes('object').head(1).T
cols_date = ['issue_d',
'earliest_cr_line',
'last_pymnt_d',
'last_credit_pull_d']
for col in cols_date:
df[col+'_date'] = pd.to_datetime(df[col],
format='%b-%y',
errors='coerce')
df[cols_date].dtypes
df[cols_date].head(2)
today = datetime.datetime.today()
today
today = datetime.datetime(2020, 3, 30)  # freeze the reference date for reproducibility
today
df['earliest_cr_line_date'].head(2)
df['earliest_cr_line_date'].isnull().sum()
for col in cols_date:
df['mths_since_'+col] = \
round((today - df[col+'_date']) /
np.timedelta64(1,'M'))
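# Sanity check of the month arithmetic on one assumed date: dividing a
# timedelta by np.timedelta64(1,'M') uses an average month length, so the
# result is approximate. 2015-01-01 to 2020-03-30 is about 63 months.
round((today - pd.Timestamp('2015-01-01')) / np.timedelta64(1, 'M'))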
cols_months_since = ['mths_since_' + i for i in cols_date]
df[cols_months_since].describe().round(2).T
df.select_dtypes('object').nunique().sort_values()
cols_cat_small = df.select_dtypes('object').nunique()[lambda x: x<12].index
for col in cols_cat_small:
print(col)
print(df[col].unique())
print()
df['term'].value_counts(dropna=False)
df['term_int'] = df['term'].str.extract(r'(\d+)').astype(int)
df['term_int'].unique()
df['emp_length'].value_counts(dropna=False)
df['emp_length_int'] = df['emp_length']\
    .str.replace('< 1 year','0',regex=False)\
    .str.extract(r'(\d+)').astype(np.float32)
df['emp_length_int'].value_counts(dropna=False)
df['loan_income_ratio']= df['loan_amnt']/df['annual_inc']
df.iloc[:2,-2:]
df['loan_status'].value_counts()
df['loan_status'].unique()
bad_statuses = ['Charged Off',
                'Default',
                'Does not meet the credit policy. Status:Charged Off',
                'Late (31-120 days)']
df['good_bad'] = np.where(df['loan_status'].isin(bad_statuses),0,1)
df['good_bad'].value_counts()
pd.options.display.max_rows = 200
df.head(2)
missing_cols = df.isnull().sum()[lambda x: x>0].sort_values()\
.index.tolist()
df[missing_cols].head(2)
# df[missing_cols].isnull().sum()
df.shape
# drop columns that are entirely NaN
df = df.dropna(how='all',axis=1)
df.isnull().sum()[lambda x: x>0]
missing_cols = df.isnull().sum()[lambda x: x>0].sort_values()\
.index.tolist()
df[missing_cols].head(2)
columns = ['feature','n_missing','top_value',
'proportion','nunique']
df_missing = pd.DataFrame(columns=columns)
for col in missing_cols:
x = df[col].value_counts(normalize=True).round(2)
unq = df[col].nunique()
n_missing = df[col].isnull().sum()
row = [col,n_missing, x.index[0],x.iloc[0],unq]
df_missing.loc[len(df_missing)] = row
df_missing = df_missing.set_index('feature')
df_missing
df['annual_inc'].isna().sum()
df['annual_inc'] = df['annual_inc'].fillna(df['annual_inc'].mean())
df['total_rev_hi_lim'] = df['total_rev_hi_lim'].fillna(df['funded_amnt'])
cols_zero = ['mths_since_earliest_cr_line','acc_now_delinq',
'total_acc','pub_rec','open_acc','inq_last_6mths',
'delinq_2yrs','emp_length_int']
for col in cols_zero:
df[col] = df[col].fillna(0)
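# hedged check: the imputed columns should now be free of NaNs
df[['annual_inc', 'total_rev_hi_lim'] + cols_zero].isnull().sum().sum()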
df.select_dtypes('object').nunique().sort_values()
pd.get_dummies(df['grade'],prefix='grade',prefix_sep=':').head(2)
cols_dummy = ['grade','sub_grade','home_ownership',
'verification_status','loan_status',
'purpose','addr_state','initial_list_status']
df_dummies = [pd.get_dummies(df[col],
prefix=col,prefix_sep = ':')
for col in cols_dummy]
df_dummies = pd.concat(df_dummies,axis=1)
print(df_dummies.shape)
df_dummies.head(2)
df = pd.concat([df,df_dummies],axis=1)
print(df.shape)
df.head(2)
df.isnull().sum().sum()
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
df.drop('good_bad', axis = 1),
df['good_bad'],
train_size=0.8,
random_state=SEED
)
df_Xtrain.shape, df_Xtest.shape, ser_ytrain.shape, ser_ytest.shape
X_prep = df_Xtrain
y_prep = ser_ytrain
# In the second round, we will comment out the train part and
# uncomment this test part.
#
# Note: after doing the test part, comment the test part back out
# and keep the train part uncommented, since we need to
# create features based on train, not test.
#
# X_prep = df_Xtest
# y_prep = ser_ytest
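# A hedged alternative to this comment/uncomment dance: a single PHASE
# flag (hypothetical, not part of the original workflow) that selects
# the split in one place.
# PHASE = 'train'   # flip to 'test' for the second pass
# X_prep, y_prep = ((df_Xtrain, ser_ytrain) if PHASE == 'train'
#                   else (df_Xtest, ser_ytest))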
def get_woe_iv(df1,
ser_target,
feature_cat,
target_name='good_bad',
sort='woe',
return_all=False):
"""Calculate Weight of Evidence and Infomation Value.
Parameters
-----------
df1: pandas.DataFrame
Input dataframe having categorical feature.
    ser_target: pandas.Series
        Pandas Series of binary target values, e.g. y_prep.
feature_cat: str
Name of categorical column
target_name: str
Name of target column.
sort: str
Sort the output dataframe by this column.
Default is 'woe'. We use 'woe' for discrete
columns.
If the column values are continuous, then
use the actual column name for sorting.
    return_all: boolean, optional
        Whether or not to return all columns.
Returns:
--------
Returns dataframe with WOE, IV and other quantities.
    Example:
    --------
    df_woe = get_woe_iv(X_prep, y_prep, 'grade')
    """
df1 = pd.DataFrame({
feature_cat: df1[feature_cat],
target_name: ser_target
})
df1 = df1.groupby(feature_cat).agg(
n_obs=(target_name,'count'),
prop_good=(target_name,'mean'))
df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()
df1['n_good'] = df1['prop_good'] * df1['n_obs']
df1['n_bad'] = (1-df1['prop_good']) * df1['n_obs']
df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()
df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()
df1['woe'] = np.log(df1['prop_n_good']/df1['prop_n_bad'])
df1['diff_prop_good'] = df1['prop_good'].diff().abs()
df1['diff_woe'] = df1['woe'].diff().abs()
df1['diff_n_obs'] = df1['n_obs'].diff(-1).abs()
df1['iv'] = ((df1['prop_n_good']-df1['prop_n_bad']
) * df1['woe']
).sum()
    # reorder columns
    if return_all:
        df1 = df1[['n_obs', 'woe', 'diff_n_obs',
                   'prop_n_obs','iv',
                   'n_good', 'n_bad',
                   'prop_n_good','prop_n_bad',
                   'prop_good',
                   'diff_prop_good', 'diff_woe' ]]
    else:
        df1 = df1[['n_obs', 'woe', 'diff_n_obs' ]]
# add one more column
n_obs1 = df1['n_obs'].apply(lambda x: "{:,d}".format(x)).astype(str)
df1['text_plotly'] = (
'n_obs: ' + n_obs1 + '<br>' +
'WoE: ' + df1['woe'].round(4).astype(str) + '<br>'
)
# sort values
df1 = df1.sort_values(sort)
df1 = df1.reset_index()
return df1
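# Minimal sketch verifying the WoE formula on assumed toy data (not the
# loan file): category 'a' holds 3 of the 4 goods and 1 of the 2 bads,
# so woe_a = ln((3/4)/(1/2)) ~ 0.4055, and woe_b = ln((1/4)/(1/2)) ~ -0.6931.
toy_X = pd.DataFrame({'cat': list('aaaabb')})
toy_y = pd.Series([1, 1, 1, 0, 1, 0])
get_woe_iv(toy_X, toy_y, 'cat')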
df_woe = get_woe_iv(X_prep,y_prep,'grade')
df_woe.head().style\
.background_gradient(subset=['n_obs'],cmap='Blues')\
.background_gradient(subset=['woe'],cmap='Reds')\
.format({'n_obs':"{:,d}", 'diff_n_obs':"{:,.0f}"})
def plot_woe(df_woe,xrot=0,figsize=(18,12),fontsize=20,color='red'):
x = df_woe.iloc[:,0].to_numpy().astype(str)
y = df_woe['woe']
fig,ax = plt.subplots(1,1,figsize=figsize)
plt.plot(x,y,marker='o',linestyle='--',color='k')
plt.xlabel(df_woe.columns[0],fontsize=fontsize)
plt.ylabel('Weight of Evidence',fontsize=fontsize)
plt.xticks(rotation=xrot,fontsize=fontsize,color=color)
plt.yticks(fontsize=fontsize)
plt.grid(color='k',linestyle='-.')
    for i,txt in enumerate(df_woe['n_obs']):
        txt = ' {:,.0f}'.format(txt)
        # use positional indexing: label-based y[i] fails when df_woe
        # has been filtered and its index has gaps
        plt.annotate(txt, (x[i], y.iloc[i]),
                     rotation=90,fontsize=18,color='blue')
def plot_woe_plotly(df_woe):
col = df_woe.columns.tolist()[0]
x = df_woe.iloc[:,0].astype(str)
y = df_woe['woe']
z = df_woe['text_plotly']
data = go.Scatter(x=x,y=y,text=z,mode='lines+markers')
fig = go.Figure(data=data)
fig.update_layout(
title=f"WoE Curve for **{col}**",
xaxis_title=f"{col}",
yaxis_title="Weight of Evidence",
)
# also display dataframe
df_style = df_woe.style\
.background_gradient(subset=['n_obs'],
cmap='Blues')\
.background_gradient(subset=['woe'],
cmap='Reds')\
.format({'n_obs':"{:,d}",
'diff_n_obs':"{:,.0f}"})
display(df_style)
fig.show()
def create_bins_dummies_cont(X_prep,col,bins):
"""Creates new dummy columns in-place in given dataframe.
For this continuous bins,
left numbers are inclusive.
Example:
========
bins = [0,9.961,12.025, 15.74, 20.281]
col = 'int_rate'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
Outputs:
========
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281
"""
nbins = len(bins)
ser = X_prep[col]
for i in range(nbins):
if i < (nbins-1):
a,b = bins[i], bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<b)
X_prep[name] = np.where(cond,1,0)
elif i == (nbins-1):
b = bins[i]
name = col + ':>=' + str(b)
print(name)
# condition
cond = ser >= b
X_prep[name] = np.where(cond,1,0)
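# Usage note (hedged): because the intervals are contiguous and the last
# dummy is open-ended, every row should land in exactly one bin, provided
# bins[0] is at or below the series minimum. A check one can run after
# calling create_bins_dummies_cont:
# cols = [c for c in X_prep.columns if c.startswith(col + ':')]
# assert (X_prep[cols].sum(axis=1) == 1).all()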
def create_bins_dummies_int(X_prep,col,bins):
"""Creates new dummy columns in-place in given dataframe.
For this integer bins,
both left and right numbers are included.
Example:
---------
col = 'mths_since_last_delinq'
bins = [0,3,30,56]
    create_bins_dummies_int(X_prep,col,bins)
Outputs:
--------
mths_since_last_delinq:0_3
mths_since_last_delinq:4_30
mths_since_last_delinq:31_56
mths_since_last_delinq:>=57
"""
nbins = len(bins)
ser = X_prep[col]
# assert last number of bin is integer
assert isinstance(bins[-1],int)
for i in range(nbins):
if i == 0:
a,b = bins[i], bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<=b)
X_prep[name] = np.where(cond,1,0)
elif i < (nbins-1):
a,b = bins[i]+1, bins[i+1]
name = col + f':{a}_{b}'
print(name)
# condition
cond = (ser>=a) & (ser<=b)
X_prep[name] = np.where(cond,1,0)
elif i == (nbins-1):
b = bins[i]+1
name = col + ':>=' + str(b)
print(name)
# condition
cond = ser >= b
X_prep[name] = np.where(cond,1,0)
def create_dummies_left_inclusive(X_prep,col,bins,labels):
    """Creates new dummy columns in-place in given dataframe.
    Needs one more label than bins: the first label catches
    values below bins[0], the middle labels are left-inclusive
    intervals [bins[i-1], bins[i]), and the last label catches
    values >= bins[-1], so boundary values are never dropped
    or double counted.
    """
    nlabels = len(labels)
    ser = X_prep[col]
    for i in range(nlabels):
        name = col + ':' + labels[i]
        if i == 0:
            cond = (ser < bins[i])
        elif i < (nlabels-1):
            cond = (ser >= bins[i-1]) & (ser < bins[i])
        else:
            cond = (ser >= bins[i-1])
        X_prep[name] = np.where(cond,1,0)
        print(name)
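# Minimal demo on assumed toy data: with bins [10, 20] and labels
# ['<10', '10_20', '>=20'], the value 10 should land only in '10_20'
# and 20 only in '>=20'.
toy = pd.DataFrame({'v': [5, 10, 15, 20, 25]})
create_dummies_left_inclusive(toy, 'v', [10, 20], ['<10', '10_20', '>=20'])
toy.filter(like='v:')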
col = 'grade'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
df[col].value_counts().sort_index()
"""
as we expect, lower the grade, higher is the probabilty of default.
here, we will not group grades into grade:G_F and so on.
we already have one hot encoding of grade, so we are fine.
grade:A
grade:B
grade:C
grade:D
grade:E
grade:F
""";
col = 'home_ownership'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
cols = [i for i in X_prep.columns if 'home_ownership' in i]
X_prep[cols].head(2)
df_woe = get_woe_iv(X_prep,y_prep,'home_ownership')
plot_woe_plotly(df_woe)
"""
home_ownership:RENT_OTHER_NONE_ANY
home_ownership:OWN
home_ownership:MORTGAGE
""";
X_prep['home_ownership:RENT_OTHER_NONE_ANY'] = \
sum([X_prep['home_ownership:RENT'],
X_prep['home_ownership:OTHER'], X_prep['home_ownership:NONE'],
X_prep['home_ownership:ANY']])
col = 'addr_state'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
cols = [i for i in X_prep.columns if 'addr_state' in i]
X_prep[cols].head(2)
X_prep['addr_state'].unique()
train_state_names = ['GA', 'PA', 'WI', 'CA', 'FL', 'CT', 'TX', 'MA', 'OR', 'HI', 'MO',
'MS', 'NY', 'SC', 'WV', 'IL', 'NJ', 'AZ', 'MN', 'CO', 'TN', 'AL',
'LA', 'OH', 'VA', 'NV', 'WA', 'NC', 'WY', 'OK', 'MI', 'DC', 'NM',
'AK', 'MD', 'AR', 'KS', 'IN', 'UT', 'DE', 'NH', 'KY', 'RI', 'MT',
'VT', 'SD', 'NE', 'ID', 'ME', 'IA']
X_prep['addr_state'].nunique()
"""
We have 50 states and one DC, so, we should have 51 values.
one state is missing. and the missing state is ND.
""";
state_names51 = ['AK','AL','AR','AZ','CA','CO','CT',
'DC',
'DE','FL','GA','HI','IA','ID','IL',
'IN','KS','KY','LA','MA','MD','ME',
'MI','MN','MO','MS','MT','NC', 'ND',
'NE','NH','NJ','NM','NV','NY','OH',
'OK','OR','PA','RI','SC','SD','TN',
'TX','UT','VA','VT','WA','WI','WV',
'WY']
len(state_names51)
missing_states = [i for i in state_names51 if i not in
train_state_names]
missing_states
for col in missing_states:
    name = 'addr_state:' + col
    if name not in X_prep.columns:
        X_prep[name] = 0
# df_woe = get_woe_iv(X_prep,y_prep,'addr_state')
# plot_woe_plotly(df_woe)
"""
** do not forget the missing states.
We create the following categories:
'ND' 'NE' 'IA' NV' 'FL' 'HI' 'AL'
'NM' 'VA'
'NY'
'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
'CA'
'UT' 'KY' 'AZ' 'NJ'
'AR' 'MI' 'PA' 'OH' 'MN'
'RI' 'MA' 'DE' 'SD' 'IN'
'GA' 'WA' 'OR'
'WI' 'MT'
'TX'
'IL' 'CT'
'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
'WV' 'NH' 'WY' 'DC' 'ME' 'ID'
'ND_NE_IA_NV_FL_HI_AL' will be the reference category.
""";
multiple = [
'ND_NE_IA_NV_FL_HI_AL',
'NM_VA',
'OK_TN_MO_LA_MD_NC',
'UT_KY_AZ_NJ',
'AR_MI_PA_OH_MN',
'RI_MA_DE_SD_IN',
'GA_WA_OR',
'WI_MT',
'IL_CT',
'KS_SC_CO_VT_AK_MS',
'WV_NH_WY_DC_ME_ID'
]
dummies = ['addr_state:'+i for i in multiple]
for d in dummies:
print(d)
"""
Store these dummies:
addr_state:ND_NE_IA_NV_FL_HI_AL
addr_state:NM_VA
addr_state:OK_TN_MO_LA_MD_NC
addr_state:UT_KY_AZ_NJ
addr_state:AR_MI_PA_OH_MN
addr_state:RI_MA_DE_SD_IN
addr_state:GA_WA_OR
addr_state:WI_MT
addr_state:IL_CT
addr_state:KS_SC_CO_VT_AK_MS
addr_state:WV_NH_WY_DC_ME_ID
""";
sep = '_'
one = 'ND_NE_IA_NV_FL_HI_AL'
lst_one = one.split(sep)
lst_one
colname = 'addr_state'
new_cat = colname + ':' + one
new_cat
lst = [X_prep[colname + ':' + i] for i in lst_one]
sum(lst).head(2)
colname = 'addr_state'
sep = '_'
for one in multiple:
lst_one = one.split(sep)
new_cat = colname + ':' + one
lst = [X_prep[colname + ':' + i] for i in lst_one]
X_prep[new_cat] = sum(lst)
X_prep.iloc[:2,-2:]
col = 'verification_status'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
cols = [i for i in X_prep.columns if 'verification_status' in i]
X_prep[cols].head(2)
for col in cols:
print(col)
"""
verification_status:Not Verified
verification_status:Source Verified
verification_status:Verified
""";
# all three categories have significant n_obs;
# no need to group them.
col = 'purpose'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
cols = [i for i in X_prep.columns if 'purpose' in i]
X_prep[cols].head(2)
# df_woe = get_woe_iv(X_prep,y_prep,'purpose')
# plot_woe_plotly(df_woe)
dummies = \
"""
small_business__educational__moving__renewable_energy
other
house__medical__wedding__vacation
debt_consolidation
home_improvement__major_purchase__car
credit_card
""".strip().split('\n')
dummies = ['purpose:'+ i for i in dummies]
for d in dummies:
print(d)
colname = 'purpose'
sep = '__'
multiple =[
'small_business__educational__moving__renewable_energy',
'house__medical__wedding__vacation',
'home_improvement__major_purchase__car'
]
for one in multiple:
lst_one = one.split(sep)
new_cat = colname + ':' + one
lst = [X_prep[colname + ':' + i] for i in lst_one]
X_prep[new_cat] = sum(lst)
X_prep.iloc[:2,-2:]
col = 'initial_list_status'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
cols = [i for i in X_prep.columns if 'initial_list_status' in i]
X_prep[cols].head(2)
# we already have binary encoding.
dummies = """
initial_list_status:f
initial_list_status:w
""";
col = 'term_int'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep['term:36'] = np.where((X_prep['term_int'] == 36), 1, 0)
X_prep['term:60'] = np.where((X_prep['term_int'] == 60), 1, 0)
dummies = \
"""
term:36
term:60
""";
col = 'emp_length_int'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe_plotly(df_woe)
"""
emp_length:0
emp_length:1
emp_length:2_3
emp_length:4_5
emp_length:6_7
emp_length:8_9
emp_length:10
""";
X_prep['emp_length:0'] = np.where(X_prep['emp_length_int'].isin([0]), 1, 0)
X_prep['emp_length:1'] = np.where(X_prep['emp_length_int'].isin([1]), 1, 0)
X_prep['emp_length:2_3'] = np.where(X_prep['emp_length_int'].isin([2,3]), 1, 0)
X_prep['emp_length:4_5'] = np.where(X_prep['emp_length_int'].isin([4,5]), 1, 0)
X_prep['emp_length:6_7'] = np.where(X_prep['emp_length_int'].isin([6,7]), 1, 0)
X_prep['emp_length:8_9'] = np.where(X_prep['emp_length_int'].isin([8,9]), 1, 0)
X_prep['emp_length:10'] = np.where(X_prep['emp_length_int'].isin([10]), 1, 0)
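# A compact, hedged equivalent of the seven assignments above
# (overwrites the same columns with the same values):
emp_groups = {'0': [0], '1': [1], '2_3': [2, 3], '4_5': [4, 5],
              '6_7': [6, 7], '8_9': [8, 9], '10': [10]}
for lab, vals in emp_groups.items():
    X_prep['emp_length:' + lab] = X_prep['emp_length_int'].isin(vals).astype(int)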
col = 'mths_since_issue_d'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
X_prep[colf].head(2)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
# plot_woe_plotly(df_woe)
bins = [0,66,70,75,82,93,128]
col = 'mths_since_issue_d'
create_bins_dummies_int(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
# after 93, there are very few observations;
# we can group them into one bucket or split them into two.
"""
mths_since_issue_d:0_66
mths_since_issue_d:67_70
mths_since_issue_d:71_75
mths_since_issue_d:76_82
mths_since_issue_d:83_93
mths_since_issue_d:94_128
mths_since_issue_d:>=129
""";
col = 'int_rate'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
X_prep[colf].head(2)
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
# candidate bins; the trailing 27 is unnecessary since the last
# bucket is already open-ended (>=20.281), so we drop it below:
# bins = [0, 9.961, 12.025, 15.74, 20.281, 27]
#
# the curve is monotonically decreasing, so it is easy to partition.
# the first few points zigzag; n_obs is low there, so bundle them up.
#
# look at the graph for where the woe drops sharply.
"""
int_rate:0_9.961
int_rate:9.961_12.025
int_rate:12.025_15.74
int_rate:15.74_20.281
int_rate:>=20.281
""";
bins = [0,9.961,12.025, 15.74, 20.281]
col = 'int_rate'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
col = 'funded_amnt'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
# for continuous variables use sort=colname
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
# woe is almost constant, i.e. funded_amnt barely separates
# good loans from bad ones, so it carries little information
# about the target and we can drop it from the analysis.
col = 'mths_since_earliest_cr_line'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
bins = [0,207, 350]
col = 'mths_since_earliest_cr_line'
create_bins_dummies_int(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
"""
Ignore first few low numbers,
we see upward trend
mths_since_earliest_cr_line:0_207
mths_since_earliest_cr_line:208_350
mths_since_earliest_cr_line:>=351
""";
col = 'delinq_2yrs'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
# no factor (binned) column needed here; the values are already discrete.
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
"""
we might be attemped to ignore the feature
but this might be useful feature
we see lots of zeros, make it one feature.
then make 1-3 and greater than 3.
delinq_2yrs:0
delinq_2yrs:1_3
delinq_2yrs:>=3
""";
X_prep['delinq_2yrs:0'] = \
np.where((X_prep['delinq_2yrs'] ==0), 1, 0)
X_prep['delinq_2yrs:1_3'] = \
np.where((X_prep['delinq_2yrs'] >= 1) &
(X_prep['delinq_2yrs'] <= 3), 1, 0)
X_prep['delinq_2yrs:>3'] = \
np.where((X_prep['delinq_2yrs'] > 3), 1, 0)
col = 'inq_last_6mths'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
"""
we see lots of zeros, make it one feature.
then make 1-3 and greater than 3.
inq_last_6mths:0
inq_last_6mths:1
inq_last_6mths:2_3
inq_last_6mths:>=3
""";
X_prep['inq_last_6mths:0'] = \
np.where((X_prep['inq_last_6mths'] ==0), 1, 0)
X_prep['inq_last_6mths:1'] = \
np.where((X_prep['inq_last_6mths'] ==1), 1, 0)
X_prep['inq_last_6mths:2_3'] = \
np.where((X_prep['inq_last_6mths'] > 1) &
(X_prep['inq_last_6mths'] <= 3), 1, 0)
X_prep['inq_last_6mths:>3'] = \
np.where((X_prep['inq_last_6mths'] > 3), 1, 0)
col = 'open_acc'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
X_prep[col].value_counts().sort_index()[lambda x: x>600]
X_prep[col+':<=3'] = \
np.where((X_prep[col] <=3), 1, 0)
X_prep[col+':4_5'] = \
np.where((X_prep[col] > 3) &
(X_prep[col] <= 5), 1, 0)
X_prep[col+':6_13'] = \
np.where((X_prep[col] > 5) &
(X_prep[col] <= 13), 1, 0)
X_prep[col+':14_17'] = \
np.where((X_prep[col] > 13) &
(X_prep[col] <= 17), 1, 0)
X_prep[col+':>17'] = \
np.where((X_prep[col] > 17), 1, 0)
"""
There are many small number and woe curve is flat.
look at n_obs and group them.
open_acc:<=3
open_acc:4_5
open_acc:6_13
open_acc:13_17
open_acc:>17
""";
# df_woe
col = 'pub_rec'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
df_woe = get_woe_iv(X_prep,y_prep,col,sort=col)
plot_woe(df_woe,xrot=90,color='k')
X_prep[col].value_counts().sort_index()
X_prep[col+':0'] = \
np.where((X_prep[col] ==0), 1, 0)
X_prep[col+':1'] = \
np.where((X_prep[col] ==1), 1, 0)
X_prep[col+':2'] = \
np.where((X_prep[col] ==2), 1, 0)
X_prep[col+':3'] = \
np.where((X_prep[col] ==3), 1, 0)
X_prep[col+':>3'] = \
np.where((X_prep[col] > 3), 1, 0)
"""
pub_rec:0
pub_rec:1
pub_rec:2
pub_rec:3
pub_rec:>3
""";
col = 'total_acc'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()[lambda x: x>9000]
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
plot_woe(df_woe,xrot=90,color='k')
bins = [0,6.24,21.84,37.44,40.56]
col = 'total_acc'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
"""
total_acc:0_6.24
total_acc:6.24_21.84
total_acc:21.84_37.44
total_acc:37.44_40.56
total_acc:>=40.56
""";
col = 'acc_now_delinq'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].value_counts()
X_prep[col+':0'] = \
np.where((X_prep[col] ==0), 1, 0)
X_prep[col+':>=1'] = \
    np.where((X_prep[col] >= 1), 1, 0)
"""
acc_now_delinq:0
acc_now_delinq:>=1
""";
col = 'total_rev_hi_lim'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 100)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
df_woe = df_woe[df_woe.n_obs > 1000]
df_woe
plot_woe(df_woe,xrot=90,color='k')
"""
total_rev_hi_lim:<=5k
total_rev_hi_lim:5k_10k
total_rev_hi_lim:10k_20k
total_rev_hi_lim:20k_30k
total_rev_hi_lim:30k_40k
total_rev_hi_lim:40k_50k
total_rev_hi_lim:50k_65k
total_rev_hi_lim:65k_80k
total_rev_hi_lim:>80k
""";
X_prep['total_rev_hi_lim:<=5k'] = \
np.where((X_prep['total_rev_hi_lim'] <= 5000), 1, 0)
X_prep['total_rev_hi_lim:5k_10k'] = \
np.where((X_prep['total_rev_hi_lim'] > 5000) &
(X_prep['total_rev_hi_lim'] <= 10000), 1, 0)
X_prep['total_rev_hi_lim:10k_20k'] = \
np.where((X_prep['total_rev_hi_lim'] > 10000) &
(X_prep['total_rev_hi_lim'] <= 20000), 1, 0)
X_prep['total_rev_hi_lim:20k_30k'] = \
np.where((X_prep['total_rev_hi_lim'] > 20000) &
(X_prep['total_rev_hi_lim'] <= 30000), 1, 0)
X_prep['total_rev_hi_lim:30k_40k'] = \
np.where((X_prep['total_rev_hi_lim'] > 30000) &
(X_prep['total_rev_hi_lim'] <= 40000), 1, 0)
X_prep['total_rev_hi_lim:40k_50k'] = \
np.where((X_prep['total_rev_hi_lim'] > 40000) &
(X_prep['total_rev_hi_lim'] <= 50000), 1, 0)
X_prep['total_rev_hi_lim:50k_65k'] = \
np.where((X_prep['total_rev_hi_lim'] > 50000) &
(X_prep['total_rev_hi_lim'] <= 65000), 1, 0)
X_prep['total_rev_hi_lim:65k_80k'] = \
np.where((X_prep['total_rev_hi_lim'] > 65000) &
(X_prep['total_rev_hi_lim'] <= 80000), 1, 0)
X_prep['total_rev_hi_lim:>80k'] = np.where((X_prep['total_rev_hi_lim'] > 80000), 1, 0)
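# The repeated np.where banding above can also be generated with pd.cut;
# a hedged sketch (pd.cut's default right-closed intervals match the
# > a & <= b conditions used above):
edges = [-np.inf, 5000, 10000, 20000, 30000, 40000,
         50000, 65000, 80000, np.inf]
labels = ['<=5k', '5k_10k', '10k_20k', '20k_30k', '30k_40k',
          '40k_50k', '50k_65k', '65k_80k', '>80k']
banded = pd.cut(X_prep['total_rev_hi_lim'], bins=edges, labels=labels)
pd.get_dummies(banded, prefix='total_rev_hi_lim', prefix_sep=':').head(2)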
col = 'installment'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
df_woe = df_woe[df_woe.n_obs > 500]
df_woe.style\
.background_gradient(subset=['n_obs'],cmap='Blues')\
.background_gradient(subset=['woe'],cmap='Reds')\
.format({'n_obs':"{:,d}", 'diff_n_obs':"{:,.0f}"})
plot_woe(df_woe,xrot=90,color='k')
bins = [0, 127,210,266,294,322,378,406,433,517,545,
629,712,796,852,880,935,963]
col = 'installment'
create_bins_dummies_cont(X_prep,col,bins)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
"""
installment:0_127
installment:127_210
installment:210_266
installment:266_294
installment:294_322
installment:322_378
installment:378_406
installment:406_433
installment:433_517
installment:517_545
installment:545_629
installment:629_712
installment:712_796
installment:796_852
installment:852_880
installment:880_935
installment:935_963
installment:>=963
""";
col = 'annual_inc'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].describe()
X_prep[colf] = pd.cut(X_prep[col], 50)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
# too many observations fall in the first category; increase the bins to 100
X_prep[colf] = pd.cut(X_prep[col], 100)
df_woe = get_woe_iv(X_prep,y_prep,colf,sort=colf)
df_woe.head()
"""
Looking at factors of annual income, we can make two
groups with 150k (or 140k) and analyze the group
earning more than 140k separately.
""";
X_prep_tmp = X_prep[X_prep[col] <= 140000].copy()  # copy to avoid SettingWithCopyWarning
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
y_prep_tmp = y_prep[X_prep_tmp.index]
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
X_prep_tmp[col].hist()
plot_woe(df_woe,xrot=90,color='k')
"""
WoE is monotonically increasing.
We can break into 10 equal parts.
""";
col = 'annual_inc'
bins = [20_000,30_000,40_000,50_000,
60_000,70_000,80_000,90_000,
100_000,120_000,140_000]
# we need one more label than bins
labels = ['<20k',
'20k_30k','30k_40k','40k_50k',
'50k_60k','60k_70k','70k_80k',
'80k_90k','90k_100k','100k_120k','120k_140k',
'>140k']
create_dummies_left_inclusive(X_prep,col,bins,labels)
cols = [i for i in X_prep.columns if col in i]
X_prep[cols].head(2)
col = 'mths_since_last_delinq'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].hist()
# create a missing-value indicator column
vals = np.where((X_prep[col].isnull()), 1, 0)
X_prep[col+':missing'] = vals
print(col+':missing')
"""
Do not forget missing:
mths_since_last_delinq:missing
mths_since_last_delinq:0_3
mths_since_last_delinq:4_30
mths_since_last_delinq:31_56
mths_since_last_delinq:>=57
""";
X_prep_tmp = X_prep[pd.notnull(X_prep[col])].copy()
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
plot_woe(df_woe,xrot=90,color='k')
# months are integers, so make the intervals integer.
# NOTE: do not forget the missing category created above.
col = 'mths_since_last_delinq'
bins = [0,3,30,56]
create_bins_dummies_int(X_prep,col,bins)
cols = [i for i in X_prep.columns if 'delinq' in i]
X_prep[cols].head(2)
col = 'dti'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].hist()
# we see very few observations after 35; we make them one group.
# we can create a temp df restricted to 0-35 and bin it into
# 50 parts.
X_prep_tmp = X_prep[X_prep[col]<35].copy()
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
plot_woe(df_woe,xrot=90,color='k')
bins = [0,0.7,1.4,2.1,3.5,4.9,5.6,6.3,7,9,9.8,10.5,
11.2,11.9,12.6,13.3,14,14.7,17.5,18.2,21,21.7,
22.4,23.1,25.2,25.9,29.4,30.1,30.8,31.5,32.2,32.9,
33.6]
create_bins_dummies_cont(X_prep,col,bins)
col = 'mths_since_last_record'
colf = col + '_factor'
print('unique = ', X_prep[col].nunique())
print('nulls = ', X_prep[col].isnull().sum())
X_prep[col].head(2)
X_prep[col].hist()
# create a missing-value indicator column
vals = np.where((X_prep[col].isnull()), 1, 0)
X_prep[col+':missing'] = vals
print(col+':missing')
"""
mths_since_last_record:missing
mths_since_last_record:0_2
mths_since_last_record:3_22
mths_since_last_record:23_46
mths_since_last_record:47_68
mths_since_last_record:69_85
mths_since_last_record:>=86
""";
X_prep_tmp = X_prep[pd.notnull(X_prep[col])].copy()
y_prep_tmp = y_prep[X_prep_tmp.index]
X_prep_tmp[colf] = pd.cut(X_prep_tmp[col], 50)
df_woe = get_woe_iv(X_prep_tmp,y_prep_tmp,colf,sort=colf)
df_woe.head()
plot_woe(df_woe,xrot=90,color='k')
# look for large drops in the woe curve and create groups there
bins = [0,2,22,46,68,85]
create_bins_dummies_int(X_prep,col,bins)
print(X_prep.shape)
X_prep.head(2)
# 1. after saving, comment out these two lines.
# 2. go up in the notebook to where X_prep is defined,
#    switch it to the test split, and run all the cells below that.
X_prep.to_csv(dat_pro + 'Xtrain.csv')
y_prep.to_csv(dat_pro + 'ytrain.csv')
# after running this, comment it out and uncomment the train part.
# we always create features from train, never from test.
# X_prep.to_csv(dat_pro + 'Xtest.csv')
# y_prep.to_csv(dat_pro + 'ytest.csv')
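# Hedged read-back sketch for the modelling notebook (paths as above;
# index_col=0 restores the saved row index):
# Xtr = pd.read_csv(dat_pro + 'Xtrain.csv', index_col=0, low_memory=False)
# ytr = pd.read_csv(dat_pro + 'ytrain.csv', index_col=0).squeeze('columns')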