%load_ext autoreload
%autoreload 2
# my personal library
from bhishan import bp
import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# random state
SEED=100
[(x.__name__,x.__version__) for x in [np,pd,sns]]
[('numpy', '1.18.1'), ('pandas', '1.0.1'), ('seaborn', '0.9.0')]
pd.options.display.max_rows = 100
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as stats
class LogisticRegressionWithPvalues:
    """Logistic Regression with p-values.
    Usage:
    model = LogisticRegressionWithPvalues(
        n_jobs=-1,solver='lbfgs',random_state=SEED)
    model.fit(Xtrain,ytrain)
    model.model.predict(Xtest) #!! NOT: model.predict(Xtest)
    """
    def __init__(self,*args,**kwargs):
        self.model = LogisticRegression(*args,**kwargs)
    def fit(self,X,y):
        self.model.fit(X,y)
        # 1/denom equals the logistic variance p*(1-p), because
        # 1/(2*(1+cosh(z))) == sigmoid(z)*(1-sigmoid(z))
        denom = (2.0 * (1.0 +
            np.cosh(self.model.decision_function(X))))
        denom = np.tile(denom,(X.shape[1],1)).T
        # Fisher information matrix: X.T @ diag(p*(1-p)) @ X
        F_ij = np.dot((X / denom).T,X)
        # Cramer-Rao bound: the inverse Fisher information is the
        # covariance of the coefficient estimates
        Cramer_Rao = np.linalg.inv(F_ij)
        # standard errors from the diagonal of the covariance
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates
        # two-sided p-values from the standard normal
        p_values = [stats.norm.sf(abs(x)) * 2 for x in z_scores]
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        self.p_values = p_values
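# Sanity check (illustrative, not part of the original class): the denom
# term above really is the reciprocal of the logistic variance p*(1-p).
z = np.linspace(-5, 5, 11)
p = 1.0 / (1.0 + np.exp(-z))
assert np.allclose(1.0 / (2.0 * (1.0 + np.cosh(z))), p * (1.0 - p))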
!ls ../data/processed
Xtest.csv Xtrain.csv dummy_variables.xlsx ytest.csv ytrain.csv
dat_pro = '../data/processed/'
Xtrain = pd.read_csv(dat_pro + 'Xtrain.csv',index_col=0)
ytrain = pd.read_csv(dat_pro + 'ytrain.csv',index_col=0)
Xtest = pd.read_csv(dat_pro + 'Xtest.csv',index_col=0)
ytest = pd.read_csv(dat_pro + 'ytest.csv',index_col=0)
Xtrain.shape, ytrain.shape,Xtest.shape,ytest.shape
((373028, 351), (373028, 1), (93257, 351), (93257, 1))
Xtrain.head(2)
Unnamed: 0.1 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | ... | dti:32.2_32.9 | dti:32.9_33.6 | dti:>=33.6 | mths_since_last_record:missing | mths_since_last_record:0_2 | mths_since_last_record:3_22 | mths_since_last_record:23_46 | mths_since_last_record:47_68 | mths_since_last_record:69_85 | mths_since_last_record:>=86 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
366463 | 366463 | 18514656 | 20677353 | 4000 | 4000 | 4000.0 | 36 months | 16.29 | 141.21 | D | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
409215 | 409215 | 14388577 | 16430959 | 10000 | 10000 | 10000.0 | 36 months | 11.99 | 332.10 | B | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 rows × 351 columns
ytrain['good_bad'].value_counts()
1    332252
0     40776
Name: good_bad, dtype: int64
xls = pd.ExcelFile(dat_pro + 'dummy_variables.xlsx')
print(xls.sheet_names)
df_features = xls.parse(0,header=None,names=['feature'])
df_references = xls.parse(1,header=None,names=['reference'])
['dummy', 'reference']
print(df_features.shape)
df_features.head(2).append(df_features.tail(2))
(164, 1)
feature | |
---|---|
0 | grade:A |
1 | grade:B |
162 | mths_since_last_record:69_85 |
163 | mths_since_last_record:>=86 |
df_features['orig_feature'] = df_features['feature'].str.split(':').str[0]
df_features.sample(5)
feature | orig_feature | |
---|---|---|
132 | dti:7_9 | dti |
43 | mths_since_issue_d:71_75 | mths_since_issue_d |
149 | dti:25.9_29.4 | dti |
113 | annual_inc:70k_80k | annual_inc |
144 | dti:21_21.7 | dti |
print(df_references.shape)
df_references.head(2).append(df_references.tail(2))
(23, 1)
reference | |
---|---|
0 | grade:G |
1 | home_ownership:ANY_OTHER_NONE_RENT |
21 | dti:>=33.6 |
22 | mths_since_last_record:>=86 |
features_all = df_features['feature'].to_numpy()
features_ref = df_references['reference'].to_numpy()
features = [i for i in features_all if i not in features_ref]
print(len(features))
142
# make sure we have required features in Xtrain and Xtest
Xtrain_features = Xtrain.columns.to_numpy()
Xtest_features = Xtest.columns.to_numpy()
missing_train = [i for i in features if i not in Xtrain_features]
missing_test = [i for i in features if i not in Xtest_features]
missing_train, missing_test
([], [])
Xtr = Xtrain[features]
Xtx = Xtest[features]
ytr = ytrain['good_bad']
ytx = ytest['good_bad']
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
model = LogisticRegression()
model
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
# always use a seed for reproducibility
model = LogisticRegression(n_jobs=-1, random_state=SEED,solver='lbfgs')
%%time
model.fit(Xtr,ytr);
CPU times: user 384 ms, sys: 435 ms, total: 819 ms
Wall time: 1min 40s
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=100, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
model.intercept_
array([-0.44301985])
all_coeffs = np.append(model.intercept_, model.coef_.ravel())
all_coeffs[:2]
array([-0.44301985, 1.1145688 ])
df_summary = pd.DataFrame({
'feature': ['intercept'] + features,
'coefficient': all_coeffs
})
df_summary.head()
feature | coefficient | |
---|---|---|
0 | intercept | -0.443020 |
1 | grade:A | 1.114569 |
2 | grade:B | 0.775111 |
3 | grade:C | 0.577956 |
4 | grade:D | 0.418663 |
%%time
model = LogisticRegressionWithPvalues(n_jobs=-1,solver='lbfgs',random_state=SEED)
model.fit(Xtr,ytr);
CPU times: user 1.9 s, sys: 1.34 s, total: 3.24 s
Wall time: 1min 41s
all_coeffs = np.append(model.model.intercept_, model.model.coef_.ravel())
all_coeffs[:2]
array([-0.44301985, 1.1145688 ])
all_pvals = np.append(np.nan, model.p_values)
all_pvals[:2]
array([ nan, 2.89290871e-48])
df_summary = pd.DataFrame({
'feature': ['intercept'] + features,
'coef': all_coeffs,
'p_value': all_pvals
})
df_summary.head()
feature | coef | p_value | |
---|---|---|---|
0 | intercept | -0.443020 | NaN |
1 | grade:A | 1.114569 | 2.892909e-48 |
2 | grade:B | 0.775111 | 2.336615e-36 |
3 | grade:C | 0.577956 | 4.185329e-24 |
4 | grade:D | 0.418663 | 1.077725e-14 |
df_summary['orig_feature'] = df_summary['feature'].str.split(':').str[0]
df_summary.head(2)
feature | coef | p_value | orig_feature | |
---|---|---|---|---|
0 | intercept | -0.443020 | NaN | intercept |
1 | grade:A | 1.114569 | 2.892909e-48 | grade |
df_summary['n_cats'] = df_summary.groupby('orig_feature')['feature'].transform('count')
df_summary.head(3)
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
0 | intercept | -0.443020 | NaN | intercept | 1 |
1 | grade:A | 1.114569 | 2.892909e-48 | grade | 6 |
2 | grade:B | 0.775111 | 2.336615e-36 | grade | 6 |
# if any category of a feature has a significant p-value, keep the whole feature.
df_summary_significant = df_summary.groupby('orig_feature').filter(
lambda x: any(x['p_value']<0.05))
df_summary_significant.head(2)
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
1 | grade:A | 1.114569 | 2.892909e-48 | grade | 6 |
2 | grade:B | 0.775111 | 2.336615e-36 | grade | 6 |
df_summary.shape[0] - df_summary_significant.shape[0]
15
# features whose categories are all insignificant
df_insig = df_summary[~df_summary.index.isin(df_summary_significant.index)]
df_insig
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
0 | intercept | -0.443020 | NaN | intercept | 1 |
7 | home_ownership:RENT_OTHER_NONE_ANY | -0.226803 | 0.670968 | home_ownership | 3 |
8 | home_ownership:OWN | -0.114844 | 0.829748 | home_ownership | 3 |
9 | home_ownership:MORTGAGE | -0.086809 | 0.870873 | home_ownership | 3 |
47 | delinq_2yrs:1_3 | -0.027512 | 0.309531 | delinq_2yrs | 2 |
48 | delinq_2yrs:>=3 | -0.094025 | 0.095610 | delinq_2yrs | 2 |
56 | pub_rec:1 | -0.039595 | 0.940252 | pub_rec | 4 |
57 | pub_rec:2 | -0.096114 | 0.856153 | pub_rec | 4 |
58 | pub_rec:3 | -0.002483 | 0.996320 | pub_rec | 4 |
59 | pub_rec:>3 | -0.073385 | 0.892027 | pub_rec | 4 |
60 | total_acc:6.24_21.84 | 0.009303 | 0.827957 | total_acc | 4 |
61 | total_acc:21.84_37.44 | 0.008958 | 0.842667 | total_acc | 4 |
62 | total_acc:37.44_40.56 | -0.009261 | 0.862913 | total_acc | 4 |
63 | total_acc:>=40.56 | -0.027960 | 0.572622 | total_acc | 4 |
64 | acc_now_delinq:>=1 | -0.026172 | 0.984648 | acc_now_delinq | 1 |
# one row per original feature; [1:] skips the intercept row
features_drop = df_insig.drop_duplicates(subset='orig_feature')['orig_feature'].to_numpy()[1:]
drop1 = features_drop.tolist()
drop1
['home_ownership', 'delinq_2yrs', 'pub_rec', 'total_acc', 'acc_now_delinq']
pd.options.display.max_rows = 200
# for the surviving features, check what fraction of their categories is insignificant.
df_tmp = (df_summary_significant
.drop('coef',axis=1)
.query('p_value > 0.05')
.assign(
insig =
lambda dfx: dfx.groupby('orig_feature')
['feature'].transform('count'))
.drop_duplicates(subset=['orig_feature'])
.assign(insig_ratio = lambda x: x['insig'] / x['n_cats'])
)
df_tmp
feature | p_value | orig_feature | n_cats | insig | insig_ratio | |
---|---|---|---|---|---|---|
10 | addr_state:NM_VA | 0.729884 | addr_state | 10 | 4 | 0.400000 |
20 | verification_status:Not Verified | 0.696505 | verification_status | 2 | 1 | 0.500000 |
40 | mths_since_issue_d:94_128 | 0.952232 | mths_since_issue_d | 6 | 1 | 0.166667 |
45 | mths_since_earliest_cr_line:208_350 | 0.069849 | mths_since_earliest_cr_line | 2 | 1 | 0.500000 |
52 | open_acc:4_5 | 0.596992 | open_acc | 4 | 1 | 0.250000 |
91 | annual_inc:30k_40k | 0.535520 | annual_inc | 11 | 1 | 0.090909 |
101 | mths_since_last_delinq:0_3 | 0.321859 | mths_since_last_delinq | 4 | 1 | 0.250000 |
106 | dti:0.7_1.4 | 0.160640 | dti | 32 | 11 | 0.343750 |
137 | mths_since_last_record:missing | 0.902412 | mths_since_last_record | 6 | 2 | 0.333333 |
"""
Look all suspected features one by one.
for dti, out of 32, 23 are insignificant, drop it.
mths since last record has 6 cats, but 5 are insignificant, drop it.
drop these:
['delinq_2yrs', 'pub_rec', 'total_acc', 'acc_now_delinq',
'dti', 'mths_since_last_record']
""";
drop2 = df_tmp[df_tmp['insig_ratio']>0.6]['orig_feature'].to_numpy().tolist()
drop2
[]
orig_feature_drop = drop1 + drop2
orig_feature_drop
['home_ownership', 'delinq_2yrs', 'pub_rec', 'total_acc', 'acc_now_delinq']
df_features.head(2)
feature | orig_feature | |
---|---|---|
0 | grade:A | grade |
1 | grade:B | grade |
features_drop = df_features[df_features['orig_feature'].isin(orig_feature_drop)]['feature'].to_numpy()
features_drop
array(['home_ownership:RENT_OTHER_NONE_ANY', 'home_ownership:OWN', 'home_ownership:MORTGAGE', 'delinq_2yrs:0', 'delinq_2yrs:1_3', 'delinq_2yrs:>=3', 'pub_rec:0', 'pub_rec:1', 'pub_rec:2', 'pub_rec:3', 'pub_rec:>3', 'total_acc:0_6.24', 'total_acc:6.24_21.84', 'total_acc:21.84_37.44', 'total_acc:37.44_40.56', 'total_acc:>=40.56', 'acc_now_delinq:0', 'acc_now_delinq:>=1'], dtype=object)
features_pval = [i for i in features_all
if i not in features_ref
if i not in features_drop
]
len(features_pval), len(features), len(features_all)
(128, 142, 164)
Xtr = Xtrain[features_pval]
Xtx = Xtest[features_pval]
ytr = ytrain['good_bad']
ytx = ytest['good_bad']
%%time
model = LogisticRegressionWithPvalues(n_jobs=-1,solver='lbfgs',
random_state=SEED)
model.fit(Xtr,ytr);
CPU times: user 1.54 s, sys: 933 ms, total: 2.47 s
Wall time: 1min 28s
all_coeffs = np.append(model.intercept_, model.coef_.ravel())
all_pvals = np.append(np.nan, model.p_values)
df_summary = pd.DataFrame({
'feature': ['intercept'] + features_pval,
'coef': all_coeffs,
'p_value': all_pvals
})
df_summary['orig_feature'] = df_summary['feature'].str.split(':').str[0]
df_summary['n_cats'] = df_summary.groupby('orig_feature')['feature'].transform('count')
df_summary.head()
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | 1 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 6 |
2 | grade:B | 0.777610 | 1.881894e-38 | grade | 6 |
3 | grade:C | 0.577341 | 1.145423e-25 | grade | 6 |
4 | grade:D | 0.414289 | 1.174786e-15 | grade | 6 |
df_summary.query('p_value > 0.05')
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
7 | addr_state:NM_VA | 0.002845 | 0.922274 | addr_state | 10 |
8 | addr_state:OK_TN_MO_LA_MD_NC | 0.016196 | 0.383998 | addr_state | 10 |
9 | addr_state:UT_KY_AZ_NJ | 0.026434 | 0.203075 | addr_state | 10 |
11 | addr_state:RI_MA_DE_SD_IN | 0.046824 | 0.077776 | addr_state | 10 |
17 | verification_status:Not Verified | -0.007047 | 0.646578 | verification_status | 2 |
37 | mths_since_issue_d:94_128 | -0.010289 | 0.824434 | mths_since_issue_d | 6 |
47 | open_acc:4_5 | 0.029303 | 0.488598 | open_acc | 4 |
77 | annual_inc:30k_40k | -0.035252 | 0.370798 | annual_inc | 11 |
87 | mths_since_last_delinq:0_3 | -0.064225 | 0.164137 | mths_since_last_delinq | 4 |
92 | dti:0.7_1.4 | 0.122486 | 0.178167 | dti | 32 |
112 | dti:21.7_22.4 | 0.043443 | 0.351534 | dti | 32 |
113 | dti:22.4_23.1 | 0.058822 | 0.210731 | dti | 32 |
114 | dti:23.1_25.2 | 0.025895 | 0.510759 | dti | 32 |
115 | dti:25.2_25.9 | -0.024075 | 0.632538 | dti | 32 |
116 | dti:25.9_29.4 | -0.039683 | 0.303729 | dti | 32 |
117 | dti:29.4_30.1 | -0.080395 | 0.148416 | dti | 32 |
118 | dti:30.1_30.8 | 0.001336 | 0.982654 | dti | 32 |
119 | dti:30.8_31.5 | 0.000775 | 0.990123 | dti | 32 |
120 | dti:31.5_32.2 | 0.127653 | 0.057800 | dti | 32 |
121 | dti:32.2_32.9 | -0.033026 | 0.619906 | dti | 32 |
122 | dti:32.9_33.6 | 0.037537 | 0.591143 | dti | 32 |
# every remaining feature group keeps at least one significant category.
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
ypreds_tx = model.model.predict(Xtx)
df_eval = bp.get_binary_classification_scalar_metrics(
"Logistic Regression",
model.model,
Xtx,ytx,
ypreds_tx,
desc="Features selected from p-values", df_eval=None,show=False)
df_eval
Model | Description | Accuracy | Precision | Recall | F1 | Mathews_Correlation_Coefficient | Cohens_Kappa | Area_Under_Precision_Curve | Area_Under_ROC_Curve | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | Features selected from p-values | 0.890764 | 0.8908 | 0.99994 | 0.94222 | 0.022659 | 0.001639 | 0.946481 | 0.697766 |
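# bp is a personal library; a minimal sklearn-only sketch (assumption:
# reproduces the same scalar metrics as the row above, using model, Xtx,
# ytx, ypreds_tx from the cells above):
from sklearn.metrics import (matthews_corrcoef, cohen_kappa_score,
                             average_precision_score, roc_auc_score)
yprobs = model.model.predict_proba(Xtx)[:, 1]
pd.DataFrame([{
    'Accuracy': metrics.accuracy_score(ytx, ypreds_tx),
    'Precision': metrics.precision_score(ytx, ypreds_tx),
    'Recall': metrics.recall_score(ytx, ypreds_tx),
    'F1': metrics.f1_score(ytx, ypreds_tx),
    'Mathews_Correlation_Coefficient': matthews_corrcoef(ytx, ypreds_tx),
    'Cohens_Kappa': cohen_kappa_score(ytx, ypreds_tx),
    'Area_Under_Precision_Curve': average_precision_score(ytx, yprobs),
    'Area_Under_ROC_Curve': roc_auc_score(ytx, yprobs),
}])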
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy: ', accuracy_score(ytx,ypreds_tx))
print('Precision: ', precision_score(ytx,ypreds_tx))
print('Recall: ', recall_score(ytx,ypreds_tx))
print('F1-score: ', f1_score(ytx,ypreds_tx))
Accuracy:  0.8907642321756007
Precision: 0.8908002831342099
Recall:    0.9999398061758864
F1-score:  0.942220104703727
from sklearn.metrics import classification_report
print(classification_report(ytx, ypreds_tx))
              precision    recall  f1-score   support

           0       0.67      0.00      0.00     10192
           1       0.89      1.00      0.94     83065

    accuracy                           0.89     93257
   macro avg       0.78      0.50      0.47     93257
weighted avg       0.87      0.89      0.84     93257
from sklearn.metrics import confusion_matrix
confusion_matrix(ytx, ypreds_tx)
array([[   10, 10182],
       [    5, 83060]])
bp.print_confusion_matrix('Logistic Regression',
ytx,ypreds_tx,
zero='Default',one='NonDefault')
Predicted_Default | PredictedNonDefault | Total_Default | Correct_Default | Incorrect_Default | Default_Detection | Total_NonDefault | Correct_NonDefault | Incorrect_NonDefault | NonDefault_Detection | |
---|---|---|---|---|---|---|---|---|---|---|
Default | 10 | 10,182 | 10,192 | 10 | 10,182 | 0.10% | 83,065 | 83,060 | 5 | 99.99% |
NonDefault | 5 | 83,060 | 10,192 | 10 | 10,182 | 0.10% | 83,065 | 83,060 | 5 | 99.99% |
yprobs_tx = model.model.predict_proba(Xtx)
yprobs_tx[:2]
array([[0.10826462, 0.89173538], [0.20604209, 0.79395791]])
yprobs_tx.shape
(93257, 2)
# predict_proba columns follow classes_: column 0 is class 0 (default),
# column 1 is class 1 (non-default); keep the probability of being good.
yprobs_tx = yprobs_tx[:, 1]
yprobs_tx[:2]
array([0.89173538, 0.79395791])
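# The column order of predict_proba follows the fitted classes_ attribute,
# which is why column 1 is the probability of the good (1) class:
model.model.classes_   # expected: array([0, 1])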
ytx.head(2)
193817    1
174039    1
Name: good_bad, dtype: int64
ytest.head(2)
good_bad | |
---|---|
193817 | 1 |
174039 | 1 |
ytest['yprobs'] = yprobs_tx
ytest.head()
good_bad | yprobs | |
---|---|---|
193817 | 1 | 0.891735 |
174039 | 1 | 0.793958 |
37506 | 1 | 0.765724 |
182976 | 1 | 0.794416 |
148379 | 1 | 0.849916 |
tr = 0.5 # threshold
ytest['yhat'] = ytest['yprobs'].gt(tr).astype(int)
ytest.head(2)
good_bad | yprobs | yhat | |
---|---|---|---|
193817 | 1 | 0.891735 | 1 |
174039 | 1 | 0.793958 | 1 |
# help(pd.crosstab)
df_confusion = pd.crosstab(ytest['good_bad'],ytest['yhat'],margins=True)
df_confusion
yhat | 0 | 1 | All |
---|---|---|---|
good_bad | |||
0 | 10 | 10182 | 10192 |
1 | 5 | 83060 | 83065 |
All | 15 | 93242 | 93257 |
df_confusion = pd.crosstab(ytest['good_bad'],ytest['yhat'],normalize='index')
# look at diagonal
df_confusion.style.format('{:.2%}')
yhat | 0 | 1 |
---|---|---|
good_bad | ||
0 | 0.10% | 99.90% |
1 | 0.01% | 99.99% |
"""
We have a problem. We get very low accuracy for the
customers who defaults for given threshold of 0.5
Increase the threshold.
""";
tr = 0.9 # too conservative: the model approves too few loans;
# we want fewer defaults, but we also want to give out loans.
ytest['yhat'] = ytest['yprobs'].gt(tr).astype(int)
df_confusion = pd.crosstab(ytest['good_bad'],ytest['yhat'],normalize='index')
# look at diagonal
df_confusion.style.format('{:.2%}')
yhat | 0 | 1 |
---|---|---|
good_bad | ||
0 | 71.37% | 28.63% |
1 | 42.70% | 57.30% |
tr = 0.7
ytest['yhat'] = ytest['yprobs'].gt(tr).astype(int)
df_confusion = pd.crosstab(ytest['good_bad'],ytest['yhat'],normalize='index')
# look at diagonal
df_confusion.style.format('{:.2%}')
yhat | 0 | 1 |
---|---|---|
good_bad | ||
0 | 6.16% | 93.84% |
1 | 1.50% | 98.50% |
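# Rather than trying thresholds one at a time, a small sweep (illustrative)
# makes the default-detection vs non-default-detection trade-off explicit:
for tr in [0.5, 0.6, 0.7, 0.8, 0.9]:
    yhat = ytest['yprobs'].gt(tr).astype(int)
    acc_default = (yhat[ytest['good_bad'] == 0] == 0).mean()
    acc_good = (yhat[ytest['good_bad'] == 1] == 1).mean()
    print(f'tr={tr:.1f}  default detection={acc_default:.2%}  '
          f'non-default detection={acc_good:.2%}')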
def plot_auc(ytx,yprobs_tx):
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(ytx, yprobs_tx)
auc = metrics.roc_auc_score(ytx, yprobs_tx)
plt.plot(fpr,tpr,label=f"AUC={auc:.4f}")
plt.plot(fpr,fpr,ls='--',color='blue',label='Random Guess')
plt.legend(loc=4)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.show()
plot_auc(ytx,yprobs_tx)
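# Given the heavy class imbalance (about 89% good loans), a precision-recall
# view complements the ROC curve; a minimal sketch:
from sklearn.metrics import precision_recall_curve, average_precision_score
prec, rec, _ = precision_recall_curve(ytx, yprobs_tx)
ap = average_precision_score(ytx, yprobs_tx)
plt.plot(rec, prec, label=f'AP={ap:.4f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.legend(loc=3)
plt.show()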
ytest.head(2)
good_bad | yprobs | yhat | |
---|---|---|---|
193817 | 1 | 0.891735 | 1 |
174039 | 1 | 0.793958 | 1 |
ytest = ytest.sort_values('yprobs')
ytest.head()
good_bad | yprobs | yhat | |
---|---|---|---|
42438 | 1 | 0.415615 | 0 |
42295 | 0 | 0.416287 | 0 |
42396 | 0 | 0.419668 | 0 |
42368 | 0 | 0.422580 | 0 |
42398 | 0 | 0.428667 | 0 |
def get_yprobs_sorted_proportions(df_ytest,col_ytrue,col_yprobs):
"""Sort the df_ytest by predicted probabilities and return
dataframe with various proportions.
Parameters:
------------
df_ytest: pd.core.frame.DataFrame
Test dataframe
col_ytrue: str
Name of column for true label.
col_yprobs: str
Name of column for predicted probabilities
"""
df_ytest = df_ytest.sort_values(col_yprobs)
n_test = len(df_ytest)
sum_test = df_ytest[col_ytrue].sum()
df_ytest['cum_n_pop'] = range(1,n_test+1)
df_ytest['cum_n_good'] = df_ytest[col_ytrue].cumsum()
df_ytest['cum_n_bad'] = (df_ytest['cum_n_pop']
- df_ytest['cum_n_good'])
df_ytest['cum_perc_pop'] = df_ytest['cum_n_pop'] / n_test
df_ytest['cum_perc_good'] = df_ytest['cum_n_good'] / sum_test
df_ytest['cum_perc_bad'] = (df_ytest['cum_n_bad']
/ (n_test - sum_test))
return df_ytest
ytest = get_yprobs_sorted_proportions(ytest,'good_bad','yprobs')
ytest.head(20)
good_bad | yprobs | yhat | cum_n_pop | cum_n_good | cum_n_bad | cum_perc_pop | cum_perc_good | cum_perc_bad | |
---|---|---|---|---|---|---|---|---|---|
42438 | 1 | 0.415615 | 0 | 1 | 1 | 0 | 0.000011 | 0.000012 | 0.000000 |
42295 | 0 | 0.416287 | 0 | 2 | 1 | 1 | 0.000021 | 0.000012 | 0.000098 |
42396 | 0 | 0.419668 | 0 | 3 | 1 | 2 | 0.000032 | 0.000012 | 0.000196 |
42368 | 0 | 0.422580 | 0 | 4 | 1 | 3 | 0.000043 | 0.000012 | 0.000294 |
42398 | 0 | 0.428667 | 0 | 5 | 1 | 4 | 0.000054 | 0.000012 | 0.000392 |
42187 | 0 | 0.451400 | 0 | 6 | 1 | 5 | 0.000064 | 0.000012 | 0.000491 |
41887 | 0 | 0.461260 | 0 | 7 | 1 | 6 | 0.000075 | 0.000012 | 0.000589 |
12508 | 0 | 0.483478 | 0 | 8 | 1 | 7 | 0.000086 | 0.000012 | 0.000687 |
41787 | 0 | 0.488549 | 0 | 9 | 1 | 8 | 0.000097 | 0.000012 | 0.000785 |
41033 | 1 | 0.489062 | 0 | 10 | 2 | 8 | 0.000107 | 0.000024 | 0.000785 |
215253 | 0 | 0.491543 | 0 | 11 | 2 | 9 | 0.000118 | 0.000024 | 0.000883 |
134462 | 1 | 0.491717 | 0 | 12 | 3 | 9 | 0.000129 | 0.000036 | 0.000883 |
40781 | 1 | 0.492173 | 0 | 13 | 4 | 9 | 0.000139 | 0.000048 | 0.000883 |
39932 | 0 | 0.496644 | 0 | 14 | 4 | 10 | 0.000150 | 0.000048 | 0.000981 |
13858 | 1 | 0.499887 | 0 | 15 | 5 | 10 | 0.000161 | 0.000060 | 0.000981 |
39810 | 0 | 0.501561 | 0 | 16 | 5 | 11 | 0.000172 | 0.000060 | 0.001079 |
213386 | 1 | 0.503342 | 0 | 17 | 6 | 11 | 0.000182 | 0.000072 | 0.001079 |
154698 | 1 | 0.503370 | 0 | 18 | 7 | 11 | 0.000193 | 0.000084 | 0.001079 |
40291 | 0 | 0.503606 | 0 | 19 | 7 | 12 | 0.000204 | 0.000084 | 0.001177 |
146740 | 1 | 0.505241 | 0 | 20 | 8 | 12 | 0.000214 | 0.000096 | 0.001177 |
def plot_gini(df_ytest_proportions,col_ytrue,col_yprobs):
"""Plot Kolmogorov-Smirnov Curve.
Parameters:
------------
df_ytest_proportions: pd.core.frame.DataFrame
Pandas dataframe with at least two columns:
- cum_perc_pop
- cum_perc_bad
Usage:
-------
df_ytest = get_yprobs_sorted_proportions(
df_ytest,'ytrue','yprobs')
plot_gini(df_ytest,'ytrue','yprobs')
"""
from sklearn import metrics
auc = metrics.roc_auc_score(
df_ytest_proportions[col_ytrue],
df_ytest_proportions[col_yprobs])
gini = 2*auc-1
x = df_ytest_proportions['cum_perc_pop']
y = df_ytest_proportions['cum_perc_bad']
plt.plot(x,y,label=f'Gini = {gini:.4f}')
plt.plot(x,x,ls='--',c='k')
plt.xlabel('Cumulative % Population')
plt.ylabel('Cumulative % Bad')
plt.title('Gini')
plt.legend(loc=2)
plt.show()
plot_gini(ytest,'good_bad','yprobs')
auc = metrics.roc_auc_score(ytx, yprobs_tx)
gini = auc * 2 - 1
print(auc,gini)
0.6977663685387198 0.3955327370774395
ytest.head(2)
good_bad | yprobs | yhat | cum_n_pop | cum_n_good | cum_n_bad | cum_perc_pop | cum_perc_good | cum_perc_bad | |
---|---|---|---|---|---|---|---|---|---|
42438 | 1 | 0.415615 | 0 | 1 | 1 | 0 | 0.000011 | 0.000012 | 0.000000 |
42295 | 0 | 0.416287 | 0 | 2 | 1 | 1 | 0.000021 | 0.000012 | 0.000098 |
def plot_ks(df_ytest_proportions,col_yprobs,
col_cum_perc_good,col_cum_perc_bad):
"""Plot Kolmogorov-Smirnov Curve.
Parameters:
------------
df_ytest_proportions: pd.core.frame.DataFrame
Pandas dataframe with at least three columns:
- yprob
- cum_perc_good
- cum_perc_bad
col_yprobs: str
Name of column for test probabilities
col_cum_perc_good: str
Name of column for cumulative percent for good
col_cum_perc_bad: str
Name of column for cumulative percent for bad
Usage:
-------
df_ytest = get_yprobs_sorted_proportions(
df_ytest,'ytrue','yprobs')
plot_ks(ytest,'yprobs','cum_perc_good','cum_perc_bad')
"""
x = df_ytest_proportions[col_yprobs]
y1 = df_ytest_proportions[col_cum_perc_bad]
y2 = df_ytest_proportions[col_cum_perc_good]
KS = max( df_ytest_proportions[col_cum_perc_bad]
- df_ytest_proportions[col_cum_perc_good]
)
KS = round(KS,4)
plt.plot(x,y1,color='red',label=f'KS = {KS}')
plt.plot(x,y2,color='blue')
plt.xlabel('Estimated Probability for being Good')
plt.ylabel('Cumulative %')
plt.title('Kolmogorov-Smirnov')
plt.legend(loc=2)
plt.show()
plot_ks(ytest,'yprobs','cum_perc_good','cum_perc_bad')
KS = max(ytest['cum_perc_bad'] - ytest['cum_perc_good'])
KS
0.29042659868701864
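# Cross-check (illustrative): the two-sample KS statistic between the
# predicted-probability distributions of bads and goods should match the
# curve-based KS above.
probs_good = ytest.loc[ytest['good_bad'] == 1, 'yprobs']
probs_bad = ytest.loc[ytest['good_bad'] == 0, 'yprobs']
stats.ks_2samp(probs_bad, probs_good).statistic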
df_summary.round(2).head(10)
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
0 | intercept | -0.64 | NaN | intercept | 1 |
1 | grade:A | 1.12 | 0.00 | grade | 6 |
2 | grade:B | 0.78 | 0.00 | grade | 6 |
3 | grade:C | 0.58 | 0.00 | grade | 6 |
4 | grade:D | 0.41 | 0.00 | grade | 6 |
5 | grade:E | 0.28 | 0.00 | grade | 6 |
6 | grade:F | 0.12 | 0.01 | grade | 6 |
7 | addr_state:NM_VA | 0.00 | 0.92 | addr_state | 10 |
8 | addr_state:OK_TN_MO_LA_MD_NC | 0.02 | 0.38 | addr_state | 10 |
9 | addr_state:UT_KY_AZ_NJ | 0.03 | 0.20 | addr_state | 10 |
min_score = 300
max_score = 850
range_score = max_score - min_score
range_score
550
df_summary.groupby('orig_feature')['coef'].min()
orig_feature
addr_state                     0.002845
annual_inc                    -0.122785
dti                           -0.080395
emp_length                     0.193470
grade                          0.121276
initial_list_status            0.044801
inq_last_6mths                 0.202110
installment                   -0.675995
int_rate                       0.135878
intercept                     -0.643885
mths_since_earliest_cr_line    0.036126
mths_since_issue_d            -0.010289
mths_since_last_delinq        -0.064225
mths_since_last_record        -0.207305
open_acc                      -0.150863
purpose                        0.311027
term                          -0.075838
total_rev_hi_lim               0.163580
verification_status           -0.054779
Name: coef, dtype: float64
min_sum_coef = df_summary.groupby('orig_feature')['coef'].min().sum()
max_sum_coef = df_summary.groupby('orig_feature')['coef'].max().sum()
range_sum_coef = max_sum_coef - min_sum_coef
min_sum_coef, max_sum_coef, range_sum_coef
(-0.8752439586235148, 5.762234944096122, 6.637478902719637)
df_scorecard = df_summary.drop('n_cats',axis=1)
range_score = max_score - min_score
range_sum_coef = max_sum_coef - min_sum_coef
factor = range_score / range_sum_coef
df_scorecard['score_raw'] = df_scorecard['coef'] * factor
df_scorecard.head()
feature | coef | p_value | orig_feature | score_raw | |
---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | -53.354130 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 93.031247 |
2 | grade:B | 0.777610 | 1.881894e-38 | grade | 64.434920 |
3 | grade:C | 0.577341 | 1.145423e-25 | grade | 47.840095 |
4 | grade:D | 0.414289 | 1.174786e-15 | grade | 34.329117 |
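# Worked check of the scaling: the factor converts log-odds into score
# points, about 82.86 points per unit of coefficient.
print(factor)              # 82.8629...
print(1.122714 * factor)   # 93.0312..., grade:A's score_raw above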
df_scorecard['score_raw'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x123d78208>
# rescale the intercept so the minimum possible coefficient sum maps to min_score
intercept = df_scorecard['coef'][0]
intercept_diff = intercept - min_sum_coef
intercept_score = ((intercept_diff / range_sum_coef) * range_score
+ min_score)
intercept_score
319.17102369468336
df_scorecard.loc[0, 'score_raw'] = intercept_score
df_scorecard['score_prel'] = df_scorecard['score_raw'].round()
df_scorecard.head()
feature | coef | p_value | orig_feature | score_raw | score_prel | |
---|---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | 319.171024 | 319.0 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 93.031247 | 93.0 |
2 | grade:B | 0.777610 | 1.881894e-38 | grade | 64.434920 | 64.0 |
3 | grade:C | 0.577341 | 1.145423e-25 | grade | 47.840095 | 48.0 |
4 | grade:D | 0.414289 | 1.174786e-15 | grade | 34.329117 | 34.0 |
df_scorecard['score_prel'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1253a1518>
min_sum_score_prel = df_scorecard.groupby('orig_feature')['score_prel'].min().sum()
min_sum_score_prel
300.0
max_sum_score_prel = df_scorecard.groupby('orig_feature')['score_prel'].max().sum()
max_sum_score_prel
849.0
df_scorecard['score'] = df_scorecard['score_prel']
df_scorecard['score'].describe()
count    129.000000
mean      13.883721
std       38.051903
min      -56.000000
25%        0.000000
50%       14.000000
75%       26.000000
max      319.000000
Name: score, dtype: float64
df_scorecard['score'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x151554208>
# if all scores fall between 300 and 850, we don't have to modify anything.
# but suppose rounding accumulation pushes the max score to 851;
# then we subtract 1 from the category with the largest rounding difference
# (a loop version of this adjustment is sketched below the commented cells).
# find the variable with the maximum difference
df_scorecard['score_diff'] = df_scorecard['score_prel'] - df_scorecard['score_raw']
df_scorecard.head()
feature | coef | p_value | orig_feature | score_raw | score_prel | score_diff | |
---|---|---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | 319.171024 | 319.0 | -0.171024 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 93.031247 | 93.0 | -0.031247 |
2 | grade:B | 0.777610 | 1.881894e-38 | grade | 64.434920 | 64.0 | -0.434920 |
3 | grade:C | 0.577341 | 1.145423e-25 | grade | 47.840095 | 48.0 | 0.159905 |
4 | grade:D | 0.414289 | 1.174786e-15 | grade | 34.329117 | 34.0 | -0.329117 |
idx_max_diff = df_scorecard['score_diff'].argmax()
idx_max_diff
22
df_scorecard.loc[idx_max_diff]
feature         purpose:home_improvement__major_purchase__car
coef                                                  0.452658
p_value                                            4.66353e-41
orig_feature                                           purpose
score_raw                                              37.5085
score_prel                                                  38
score_diff                                            0.491458
Name: 22, dtype: object
new_value = df_scorecard.loc[idx_max_diff]['score_prel'] - 1
new_value
37.0
# df_scorecard['score'][idx_max_diff] = new_value
# df_scorecard.head(10)
# min_sum_score_prel = df_scorecard.groupby('orig_feature')['score'].min().sum()
# min_sum_score_prel
# max_sum_score_prel = df_scorecard.groupby('orig_feature')['score_prel'].max().sum()
# max_sum_score_prel
# df_scorecard['score'].hist()
# we have both min and max between 300 and 850
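# A minimal sketch (assumption: not run in the original) of that rounding
# adjustment as a loop: while the best possible total exceeds max_score,
# shave 1 point from the category whose rounding inflated it the most.
while df_scorecard.groupby('orig_feature')['score'].max().sum() > max_score:
    idx = (df_scorecard['score'] - df_scorecard['score_raw']).idxmax()
    df_scorecard.loc[idx, 'score'] -= 1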
df_scorecard.to_csv(dat_pro + 'df_scorecard.csv')
Xtx.head(2)
grade:A | grade:B | grade:C | grade:D | grade:E | grade:F | addr_state:NM_VA | addr_state:OK_TN_MO_LA_MD_NC | addr_state:UT_KY_AZ_NJ | addr_state:AR_MI_PA_OH_MN | ... | dti:30.8_31.5 | dti:31.5_32.2 | dti:32.2_32.9 | dti:32.9_33.6 | mths_since_last_record:missing | mths_since_last_record:0_2 | mths_since_last_record:3_22 | mths_since_last_record:23_46 | mths_since_last_record:47_68 | mths_since_last_record:69_85 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
193817 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
174039 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 rows × 128 columns
df_scorecard.head(2)
feature | coef | p_value | orig_feature | score_raw | score_prel | score_diff | score | |
---|---|---|---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | 319.171024 | 319.0 | -0.171024 | 319.0 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 93.031247 | 93.0 | -0.031247 | 93.0 |
Xtx.insert(0, 'intercept', 1)
Xtx.head(2)
intercept | grade:A | grade:B | grade:C | grade:D | grade:E | grade:F | addr_state:NM_VA | addr_state:OK_TN_MO_LA_MD_NC | addr_state:UT_KY_AZ_NJ | ... | dti:30.8_31.5 | dti:31.5_32.2 | dti:32.2_32.9 | dti:32.9_33.6 | mths_since_last_record:missing | mths_since_last_record:0_2 | mths_since_last_record:3_22 | mths_since_last_record:23_46 | mths_since_last_record:47_68 | mths_since_last_record:69_85 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
193817 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
174039 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 rows × 129 columns
scorecard_scores = df_scorecard['score']
scorecard_scores.shape
(129,)
scorecard_scores = scorecard_scores.to_numpy().reshape(-1,1)
scorecard_scores.shape
(129, 1)
# each applicant's score = intercept score + points of the active dummies
y_scores = Xtx.dot(scorecard_scores)
y_scores.hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x151bb50b8>]], dtype=object)
y_scores.describe()
0 | |
---|---|
count | 93257.000000 |
mean | 564.329820 |
std | 65.480216 |
min | 343.000000 |
25% | 517.000000 |
50% | 560.000000 |
75% | 608.000000 |
max | 795.000000 |
# invert the linear score scaling back to a coefficient sum (log-odds) ...
sum_coef_from_score = ((y_scores - min_score) / (max_score - min_score)) * (max_sum_coef - min_sum_coef) + min_sum_coef
# ... then apply the sigmoid to recover probabilities
y_hat_proba_from_score = np.exp(sum_coef_from_score) / (np.exp(sum_coef_from_score) + 1)
y_hat_proba_from_score.sort_index().head()
0 | |
---|---|
12 | 0.770749 |
15 | 0.685269 |
21 | 0.901518 |
24 | 0.909763 |
25 | 0.928520 |
ytest.sort_index().head()
good_bad | yprobs | yhat | cum_n_pop | cum_n_good | cum_n_bad | cum_perc_pop | cum_perc_good | cum_perc_bad | |
---|---|---|---|---|---|---|---|---|---|
12 | 0 | 0.774833 | 1 | 7266 | 5312 | 1954 | 0.077914 | 0.063950 | 0.191719 |
15 | 1 | 0.689977 | 0 | 1497 | 988 | 509 | 0.016052 | 0.011894 | 0.049941 |
21 | 0 | 0.903478 | 1 | 44565 | 37110 | 7455 | 0.477873 | 0.446759 | 0.731456 |
24 | 0 | 0.909839 | 1 | 48035 | 40245 | 7790 | 0.515082 | 0.484500 | 0.764325 |
25 | 1 | 0.929176 | 1 | 59471 | 50721 | 8750 | 0.637711 | 0.610618 | 0.858516 |
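# Illustrative check: the recovered probabilities differ from the model's
# yprobs only through integer rounding of the scores, so the gaps are small.
(y_hat_proba_from_score[0] - ytest['yprobs']).abs().max()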
ytx.head(2)
193817    1
174039    1
Name: good_bad, dtype: int64
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(ytx, yprobs_tx)
df_cutoffs = pd.DataFrame({
'thresholds': thresholds,
'fpr': fpr,
'tpr': tpr
})
df_cutoffs.head()
thresholds | fpr | tpr | |
---|---|---|---|
0 | 1.994019 | 0.000000 | 0.000000 |
1 | 0.994019 | 0.000000 | 0.000012 |
2 | 0.990552 | 0.000000 | 0.000590 |
3 | 0.990545 | 0.000098 | 0.000590 |
4 | 0.989683 | 0.000098 | 0.001192 |
# sklearn sets thresholds[0] above 1 by convention; clip it just below 1
# so the log-odds transform below stays finite
df_cutoffs.loc[0, 'thresholds'] = 1 - 1 / np.power(10, 16)
df_cutoffs.head()
thresholds | fpr | tpr | |
---|---|---|---|
0 | 1.000000 | 0.000000 | 0.000000 |
1 | 0.994019 | 0.000000 | 0.000012 |
2 | 0.990552 | 0.000000 | 0.000590 |
3 | 0.990545 | 0.000098 | 0.000590 |
4 | 0.989683 | 0.000098 | 0.001192 |
# map each probability threshold to a score: log-odds rescaled to [min_score, max_score]
df_cutoffs['Score'] = (
(np.log(df_cutoffs['thresholds']
/ (1 - df_cutoffs['thresholds'])
) - min_sum_coef
) * (
(max_score - min_score) / (max_sum_coef - min_sum_coef)
) + min_score).round()
df_cutoffs.head()
thresholds | fpr | tpr | Score | |
---|---|---|---|---|
0 | 1.000000 | 0.000000 | 0.000000 | 3417.0 |
1 | 0.994019 | 0.000000 | 0.000012 | 796.0 |
2 | 0.990552 | 0.000000 | 0.000590 | 758.0 |
3 | 0.990545 | 0.000098 | 0.000590 | 758.0 |
4 | 0.989683 | 0.000098 | 0.001192 | 751.0 |
# cap the artificial first cutoff at the maximum score
df_cutoffs.loc[0, 'Score'] = max_score
ytest.head(2)
good_bad | yprobs | yhat | cum_n_pop | cum_n_good | cum_n_bad | cum_perc_pop | cum_perc_good | cum_perc_bad | |
---|---|---|---|---|---|---|---|---|---|
42438 | 1 | 0.415615 | 0 | 1 | 1 | 0 | 0.000011 | 0.000012 | 0.000000 |
42295 | 0 | 0.416287 | 0 | 2 | 1 | 1 | 0.000021 | 0.000012 | 0.000098 |
def n_approved(p):
return np.where(ytest['yprobs'] >= p, 1, 0).sum()
n_ytest = ytest.shape[0]
df_cutoffs['N Approved'] = df_cutoffs['thresholds'].apply(n_approved)
df_cutoffs['N Rejected'] = n_ytest - df_cutoffs['N Approved']
df_cutoffs['Approval Rate'] = df_cutoffs['N Approved'] / n_ytest
df_cutoffs['Rejection Rate'] = 1 - df_cutoffs['Approval Rate']
df_cutoffs.head()
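# With the cutoff table in place, one might (illustrative) read off the
# score cutoff closest to a target approval rate, say 85%:
df_cutoffs.loc[(df_cutoffs['Approval Rate'] - 0.85).abs().idxmin()]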