%load_ext autoreload
%autoreload 2
# my personal library
from bhishan import bp
import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# random state
SEED=100
[(x.__name__,x.__version__) for x in [np,pd,sns]]
[('numpy', '1.18.1'), ('pandas', '1.0.1'), ('seaborn', '0.9.0')]
pd.options.display.max_rows = 100
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as stats
class LogisticRegressionWithPvalues:
    """Logistic Regression with p-values.
    Usage:
    model = LogisticRegressionWithPvalues(
        n_jobs=-1,solver='lbfgs',random_state=SEED)
    model.fit(Xtrain,ytrain)
    model.model.predict(Xtest) #!! NOT: model.predict(Xtest)
    """
    def __init__(self,*args,**kwargs):
        self.model = LogisticRegression(*args,**kwargs)
    def fit(self,X,y):
        self.model.fit(X,y)
        # 1/denom equals the logistic variance p*(1-p), because
        # 1/(2*(1+cosh(z))) == sigmoid(z)*(1-sigmoid(z))
        denom = (2.0 * (1.0 +
            np.cosh(self.model.decision_function(X))))
        denom = np.tile(denom,(X.shape[1],1)).T
        # Fisher information matrix: X.T @ diag(p*(1-p)) @ X
        F_ij = np.dot((X / denom).T,X)
        # Cramer-Rao bound: the inverse Fisher information is the
        # covariance of the coefficient estimates
        Cramer_Rao = np.linalg.inv(F_ij)
        # standard errors from the diagonal of the covariance
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates
        # two-sided p-values from the standard normal
        p_values = [stats.norm.sf(abs(x)) * 2 for x in z_scores]
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        self.p_values = p_values
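# Sanity check (illustrative, not part of the original class): the denom
# term above really is the reciprocal of the logistic variance p*(1-p).
z = np.linspace(-5, 5, 11)
p = 1.0 / (1.0 + np.exp(-z))
assert np.allclose(1.0 / (2.0 * (1.0 + np.cosh(z))), p * (1.0 - p))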
!ls ../data/processed
Xtest.csv Xtrain.csv dummy_variables.xlsx ytest.csv ytrain.csv
dat_pro = '../data/processed/'
Xtrain = pd.read_csv(dat_pro + 'Xtrain.csv',index_col=0)
ytrain = pd.read_csv(dat_pro + 'ytrain.csv',index_col=0)
Xtest = pd.read_csv(dat_pro + 'Xtest.csv',index_col=0)
ytest = pd.read_csv(dat_pro + 'ytest.csv',index_col=0)
Xtrain.shape, ytrain.shape,Xtest.shape,ytest.shape
((373028, 351), (373028, 1), (93257, 351), (93257, 1))
Xtrain.head(2)
Unnamed: 0.1 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | ... | dti:32.2_32.9 | dti:32.9_33.6 | dti:>=33.6 | mths_since_last_record:missing | mths_since_last_record:0_2 | mths_since_last_record:3_22 | mths_since_last_record:23_46 | mths_since_last_record:47_68 | mths_since_last_record:69_85 | mths_since_last_record:>=86 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
366463 | 366463 | 18514656 | 20677353 | 4000 | 4000 | 4000.0 | 36 months | 16.29 | 141.21 | D | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
409215 | 409215 | 14388577 | 16430959 | 10000 | 10000 | 10000.0 | 36 months | 11.99 | 332.10 | B | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 rows × 351 columns
ytrain['good_bad'].value_counts()
1    332252
0     40776
Name: good_bad, dtype: int64
xls = pd.ExcelFile(dat_pro + 'dummy_variables.xlsx')
print(xls.sheet_names)
df_features = xls.parse(0,header=None,names=['feature'])
df_references = xls.parse(1,header=None,names=['reference'])
['dummy', 'reference']
print(df_features.shape)
df_features.head(2).append(df_features.tail(2))
(164, 1)
feature | |
---|---|
0 | grade:A |
1 | grade:B |
162 | mths_since_last_record:69_85 |
163 | mths_since_last_record:>=86 |
df_features['orig_feature'] = df_features['feature'].str.split(':').str[0]
df_features.sample(5)
feature | orig_feature | |
---|---|---|
132 | dti:7_9 | dti |
43 | mths_since_issue_d:71_75 | mths_since_issue_d |
149 | dti:25.9_29.4 | dti |
113 | annual_inc:70k_80k | annual_inc |
144 | dti:21_21.7 | dti |
print(df_references.shape)
df_references.head(2).append(df_references.tail(2))
(23, 1)
reference | |
---|---|
0 | grade:G |
1 | home_ownership:ANY_OTHER_NONE_RENT |
21 | dti:>=33.6 |
22 | mths_since_last_record:>=86 |
features_all = df_features['feature'].to_numpy()
features_ref = df_references['reference'].to_numpy()
features = [i for i in features_all if i not in features_ref]
print(len(features))
142
# make sure we have required features in Xtrain and Xtest
Xtrain_features = Xtrain.columns.to_numpy()
Xtest_features = Xtest.columns.to_numpy()
missing_train = [i for i in features if i not in Xtrain_features]
missing_test = [i for i in features if i not in Xtest_features]
missing_train, missing_test
([], [])
Xtr = Xtrain[features]
Xtx = Xtest[features]
ytr = ytrain['good_bad']
ytx = ytest['good_bad']
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
model = LogisticRegression()
model
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
# always use a seed for reproducibility
model = LogisticRegression(n_jobs=-1, random_state=SEED,solver='lbfgs')
%%time
model.fit(Xtr,ytr);
CPU times: user 384 ms, sys: 435 ms, total: 819 ms
Wall time: 1min 40s
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=100, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
model.intercept_
array([-0.44301985])
all_coeffs = np.append(model.intercept_, model.coef_.ravel())
all_coeffs[:2]
array([-0.44301985, 1.1145688 ])
df_summary = pd.DataFrame({
'feature': ['intercept'] + features,
'coefficient': all_coeffs
})
df_summary.head()
feature | coefficient | |
---|---|---|
0 | intercept | -0.443020 |
1 | grade:A | 1.114569 |
2 | grade:B | 0.775111 |
3 | grade:C | 0.577956 |
4 | grade:D | 0.418663 |
%%time
model = LogisticRegressionWithPvalues(n_jobs=-1,solver='lbfgs',random_state=SEED)
model.fit(Xtr,ytr);
CPU times: user 1.9 s, sys: 1.34 s, total: 3.24 s
Wall time: 1min 41s
all_coeffs = np.append(model.model.intercept_, model.model.coef_.ravel())
all_coeffs[:2]
array([-0.44301985, 1.1145688 ])
all_pvals = np.append(np.nan, model.p_values)
all_pvals[:2]
array([ nan, 2.89290871e-48])
df_summary = pd.DataFrame({
'feature': ['intercept'] + features,
'coef': all_coeffs,
'p_value': all_pvals
})
df_summary.head()
feature | coef | p_value | |
---|---|---|---|
0 | intercept | -0.443020 | NaN |
1 | grade:A | 1.114569 | 2.892909e-48 |
2 | grade:B | 0.775111 | 2.336615e-36 |
3 | grade:C | 0.577956 | 4.185329e-24 |
4 | grade:D | 0.418663 | 1.077725e-14 |
df_summary['orig_feature'] = df_summary['feature'].str.split(':').str[0]
df_summary.head(2)
feature | coef | p_value | orig_feature | |
---|---|---|---|---|
0 | intercept | -0.443020 | NaN | intercept |
1 | grade:A | 1.114569 | 2.892909e-48 | grade |
df_summary['n_cats'] = df_summary.groupby('orig_feature')['feature'].transform('count')
df_summary.head(3)
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
0 | intercept | -0.443020 | NaN | intercept | 1 |
1 | grade:A | 1.114569 | 2.892909e-48 | grade | 6 |
2 | grade:B | 0.775111 | 2.336615e-36 | grade | 6 |
# if any category of a feature has a significant p-value, keep the whole feature.
df_summary_significant = df_summary.groupby('orig_feature').filter(
lambda x: any(x['p_value']<0.05))
df_summary_significant.head(2)
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
1 | grade:A | 1.114569 | 2.892909e-48 | grade | 6 |
2 | grade:B | 0.775111 | 2.336615e-36 | grade | 6 |
df_summary.shape[0] - df_summary_significant.shape[0]
15
# features whose categories are all insignificant
df_insig = df_summary[~df_summary.index.isin(df_summary_significant.index)]
df_insig
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
0 | intercept | -0.443020 | NaN | intercept | 1 |
7 | home_ownership:RENT_OTHER_NONE_ANY | -0.226803 | 0.670968 | home_ownership | 3 |
8 | home_ownership:OWN | -0.114844 | 0.829748 | home_ownership | 3 |
9 | home_ownership:MORTGAGE | -0.086809 | 0.870873 | home_ownership | 3 |
47 | delinq_2yrs:1_3 | -0.027512 | 0.309531 | delinq_2yrs | 2 |
48 | delinq_2yrs:>=3 | -0.094025 | 0.095610 | delinq_2yrs | 2 |
56 | pub_rec:1 | -0.039595 | 0.940252 | pub_rec | 4 |
57 | pub_rec:2 | -0.096114 | 0.856153 | pub_rec | 4 |
58 | pub_rec:3 | -0.002483 | 0.996320 | pub_rec | 4 |
59 | pub_rec:>3 | -0.073385 | 0.892027 | pub_rec | 4 |
60 | total_acc:6.24_21.84 | 0.009303 | 0.827957 | total_acc | 4 |
61 | total_acc:21.84_37.44 | 0.008958 | 0.842667 | total_acc | 4 |
62 | total_acc:37.44_40.56 | -0.009261 | 0.862913 | total_acc | 4 |
63 | total_acc:>=40.56 | -0.027960 | 0.572622 | total_acc | 4 |
64 | acc_now_delinq:>=1 | -0.026172 | 0.984648 | acc_now_delinq | 1 |
# one row per original feature; [1:] skips the intercept row
features_drop = df_insig.drop_duplicates(subset='orig_feature')['orig_feature'].to_numpy()[1:]
drop1 = features_drop.tolist()
drop1
['home_ownership', 'delinq_2yrs', 'pub_rec', 'total_acc', 'acc_now_delinq']
pd.options.display.max_rows = 200
# for the surviving features, check what fraction of their categories is insignificant.
df_tmp = (df_summary_significant
.drop('coef',axis=1)
.query('p_value > 0.05')
.assign(
insig =
lambda dfx: dfx.groupby('orig_feature')
['feature'].transform('count'))
.drop_duplicates(subset=['orig_feature'])
.assign(insig_ratio = lambda x: x['insig'] / x['n_cats'])
)
df_tmp
feature | p_value | orig_feature | n_cats | insig | insig_ratio | |
---|---|---|---|---|---|---|
10 | addr_state:NM_VA | 0.729884 | addr_state | 10 | 4 | 0.400000 |
20 | verification_status:Not Verified | 0.696505 | verification_status | 2 | 1 | 0.500000 |
40 | mths_since_issue_d:94_128 | 0.952232 | mths_since_issue_d | 6 | 1 | 0.166667 |
45 | mths_since_earliest_cr_line:208_350 | 0.069849 | mths_since_earliest_cr_line | 2 | 1 | 0.500000 |
52 | open_acc:4_5 | 0.596992 | open_acc | 4 | 1 | 0.250000 |
91 | annual_inc:30k_40k | 0.535520 | annual_inc | 11 | 1 | 0.090909 |
101 | mths_since_last_delinq:0_3 | 0.321859 | mths_since_last_delinq | 4 | 1 | 0.250000 |
106 | dti:0.7_1.4 | 0.160640 | dti | 32 | 11 | 0.343750 |
137 | mths_since_last_record:missing | 0.902412 | mths_since_last_record | 6 | 2 | 0.333333 |
"""
Look all suspected features one by one.
for dti, out of 32, 23 are insignificant, drop it.
mths since last record has 6 cats, but 5 are insignificant, drop it.
drop these:
['delinq_2yrs', 'pub_rec', 'total_acc', 'acc_now_delinq',
'dti', 'mths_since_last_record']
""";
drop2 = df_tmp[df_tmp['insig_ratio']>0.6]['orig_feature'].to_numpy().tolist()
drop2
[]
orig_feature_drop = drop1 + drop2
orig_feature_drop
['home_ownership', 'delinq_2yrs', 'pub_rec', 'total_acc', 'acc_now_delinq']
df_features.head(2)
feature | orig_feature | |
---|---|---|
0 | grade:A | grade |
1 | grade:B | grade |
features_drop = df_features[df_features['orig_feature'].isin(orig_feature_drop)]['feature'].to_numpy()
features_drop
array(['home_ownership:RENT_OTHER_NONE_ANY', 'home_ownership:OWN', 'home_ownership:MORTGAGE', 'delinq_2yrs:0', 'delinq_2yrs:1_3', 'delinq_2yrs:>=3', 'pub_rec:0', 'pub_rec:1', 'pub_rec:2', 'pub_rec:3', 'pub_rec:>3', 'total_acc:0_6.24', 'total_acc:6.24_21.84', 'total_acc:21.84_37.44', 'total_acc:37.44_40.56', 'total_acc:>=40.56', 'acc_now_delinq:0', 'acc_now_delinq:>=1'], dtype=object)
features_pval = [i for i in features_all
if i not in features_ref
if i not in features_drop
]
len(features_pval), len(features), len(features_all)
(128, 142, 164)
Xtr = Xtrain[features_pval]
Xtx = Xtest[features_pval]
ytr = ytrain['good_bad']
ytx = ytest['good_bad']
%%time
model = LogisticRegressionWithPvalues(n_jobs=-1,solver='lbfgs',
random_state=SEED)
model.fit(Xtr,ytr);
CPU times: user 1.54 s, sys: 933 ms, total: 2.47 s
Wall time: 1min 28s
all_coeffs = np.append(model.intercept_, model.coef_.ravel())
all_pvals = np.append(np.nan, model.p_values)
df_summary = pd.DataFrame({
'feature': ['intercept'] + features_pval,
'coef': all_coeffs,
'p_value': all_pvals
})
df_summary['orig_feature'] = df_summary['feature'].str.split(':').str[0]
df_summary['n_cats'] = df_summary.groupby('orig_feature')['feature'].transform('count')
df_summary.head()
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | 1 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 6 |
2 | grade:B | 0.777610 | 1.881894e-38 | grade | 6 |
3 | grade:C | 0.577341 | 1.145423e-25 | grade | 6 |
4 | grade:D | 0.414289 | 1.174786e-15 | grade | 6 |
df_summary.query('p_value > 0.05')
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
7 | addr_state:NM_VA | 0.002845 | 0.922274 | addr_state | 10 |
8 | addr_state:OK_TN_MO_LA_MD_NC | 0.016196 | 0.383998 | addr_state | 10 |
9 | addr_state:UT_KY_AZ_NJ | 0.026434 | 0.203075 | addr_state | 10 |
11 | addr_state:RI_MA_DE_SD_IN | 0.046824 | 0.077776 | addr_state | 10 |
17 | verification_status:Not Verified | -0.007047 | 0.646578 | verification_status | 2 |
37 | mths_since_issue_d:94_128 | -0.010289 | 0.824434 | mths_since_issue_d | 6 |
47 | open_acc:4_5 | 0.029303 | 0.488598 | open_acc | 4 |
77 | annual_inc:30k_40k | -0.035252 | 0.370798 | annual_inc | 11 |
87 | mths_since_last_delinq:0_3 | -0.064225 | 0.164137 | mths_since_last_delinq | 4 |
92 | dti:0.7_1.4 | 0.122486 | 0.178167 | dti | 32 |
112 | dti:21.7_22.4 | 0.043443 | 0.351534 | dti | 32 |
113 | dti:22.4_23.1 | 0.058822 | 0.210731 | dti | 32 |
114 | dti:23.1_25.2 | 0.025895 | 0.510759 | dti | 32 |
115 | dti:25.2_25.9 | -0.024075 | 0.632538 | dti | 32 |
116 | dti:25.9_29.4 | -0.039683 | 0.303729 | dti | 32 |
117 | dti:29.4_30.1 | -0.080395 | 0.148416 | dti | 32 |
118 | dti:30.1_30.8 | 0.001336 | 0.982654 | dti | 32 |
119 | dti:30.8_31.5 | 0.000775 | 0.990123 | dti | 32 |
120 | dti:31.5_32.2 | 0.127653 | 0.057800 | dti | 32 |
121 | dti:32.2_32.9 | -0.033026 | 0.619906 | dti | 32 |
122 | dti:32.9_33.6 | 0.037537 | 0.591143 | dti | 32 |
# every remaining feature group keeps at least one significant category.
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
ypreds_tx = model.model.predict(Xtx)
df_eval = bp.get_binary_classification_scalar_metrics(
"Logistic Regression",
model.model,
Xtx,ytx,
ypreds_tx,
desc="Features selected from p-values", df_eval=None,show=False)
df_eval
Model | Description | Accuracy | Precision | Recall | F1 | Mathews_Correlation_Coefficient | Cohens_Kappa | Area_Under_Precision_Curve | Area_Under_ROC_Curve | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | Features selected from p-values | 0.890764 | 0.8908 | 0.99994 | 0.94222 | 0.022659 | 0.001639 | 0.946481 | 0.697766 |
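# bp is a personal library; a minimal sklearn-only sketch (assumption:
# reproduces the same scalar metrics as the row above, using model, Xtx,
# ytx, ypreds_tx from the cells above):
from sklearn.metrics import (matthews_corrcoef, cohen_kappa_score,
                             average_precision_score, roc_auc_score)
yprobs = model.model.predict_proba(Xtx)[:, 1]
pd.DataFrame([{
    'Accuracy': metrics.accuracy_score(ytx, ypreds_tx),
    'Precision': metrics.precision_score(ytx, ypreds_tx),
    'Recall': metrics.recall_score(ytx, ypreds_tx),
    'F1': metrics.f1_score(ytx, ypreds_tx),
    'Mathews_Correlation_Coefficient': matthews_corrcoef(ytx, ypreds_tx),
    'Cohens_Kappa': cohen_kappa_score(ytx, ypreds_tx),
    'Area_Under_Precision_Curve': average_precision_score(ytx, yprobs),
    'Area_Under_ROC_Curve': roc_auc_score(ytx, yprobs),
}])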
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy: ', accuracy_score(ytx,ypreds_tx))
print('Precision: ', precision_score(ytx,ypreds_tx))
print('Recall: ', recall_score(ytx,ypreds_tx))
print('F1-score: ', f1_score(ytx,ypreds_tx))
Accuracy:  0.8907642321756007
Precision: 0.8908002831342099
Recall:    0.9999398061758864
F1-score:  0.942220104703727
from sklearn.metrics import classification_report
print(classification_report(ytx, ypreds_tx))
              precision    recall  f1-score   support

           0       0.67      0.00      0.00     10192
           1       0.89      1.00      0.94     83065

    accuracy                           0.89     93257
   macro avg       0.78      0.50      0.47     93257
weighted avg       0.87      0.89      0.84     93257
from sklearn.metrics import confusion_matrix
confusion_matrix(ytx, ypreds_tx)
array([[   10, 10182],
       [    5, 83060]])
bp.print_confusion_matrix('Logistic Regression',
ytx,ypreds_tx,
zero='Default',one='NonDefault')
Predicted_Default | PredictedNonDefault | Total_Default | Correct_Default | Incorrect_Default | Default_Detection | Total_NonDefault | Correct_NonDefault | Incorrect_NonDefault | NonDefault_Detection | |
---|---|---|---|---|---|---|---|---|---|---|
Default | 10 | 10,182 | 10,192 | 10 | 10,182 | 0.10% | 83,065 | 83,060 | 5 | 99.99% |
NonDefault | 5 | 83,060 | 10,192 | 10 | 10,182 | 0.10% | 83,065 | 83,060 | 5 | 99.99% |
yprobs_tx = model.model.predict_proba(Xtx)
yprobs_tx[:2]
array([[0.10826462, 0.89173538], [0.20604209, 0.79395791]])
yprobs_tx.shape
(93257, 2)
# predict_proba columns follow classes_: column 0 is class 0 (default),
# column 1 is class 1 (non-default); keep the probability of being good.
yprobs_tx = yprobs_tx[:, 1]
yprobs_tx[:2]
array([0.89173538, 0.79395791])
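# The column order of predict_proba follows the fitted classes_ attribute,
# which is why column 1 is the probability of the good (1) class:
model.model.classes_   # expected: array([0, 1])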
ytx.head(2)
193817    1
174039    1
Name: good_bad, dtype: int64
ytest.head(2)
good_bad | |
---|---|
193817 | 1 |
174039 | 1 |
ytest['yprobs'] = yprobs_tx
ytest.head()
good_bad | yprobs | |
---|---|---|
193817 | 1 | 0.891735 |
174039 | 1 | 0.793958 |
37506 | 1 | 0.765724 |
182976 | 1 | 0.794416 |
148379 | 1 | 0.849916 |
tr = 0.5 # threshold
ytest['yhat'] = ytest['yprobs'].gt(tr).astype(int)
ytest.head(2)
good_bad | yprobs | yhat | |
---|---|---|---|
193817 | 1 | 0.891735 | 1 |
174039 | 1 | 0.793958 | 1 |
# help(pd.crosstab)
df_confusion = pd.crosstab(ytest['good_bad'],ytest['yhat'],margins=True)
df_confusion
yhat | 0 | 1 | All |
---|---|---|---|
good_bad | |||
0 | 10 | 10182 | 10192 |
1 | 5 | 83060 | 83065 |
All | 15 | 93242 | 93257 |
df_confusion = pd.crosstab(ytest['good_bad'],ytest['yhat'],normalize='index')
# look at diagonal
df_confusion.style.format('{:.2%}')
yhat | 0 | 1 |
---|---|---|
good_bad | ||
0 | 0.10% | 99.90% |
1 | 0.01% | 99.99% |
"""
We have a problem. We get very low accuracy for the
customers who defaults for given threshold of 0.5
Increase the threshold.
""";
tr = 0.9 # too conservative: the model approves too few loans;
# we want fewer defaults, but we also want to give out loans.
ytest['yhat'] = ytest['yprobs'].gt(tr).astype(int)
df_confusion = pd.crosstab(ytest['good_bad'],ytest['yhat'],normalize='index')
# look at diagonal
df_confusion.style.format('{:.2%}')
yhat | 0 | 1 |
---|---|---|
good_bad | ||
0 | 71.37% | 28.63% |
1 | 42.70% | 57.30% |
tr = 0.7
ytest['yhat'] = ytest['yprobs'].gt(tr).astype(int)
df_confusion = pd.crosstab(ytest['good_bad'],ytest['yhat'],normalize='index')
# look at diagonal
df_confusion.style.format('{:.2%}')
yhat | 0 | 1 |
---|---|---|
good_bad | ||
0 | 6.16% | 93.84% |
1 | 1.50% | 98.50% |
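# Rather than trying thresholds one at a time, a small sweep (illustrative)
# makes the default-detection vs non-default-detection trade-off explicit:
for tr in [0.5, 0.6, 0.7, 0.8, 0.9]:
    yhat = ytest['yprobs'].gt(tr).astype(int)
    acc_default = (yhat[ytest['good_bad'] == 0] == 0).mean()
    acc_good = (yhat[ytest['good_bad'] == 1] == 1).mean()
    print(f'tr={tr:.1f}  default detection={acc_default:.2%}  '
          f'non-default detection={acc_good:.2%}')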
def plot_auc(ytx,yprobs_tx):
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(ytx, yprobs_tx)
auc = metrics.roc_auc_score(ytx, yprobs_tx)
plt.plot(fpr,tpr,label=f"AUC={auc:.4f}")
plt.plot(fpr,fpr,ls='--',color='blue',label='Random Guess')
plt.legend(loc=4)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.show()
plot_auc(ytx,yprobs_tx)
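# Given the heavy class imbalance (about 89% good loans), a precision-recall
# view complements the ROC curve; a minimal sketch:
from sklearn.metrics import precision_recall_curve, average_precision_score
prec, rec, _ = precision_recall_curve(ytx, yprobs_tx)
ap = average_precision_score(ytx, yprobs_tx)
plt.plot(rec, prec, label=f'AP={ap:.4f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.legend(loc=3)
plt.show()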
ytest.head(2)
good_bad | yprobs | yhat | |
---|---|---|---|
193817 | 1 | 0.891735 | 1 |
174039 | 1 | 0.793958 | 1 |
ytest = ytest.sort_values('yprobs')
ytest.head()
good_bad | yprobs | yhat | |
---|---|---|---|
42438 | 1 | 0.415615 | 0 |
42295 | 0 | 0.416287 | 0 |
42396 | 0 | 0.419668 | 0 |
42368 | 0 | 0.422580 | 0 |
42398 | 0 | 0.428667 | 0 |
def get_yprobs_sorted_proportions(df_ytest,col_ytrue,col_yprobs):
"""Sort the df_ytest by predicted probabilities and return
dataframe with various proportions.
Parameters:
------------
df_ytest: pd.core.frame.DataFrame
Test dataframe
col_ytrue: str
Name of column for true label.
col_yprobs: str
Name of column for predicted probabilities
"""
df_ytest = df_ytest.sort_values(col_yprobs)
n_test = len(df_ytest)
sum_test = df_ytest[col_ytrue].sum()
df_ytest['cum_n_pop'] = range(1,n_test+1)
df_ytest['cum_n_good'] = df_ytest[col_ytrue].cumsum()
df_ytest['cum_n_bad'] = (df_ytest['cum_n_pop']
- df_ytest['cum_n_good'])
df_ytest['cum_perc_pop'] = df_ytest['cum_n_pop'] / n_test
df_ytest['cum_perc_good'] = df_ytest['cum_n_good'] / sum_test
df_ytest['cum_perc_bad'] = (df_ytest['cum_n_bad']
/ (n_test - sum_test))
return df_ytest
ytest = get_yprobs_sorted_proportions(ytest,'good_bad','yprobs')
ytest.head(20)
good_bad | yprobs | yhat | cum_n_pop | cum_n_good | cum_n_bad | cum_perc_pop | cum_perc_good | cum_perc_bad | |
---|---|---|---|---|---|---|---|---|---|
42438 | 1 | 0.415615 | 0 | 1 | 1 | 0 | 0.000011 | 0.000012 | 0.000000 |
42295 | 0 | 0.416287 | 0 | 2 | 1 | 1 | 0.000021 | 0.000012 | 0.000098 |
42396 | 0 | 0.419668 | 0 | 3 | 1 | 2 | 0.000032 | 0.000012 | 0.000196 |
42368 | 0 | 0.422580 | 0 | 4 | 1 | 3 | 0.000043 | 0.000012 | 0.000294 |
42398 | 0 | 0.428667 | 0 | 5 | 1 | 4 | 0.000054 | 0.000012 | 0.000392 |
42187 | 0 | 0.451400 | 0 | 6 | 1 | 5 | 0.000064 | 0.000012 | 0.000491 |
41887 | 0 | 0.461260 | 0 | 7 | 1 | 6 | 0.000075 | 0.000012 | 0.000589 |
12508 | 0 | 0.483478 | 0 | 8 | 1 | 7 | 0.000086 | 0.000012 | 0.000687 |
41787 | 0 | 0.488549 | 0 | 9 | 1 | 8 | 0.000097 | 0.000012 | 0.000785 |
41033 | 1 | 0.489062 | 0 | 10 | 2 | 8 | 0.000107 | 0.000024 | 0.000785 |
215253 | 0 | 0.491543 | 0 | 11 | 2 | 9 | 0.000118 | 0.000024 | 0.000883 |
134462 | 1 | 0.491717 | 0 | 12 | 3 | 9 | 0.000129 | 0.000036 | 0.000883 |
40781 | 1 | 0.492173 | 0 | 13 | 4 | 9 | 0.000139 | 0.000048 | 0.000883 |
39932 | 0 | 0.496644 | 0 | 14 | 4 | 10 | 0.000150 | 0.000048 | 0.000981 |
13858 | 1 | 0.499887 | 0 | 15 | 5 | 10 | 0.000161 | 0.000060 | 0.000981 |
39810 | 0 | 0.501561 | 0 | 16 | 5 | 11 | 0.000172 | 0.000060 | 0.001079 |
213386 | 1 | 0.503342 | 0 | 17 | 6 | 11 | 0.000182 | 0.000072 | 0.001079 |
154698 | 1 | 0.503370 | 0 | 18 | 7 | 11 | 0.000193 | 0.000084 | 0.001079 |
40291 | 0 | 0.503606 | 0 | 19 | 7 | 12 | 0.000204 | 0.000084 | 0.001177 |
146740 | 1 | 0.505241 | 0 | 20 | 8 | 12 | 0.000214 | 0.000096 | 0.001177 |
def plot_gini(df_ytest_proportions,col_ytrue,col_yprobs):
"""Plot Kolmogorov-Smirnov Curve.
Parameters:
------------
df_ytest_proportions: pd.core.frame.DataFrame
Pandas dataframe with at least two columns:
- cum_perc_pop
- cum_perc_bad
Usage:
-------
df_ytest = get_yprobs_sorted_proportions(
df_ytest,'ytrue','yprobs')
plot_gini(df_ytest,'ytrue','yprobs')
"""
from sklearn import metrics
auc = metrics.roc_auc_score(
df_ytest_proportions[col_ytrue],
df_ytest_proportions[col_yprobs])
gini = 2*auc-1
x = df_ytest_proportions['cum_perc_pop']
y = df_ytest_proportions['cum_perc_bad']
plt.plot(x,y,label=f'Gini = {gini:.4f}')
plt.plot(x,x,ls='--',c='k')
plt.xlabel('Cumulative % Population')
plt.ylabel('Cumulative % Bad')
plt.title('Gini')
plt.legend(loc=2)
plt.show()
plot_gini(ytest,'good_bad','yprobs')
auc = metrics.roc_auc_score(ytx, yprobs_tx)
gini = auc * 2 - 1
print(auc,gini)
0.6977663685387198 0.3955327370774395
ytest.head(2)
good_bad | yprobs | yhat | cum_n_pop | cum_n_good | cum_n_bad | cum_perc_pop | cum_perc_good | cum_perc_bad | |
---|---|---|---|---|---|---|---|---|---|
42438 | 1 | 0.415615 | 0 | 1 | 1 | 0 | 0.000011 | 0.000012 | 0.000000 |
42295 | 0 | 0.416287 | 0 | 2 | 1 | 1 | 0.000021 | 0.000012 | 0.000098 |
def plot_ks(df_ytest_proportions,col_yprobs,
col_cum_perc_good,col_cum_perc_bad):
"""Plot Kolmogorov-Smirnov Curve.
Parameters:
------------
df_ytest_proportions: pd.core.frame.DataFrame
Pandas dataframe with at least three columns:
- yprob
- cum_perc_good
- cum_perc_bad
col_yprobs: str
Name of column for test probabilities
col_cum_perc_good: str
Name of column for cumulative percent for good
col_cum_perc_bad: str
Name of column for cumulative percent for bad
Usage:
-------
df_ytest = get_yprobs_sorted_proportions(
df_ytest,'ytrue','yprobs')
plot_ks(ytest,'yprobs','cum_perc_good','cum_perc_bad')
"""
x = df_ytest_proportions[col_yprobs]
y1 = df_ytest_proportions[col_cum_perc_bad]
y2 = df_ytest_proportions[col_cum_perc_good]
KS = max( df_ytest_proportions[col_cum_perc_bad]
- df_ytest_proportions[col_cum_perc_good]
)
KS = round(KS,4)
plt.plot(x,y1,color='red',label=f'KS = {KS}')
plt.plot(x,y2,color='blue')
plt.xlabel('Estimated Probability for being Good')
plt.ylabel('Cumulative %')
plt.title('Kolmogorov-Smirnov')
plt.legend(loc=2)
plt.show()
plot_ks(ytest,'yprobs','cum_perc_good','cum_perc_bad')
KS = max(ytest['cum_perc_bad'] - ytest['cum_perc_good'])
KS
0.29042659868701864
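# Cross-check (illustrative): the two-sample KS statistic between the
# predicted-probability distributions of bads and goods should match the
# curve-based KS above.
probs_good = ytest.loc[ytest['good_bad'] == 1, 'yprobs']
probs_bad = ytest.loc[ytest['good_bad'] == 0, 'yprobs']
stats.ks_2samp(probs_bad, probs_good).statistic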
df_summary.round(2).head(10)
feature | coef | p_value | orig_feature | n_cats | |
---|---|---|---|---|---|
0 | intercept | -0.64 | NaN | intercept | 1 |
1 | grade:A | 1.12 | 0.00 | grade | 6 |
2 | grade:B | 0.78 | 0.00 | grade | 6 |
3 | grade:C | 0.58 | 0.00 | grade | 6 |
4 | grade:D | 0.41 | 0.00 | grade | 6 |
5 | grade:E | 0.28 | 0.00 | grade | 6 |
6 | grade:F | 0.12 | 0.01 | grade | 6 |
7 | addr_state:NM_VA | 0.00 | 0.92 | addr_state | 10 |
8 | addr_state:OK_TN_MO_LA_MD_NC | 0.02 | 0.38 | addr_state | 10 |
9 | addr_state:UT_KY_AZ_NJ | 0.03 | 0.20 | addr_state | 10 |
min_score = 300
max_score = 850
range_score = max_score - min_score
range_score
550
df_summary.groupby('orig_feature')['coef'].min()
orig_feature
addr_state                     0.002845
annual_inc                    -0.122785
dti                           -0.080395
emp_length                     0.193470
grade                          0.121276
initial_list_status            0.044801
inq_last_6mths                 0.202110
installment                   -0.675995
int_rate                       0.135878
intercept                     -0.643885
mths_since_earliest_cr_line    0.036126
mths_since_issue_d            -0.010289
mths_since_last_delinq        -0.064225
mths_since_last_record        -0.207305
open_acc                      -0.150863
purpose                        0.311027
term                          -0.075838
total_rev_hi_lim               0.163580
verification_status           -0.054779
Name: coef, dtype: float64
min_sum_coef = df_summary.groupby('orig_feature')['coef'].min().sum()
max_sum_coef = df_summary.groupby('orig_feature')['coef'].max().sum()
range_sum_coef = max_sum_coef - min_sum_coef
min_sum_coef, max_sum_coef, range_sum_coef
(-0.8752439586235148, 5.762234944096122, 6.637478902719637)
df_scorecard = df_summary.drop('n_cats',axis=1)
range_score = max_score - min_score
range_sum_coef = max_sum_coef - min_sum_coef
factor = range_score / range_sum_coef
df_scorecard['score_raw'] = df_scorecard['coef'] * factor
df_scorecard.head()
feature | coef | p_value | orig_feature | score_raw | |
---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | -53.354130 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 93.031247 |
2 | grade:B | 0.777610 | 1.881894e-38 | grade | 64.434920 |
3 | grade:C | 0.577341 | 1.145423e-25 | grade | 47.840095 |
4 | grade:D | 0.414289 | 1.174786e-15 | grade | 34.329117 |
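# Worked check of the scaling: the factor converts log-odds into score
# points, about 82.86 points per unit of coefficient.
print(factor)              # 82.8629...
print(1.122714 * factor)   # 93.0312..., grade:A's score_raw above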
df_scorecard['score_raw'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x123d78208>
# rescale the intercept so the minimum possible coefficient sum maps to min_score
intercept = df_scorecard['coef'][0]
intercept_diff = intercept - min_sum_coef
intercept_score = ((intercept_diff / range_sum_coef) * range_score
+ min_score)
intercept_score
319.17102369468336
df_scorecard.loc[0, 'score_raw'] = intercept_score
df_scorecard['score_prel'] = df_scorecard['score_raw'].round()
df_scorecard.head()
feature | coef | p_value | orig_feature | score_raw | score_prel | |
---|---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | 319.171024 | 319.0 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 93.031247 | 93.0 |
2 | grade:B | 0.777610 | 1.881894e-38 | grade | 64.434920 | 64.0 |
3 | grade:C | 0.577341 | 1.145423e-25 | grade | 47.840095 | 48.0 |
4 | grade:D | 0.414289 | 1.174786e-15 | grade | 34.329117 | 34.0 |
df_scorecard['score_prel'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1253a1518>
min_sum_score_prel = df_scorecard.groupby('orig_feature')['score_prel'].min().sum()
min_sum_score_prel
300.0
max_sum_score_prel = df_scorecard.groupby('orig_feature')['score_prel'].max().sum()
max_sum_score_prel
849.0
df_scorecard['score'] = df_scorecard['score_prel']
df_scorecard['score'].describe()
count    129.000000
mean      13.883721
std       38.051903
min      -56.000000
25%        0.000000
50%       14.000000
75%       26.000000
max      319.000000
Name: score, dtype: float64
df_scorecard['score'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x151554208>
# if all scores fall between 300 and 850, we don't have to modify anything.
# but suppose rounding accumulation pushes the max score to 851;
# then we subtract 1 from the category with the largest rounding difference
# (a loop version of this adjustment is sketched below the commented cells).
# find the variable with the maximum difference
df_scorecard['score_diff'] = df_scorecard['score_prel'] - df_scorecard['score_raw']
df_scorecard.head()
feature | coef | p_value | orig_feature | score_raw | score_prel | score_diff | |
---|---|---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | 319.171024 | 319.0 | -0.171024 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 93.031247 | 93.0 | -0.031247 |
2 | grade:B | 0.777610 | 1.881894e-38 | grade | 64.434920 | 64.0 | -0.434920 |
3 | grade:C | 0.577341 | 1.145423e-25 | grade | 47.840095 | 48.0 | 0.159905 |
4 | grade:D | 0.414289 | 1.174786e-15 | grade | 34.329117 | 34.0 | -0.329117 |
idx_max_diff = df_scorecard['score_diff'].argmax()
idx_max_diff
22
df_scorecard.loc[idx_max_diff]
feature         purpose:home_improvement__major_purchase__car
coef                                                  0.452658
p_value                                            4.66353e-41
orig_feature                                           purpose
score_raw                                              37.5085
score_prel                                                  38
score_diff                                            0.491458
Name: 22, dtype: object
new_value = df_scorecard.loc[idx_max_diff]['score_prel'] - 1
new_value
37.0
# df_scorecard['score'][idx_max_diff] = new_value
# df_scorecard.head(10)
# min_sum_score_prel = df_scorecard.groupby('orig_feature')['score'].min().sum()
# min_sum_score_prel
# max_sum_score_prel = df_scorecard.groupby('orig_feature')['score_prel'].max().sum()
# max_sum_score_prel
# df_scorecard['score'].hist()
# we have both min and max between 300 and 850
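# A minimal sketch (assumption: not run in the original) of that rounding
# adjustment as a loop: while the best possible total exceeds max_score,
# shave 1 point from the category whose rounding inflated it the most.
while df_scorecard.groupby('orig_feature')['score'].max().sum() > max_score:
    idx = (df_scorecard['score'] - df_scorecard['score_raw']).idxmax()
    df_scorecard.loc[idx, 'score'] -= 1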
df_scorecard.to_csv(dat_pro + 'df_scorecard.csv')
Xtx.head(2)
grade:A | grade:B | grade:C | grade:D | grade:E | grade:F | addr_state:NM_VA | addr_state:OK_TN_MO_LA_MD_NC | addr_state:UT_KY_AZ_NJ | addr_state:AR_MI_PA_OH_MN | ... | dti:30.8_31.5 | dti:31.5_32.2 | dti:32.2_32.9 | dti:32.9_33.6 | mths_since_last_record:missing | mths_since_last_record:0_2 | mths_since_last_record:3_22 | mths_since_last_record:23_46 | mths_since_last_record:47_68 | mths_since_last_record:69_85 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
193817 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
174039 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 rows × 128 columns
df_scorecard.head(2)
feature | coef | p_value | orig_feature | score_raw | score_prel | score_diff | score | |
---|---|---|---|---|---|---|---|---|
0 | intercept | -0.643885 | NaN | intercept | 319.171024 | 319.0 | -0.171024 | 319.0 |
1 | grade:A | 1.122714 | 1.577662e-50 | grade | 93.031247 | 93.0 | -0.031247 | 93.0 |
Xtx.insert(0, 'intercept', 1)
Xtx.head(2)
intercept | grade:A | grade:B | grade:C | grade:D | grade:E | grade:F | addr_state:NM_VA | addr_state:OK_TN_MO_LA_MD_NC | addr_state:UT_KY_AZ_NJ | ... | dti:30.8_31.5 | dti:31.5_32.2 | dti:32.2_32.9 | dti:32.9_33.6 | mths_since_last_record:missing | mths_since_last_record:0_2 | mths_since_last_record:3_22 | mths_since_last_record:23_46 | mths_since_last_record:47_68 | mths_since_last_record:69_85 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
193817 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
174039 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 rows × 129 columns
scorecard_scores = df_scorecard['score']
scorecard_scores.shape
(129,)
scorecard_scores = scorecard_scores.to_numpy().reshape(-1,1)
scorecard_scores.shape
(129, 1)
# each applicant's score = intercept score + points of the active dummies
y_scores = Xtx.dot(scorecard_scores)
y_scores.hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x151bb50b8>]], dtype=object)
y_scores.describe()
0 | |
---|---|
count | 93257.000000 |
mean | 564.329820 |
std | 65.480216 |
min | 343.000000 |
25% | 517.000000 |
50% | 560.000000 |
75% | 608.000000 |
max | 795.000000 |
# invert the linear score scaling back to a coefficient sum (log-odds) ...
sum_coef_from_score = ((y_scores - min_score) / (max_score - min_score)) * (max_sum_coef - min_sum_coef) + min_sum_coef
# ... then apply the sigmoid to recover probabilities
y_hat_proba_from_score = np.exp(sum_coef_from_score) / (np.exp(sum_coef_from_score) + 1)
y_hat_proba_from_score.sort_index().head()
0 | |
---|---|
12 | 0.770749 |
15 | 0.685269 |
21 | 0.901518 |
24 | 0.909763 |
25 | 0.928520 |
ytest.sort_index().head()
good_bad | yprobs | yhat | cum_n_pop | cum_n_good | cum_n_bad | cum_perc_pop | cum_perc_good | cum_perc_bad | |
---|---|---|---|---|---|---|---|---|---|
12 | 0 | 0.774833 | 1 | 7266 | 5312 | 1954 | 0.077914 | 0.063950 | 0.191719 |
15 | 1 | 0.689977 | 0 | 1497 | 988 | 509 | 0.016052 | 0.011894 | 0.049941 |
21 | 0 | 0.903478 | 1 | 44565 | 37110 | 7455 | 0.477873 | 0.446759 | 0.731456 |
24 | 0 | 0.909839 | 1 | 48035 | 40245 | 7790 | 0.515082 | 0.484500 | 0.764325 |
25 | 1 | 0.929176 | 1 | 59471 | 50721 | 8750 | 0.637711 | 0.610618 | 0.858516 |
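# Illustrative check: the recovered probabilities differ from the model's
# yprobs only through integer rounding of the scores, so the gaps are small.
(y_hat_proba_from_score[0] - ytest['yprobs']).abs().max()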
ytx.head(2)
193817    1
174039    1
Name: good_bad, dtype: int64
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(ytx, yprobs_tx)
df_cutoffs = pd.DataFrame({
'thresholds': thresholds,
'fpr': fpr,
'tpr': tpr
})
df_cutoffs.head()
thresholds | fpr | tpr | |
---|---|---|---|
0 | 1.994019 | 0.000000 | 0.000000 |
1 | 0.994019 | 0.000000 | 0.000012 |
2 | 0.990552 | 0.000000 | 0.000590 |
3 | 0.990545 | 0.000098 | 0.000590 |
4 | 0.989683 | 0.000098 | 0.001192 |
# sklearn sets thresholds[0] above 1 by convention; clip it just below 1
# so the log-odds transform below stays finite
df_cutoffs.loc[0, 'thresholds'] = 1 - 1 / np.power(10, 16)
df_cutoffs.head()
thresholds | fpr | tpr | |
---|---|---|---|
0 | 1.000000 | 0.000000 | 0.000000 |
1 | 0.994019 | 0.000000 | 0.000012 |
2 | 0.990552 | 0.000000 | 0.000590 |
3 | 0.990545 | 0.000098 | 0.000590 |
4 | 0.989683 | 0.000098 | 0.001192 |
# map each probability threshold to a score: log-odds rescaled to [min_score, max_score]
df_cutoffs['Score'] = (
(np.log(df_cutoffs['thresholds']
/ (1 - df_cutoffs['thresholds'])
) - min_sum_coef
) * (
(max_score - min_score) / (max_sum_coef - min_sum_coef)
) + min_score).round()
df_cutoffs.head()
thresholds | fpr | tpr | Score | |
---|---|---|---|---|
0 | 1.000000 | 0.000000 | 0.000000 | 3417.0 |
1 | 0.994019 | 0.000000 | 0.000012 | 796.0 |
2 | 0.990552 | 0.000000 | 0.000590 | 758.0 |
3 | 0.990545 | 0.000098 | 0.000590 | 758.0 |
4 | 0.989683 | 0.000098 | 0.001192 | 751.0 |
# cap the artificial first cutoff at the maximum score
df_cutoffs.loc[0, 'Score'] = max_score
ytest.head(2)
good_bad | yprobs | yhat | cum_n_pop | cum_n_good | cum_n_bad | cum_perc_pop | cum_perc_good | cum_perc_bad | |
---|---|---|---|---|---|---|---|---|---|
42438 | 1 | 0.415615 | 0 | 1 | 1 | 0 | 0.000011 | 0.000012 | 0.000000 |
42295 | 0 | 0.416287 | 0 | 2 | 1 | 1 | 0.000021 | 0.000012 | 0.000098 |
def n_approved(p):
return np.where(ytest['yprobs'] >= p, 1, 0).sum()
n_ytest = ytest.shape[0]
df_cutoffs['N Approved'] = df_cutoffs['thresholds'].apply(n_approved)
df_cutoffs['N Rejected'] = n_ytest - df_cutoffs['N Approved']
df_cutoffs['Approval Rate'] = df_cutoffs['N Approved'] / n_ytest
df_cutoffs['Rejection Rate'] = 1 - df_cutoffs['Approval Rate']
df_cutoffs.head()
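# With the cutoff table in place, one might (illustrative) read off the
# score cutoff closest to a target approval rate, say 85%:
df_cutoffs.loc[(df_cutoffs['Approval Rate'] - 0.85).abs().idxmin()]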