In this project, we evaluate the model using the scikit-plot library.
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import tqdm
import matplotlib.pyplot as plt
# local scripts
import util
import config
# Copy the paths and settings defined in the local `config` module into
# notebook-level names.
# NOTE(review): apart from SEED (which is reassigned in the settings cell),
# none of these names are referenced in the cells visible here — confirm
# they are still needed.
ifile = config.clean_data_path
SEED = config.SEED
model_linsvc_tfidf_path = config.model_linsvc_tfidf_path
tfidf_fitted_vec_path = config.tfidf_fitted_vec_path
compression= config.compression
# settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Use the 'ggplot' matplotlib style for every figure drawn below.
plt.style.use('ggplot')
# NOTE(review): this hard-coded value replaces the SEED read from
# config.SEED earlier in the notebook — confirm the override is intended.
SEED=100
# Let DataFrame displays show up to 200 characters per cell.
pd.options.display.max_colwidth = 200
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib
#Visualizers
import scikitplot
from scikitplot import metrics as skpmetrics
from sklearn import metrics as skmetrics
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Bhishan Poudel 2020-10-23 CPython 3.7.7 IPython 7.18.1 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit pandas 1.1.0 scikitplot 0.3.7 joblib 0.17.0 seaborn 0.11.0 sklearn 0.23.1 watermark 2.0.2 tqdm 4.50.0 numpy 1.18.4
def show_methods(obj, ncols=4):
    """Tabulate the public attribute names of ``obj``.

    Names returned by ``dir(obj)`` that do not start with an underscore are
    split, in order, into ``ncols`` nearly equal chunks; each chunk becomes
    one column of the result.

    Parameters
    ----------
    obj : object
        Any object accepted by ``dir``.
    ncols : int, default 4
        Number of columns in the returned table.

    Returns
    -------
    pandas.DataFrame
        One column per chunk; columns shorter than the longest one are
        padded with empty strings.
    """
    names = [name for name in dir(obj) if not name.startswith('_')]
    # array_split tolerates chunks of unequal length; the gaps this leaves
    # in the frame come out as missing values and are blanked for display.
    chunks = np.array_split(names, ncols)
    return pd.DataFrame(chunks).T.fillna('')
!ls ../outputs
ypreds_linsvc.csv ytest.csv
# Load the saved test labels and the `linsvc` predictions from ../outputs;
# both are read as int8.
ytest = np.loadtxt('../outputs/ytest.csv',dtype=np.int8)
ypreds = np.loadtxt('../outputs/ypreds_linsvc.csv',dtype=np.int8)
# NOTE(review): yprobs_linsvc.csv is not in the `ls ../outputs` listing
# shown just above — confirm the file exists before running this cell.
yprobs = np.loadtxt('../outputs/yprobs_linsvc.csv',dtype=np.float64)
ytest[:5], ypreds[:5]
(array([2, 1, 2, 2, 3], dtype=int8), array([2, 1, 2, 2, 3], dtype=int8))
yprobs[0]
array([0.00103394, 0.01114477, 0.94680318, 0.02295366, 0.00225909, 0.00179027, 0.00261764, 0.00352172, 0.00787573])
# Overall accuracy of the predictions against the test labels, to 4 decimals.
print('Accuracy : {:.4f} '.format(skmetrics.accuracy_score(ytest,ypreds)))
Accuracy : 0.8125
# Put test labels and predictions side by side so that mismatching rows can
# be selected with a query.
df_preds = pd.DataFrame({'ytest': ytest, 'ypreds': ypreds},dtype=np.int8)
df_preds.head(2)
ytest | ypreds | |
---|---|---|
0 | 2 | 2 |
1 | 1 | 1 |
df_preds.query("ytest != ypreds").head(2)
ytest | ypreds | |
---|---|---|
5 | 2 | 3 |
7 | 2 | 1 |
show_methods(scikitplot)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | absolute_import | clustering | estimators | plotters |
1 | classifier_factory | clustering_factory | helpers | print_function |
2 | classifiers | decomposition | metrics | unicode_literals |
3 | cluster | division |
show_methods(scikitplot.metrics)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | LabelEncoder | division | plot_ks_statistic | precision_recall_curve |
1 | absolute_import | interp | plot_lift_curve | print_function |
2 | auc | itertools | plot_precision_recall | roc_curve |
3 | average_precision_score | label_binarize | plot_precision_recall_curve | silhouette_samples |
4 | binary_ks_curve | np | plot_roc | silhouette_score |
5 | calibration_curve | plot_calibration_curve | plot_roc_curve | unicode_literals |
6 | confusion_matrix | plot_confusion_matrix | plot_silhouette | unique_labels |
7 | cumulative_gain_curve | plot_cumulative_gain | plt | validate_labels |
8 | deprecated |
# Distinct values of the 'product' column, in order of first appearance.
# NOTE(review): `df` is not defined in any cell visible in this notebook —
# presumably it should be loaded from `ifile` first; confirm, otherwise this
# raises NameError on a fresh top-to-bottom run.
classes = df['product'].unique()
classes
array(['Student loan', 'Credit reporting, credit repair services, or other personal consumer reports', 'Mortgage', 'Debt collection', 'Money transfer, virtual currency, or money service', 'Vehicle loan or lease', 'Credit card or prepaid card', 'Checking or savings account', 'Payday loan, title loan, or personal loan'], dtype=object)
plot_confusion_matrix(
y_true,
y_pred,
labels=None,
true_labels=None,
pred_labels=None,
title=None,
normalize=False,
hide_zeros=False,
hide_counts=False,
x_tick_rotation=0,
ax=None,
figsize=None,
cmap='Blues',
title_fontsize='large',
text_fontsize='medium',
)
# skpmetrics.plot_confusion_matrix?
# Confusion matrix of test labels vs. predictions on an 8x4 figure.
# hide_zeros=True: per the scikit-plot docs, cells with a zero count are not drawn.
skpmetrics.plot_confusion_matrix(ytest,ypreds,hide_zeros=True,figsize=(8,4))
<matplotlib.axes._subplots.AxesSubplot at 0x7ffa15c7e3d0>
plot_precision_recall_curve(
y_true,
y_probas,
title='Precision-Recall Curve',
curves=('micro', 'each_class'),
ax=None,
figsize=None,
cmap='nipy_spectral',
title_fontsize='large',
text_fontsize='medium',
)
# skpmetrics.plot_precision_recall?
ytest.shape, yprobs.shape
((400,), (400,))
# Precision-recall curves computed from the loaded scores.
# NOTE(review): the shape check displayed just above reports yprobs as
# (400,), yet yprobs[0] printed 9 values earlier — re-run and confirm yprobs
# is 2-D (one column per class), which is what scikit-plot expects here.
skpmetrics.plot_precision_recall(ytest,yprobs,figsize=(12,8))
<matplotlib.axes._subplots.AxesSubplot at 0x7ffa2f8137d0>
plot_roc(
y_true,
y_probas,
title='ROC Curves',
plot_micro=True,
plot_macro=True,
classes_to_plot=None,
ax=None,
figsize=None,
cmap='nipy_spectral',
title_fontsize='large',
text_fontsize='medium',
)
# skpmetrics.plot_roc?
# ROC curves from the same scores. plot_micro and plot_macro are left at
# their defaults (True in the signature shown above), so the micro- and
# macro-averaged curves are drawn as well.
skpmetrics.plot_roc(ytest,yprobs,figsize=(12,8))
<matplotlib.axes._subplots.AxesSubplot at 0x7ffa15c0ead0>
# Report the wall-clock time elapsed since `time_start_notebook` was set.
time_taken = time.time() - time_start_notebook
# Round to whole seconds *before* splitting into h/m/s. Formatting a
# fractional remainder with {:.0f} could otherwise round 59.7 s up and
# print an impossible "60 secs".
h, rem = divmod(int(round(time_taken)), 60*60)
m, s = divmod(rem, 60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, m, s))
Time taken to run whole notebook: 0 hr 7 min 10 secs