In this notebook, we evaluate the trained model using the yellowbrick library.
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import tqdm
import matplotlib.pyplot as plt
# local scripts
import util
import config
# Project-level paths and constants come from the local config module.
ifile = config.clean_data_path  # cleaned complaints CSV (zip)
SEED = config.SEED
model_linsvc_tfidf_path = config.model_linsvc_tfidf_path  # persisted LinearSVC
tfidf_fitted_vec_path = config.tfidf_fitted_vec_path      # persisted fitted tf-idf vectorizer
compression= config.compression
# settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# NOTE(review): this overrides config.SEED read above — confirm intended.
# All downstream sampling uses this value.
SEED=100
pd.options.display.max_colwidth = 200
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib
#Visualizers
import yellowbrick
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import PrecisionRecallCurve
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Bhishan Poudel 2020-10-23 CPython 3.7.7 IPython 7.18.1 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit tqdm 4.50.0 watermark 2.0.2 yellowbrick 1.1 pandas 1.1.0 seaborn 0.11.0 joblib 0.17.0 sklearn 0.23.1 numpy 1.18.4
def show_methods(obj, ncols=4):
lst = [i for i in dir(obj) if i[0]!='_' ]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
!ls ../data
complaints_2019.csv.zip complaints_2019_clean.csv.zip orig_data_head_tail.csv
# Load the cleaned complaints data.  Use the path/compression already read
# from config into `ifile`/`compression` (L13/L17) instead of a hard-coded
# path, so this cell cannot drift out of sync with the rest of the project.
df = pd.read_csv(ifile, compression=compression)
# make data small so the evaluation runs quickly
df = df.sample(n=2_000, random_state=SEED)
# peek at the first and last two rows (pd.concat: DataFrame.append is
# deprecated and removed in pandas 2.x)
pd.concat([df.head(2), df.tail(2)])
product | complaint | complaint_lst_clean | complaint_clean | total_length | num_words | num_sent | num_unique_words | avg_word_len | avg_unique | |
---|---|---|---|---|---|---|---|---|---|---|
82392 | Student loan | On XX/XX/2019 I sent a dispute letter to Fed Loan Servicing about the student loans they claim I owe. I asked them to send me verifiable information for the accounts and the information that they ... | ['sent', 'dispute', 'letter', 'fed', 'loan', 'servicing', 'student', 'loan', 'claim', 'owe', 'asked', 'send', 'verifiable', 'information', 'account', 'information', 'sent', 'constitute', 'sent', '... | sent dispute letter fed loan servicing student loan claim owe asked send verifiable information account information sent constitute sent promissory note school lot information redacted supposed do... | 970 | 172 | 1 | 97 | 4.645349 | 0.563953 |
1435 | Credit reporting, credit repair services, or other personal consumer reports | Someone applied for a vehicle in my name and now it is reflecting on my credit report and this is not my account | ['someone', 'applied', 'vehicle', 'name', 'reflecting', 'credit', 'report', 'account'] | someone applied vehicle name reflecting credit report account | 112 | 23 | 1 | 19 | 3.913043 | 0.826087 |
13448 | Credit reporting, credit repair services, or other personal consumer reports | My exwife opened a XXXX Credit card in 2009 ( 3 years before we ever met ). Shortly after we met, she added me as an authorized user and I never even had a card. The three credit reporting agencie... | ['exwife', 'opened', 'credit', 'card', 'year', 'ever', 'met', 'shortly', 'met', 'added', 'authorized', 'user', 'never', 'even', 'card', 'three', 'credit', 'reporting', 'agency', 'claiming', 'joint... | exwife opened credit card year ever met shortly met added authorized user never even card three credit reporting agency claiming jointly owned account filed bankruptcy im responsible debt card nev... | 601 | 117 | 1 | 79 | 4.145299 | 0.675214 |
61809 | Credit reporting, credit repair services, or other personal consumer reports | AFTER RECEIVING A CURRENT COPY OF MY CREDIT REPORT, I DISCOVERED SOME ENTRIES THAT WERE IDENITIFIED AS INQUIRIES WHICH QUALIFIED FOR DELETION FROM MY REPORT. | ['receiving', 'current', 'copy', 'credit', 'report', 'discovered', 'entry', 'idenitified', 'inquiry', 'qualified', 'deletion', 'report'] | receiving current copy credit report discovered entry idenitified inquiry qualified deletion report | 157 | 25 | 1 | 24 | 5.320000 | 0.960000 |
# Encode the target: `.cat.codes` assigns integer codes in SORTED
# category order (not order of appearance).
df['product_id'] = df['product'].astype('category').cat.codes
maincol = 'complaint'
mc = maincol + '_clean'  # name of the cleaned-text column
target = 'product_id'
%%time
from sklearn.model_model_selection import train_test_split if False else None  # noqa
from sklearn.model_selection import train_test_split
X = df['complaint_clean'] # documents
y = df['product_id'] # target
# Split with the project-wide train size and seed — presumably the same
# partition used when the model was trained; verify against the training
# notebook before trusting the test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y,
train_size=config.train_size,
random_state=config.SEED)
# flatten to plain 1-D numpy arrays for sklearn/yellowbrick metrics
y_train = np.array(y_train).flatten()
y_test = np.array(y_test).flatten()
CPU times: user 1.99 ms, sys: 195 µs, total: 2.18 ms Wall time: 2.19 ms
# Load the persisted LinearSVC model and the fitted tf-idf vectorizer.
model = joblib.load(model_linsvc_tfidf_path)
fitted_vectorizer = joblib.load(tfidf_fitted_vec_path)
# transform only — the vectorizer was fitted during training; refitting
# here would leak test vocabulary.
X_train_text = fitted_vectorizer.transform(X_train)
X_test_text = fitted_vectorizer.transform(X_test)
y_pred = model.predict(X_test_text)
print('Accuracy : {:.4f} '.format(metrics.accuracy_score(y_test,y_pred)))
Accuracy : 0.8125
# Side-by-side frame of true vs predicted class codes for quick inspection.
df_preds = pd.DataFrame({'ytest': y_test, 'ypreds': y_pred})
df_preds.head()
ytest | ypreds | |
---|---|---|
0 | 2 | 2 |
1 | 1 | 1 |
2 | 2 | 2 |
3 | 2 | 2 |
4 | 3 | 3 |
y_test[:5], y_pred[:5]
(array([2, 1, 2, 2, 3], dtype=int8), array([2, 1, 2, 2, 3], dtype=int8))
# Human-readable class names for the yellowbrick plots.
# BUG FIX: `product_id` was built with `.cat.codes`, which numbers the
# categories in SORTED order, but `Series.unique()` returns labels in
# order of APPEARANCE (the printed array shows 'Student loan' first,
# i.e. unsorted).  Passing the appearance-ordered labels to yellowbrick
# mislabels every class in every plot — sort them to match the codes.
classes = np.sort(df['product'].unique())
classes
array(['Student loan', 'Credit reporting, credit repair services, or other personal consumer reports', 'Mortgage', 'Debt collection', 'Money transfer, virtual currency, or money service', 'Vehicle loan or lease', 'Credit card or prepaid card', 'Checking or savings account', 'Payday loan, title loan, or personal loan'], dtype=object)
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import PrecisionRecallCurve
def viz_metrics(visualizer, outpath=None):
    """Fit, score, and render a yellowbrick classifier visualizer.

    Uses the module-level tf-idf matrices and labels (X_train_text,
    y_train, X_test_text, y_test).  Returns the matplotlib Axes of the
    rendered figure; if `outpath` is given the figure is also saved
    there.
    """
    visualizer.fit(X_train_text, y_train)
    visualizer.score(X_test_text, y_test)
    # `poof` is deprecated in yellowbrick 1.x in favour of `show`
    # (same behaviour: finalize, optionally save to outpath, return ax);
    # the method listing printed below confirms `show` exists here.
    return visualizer.show(outpath=outpath)
# ?ClassificationReport
# Per-class precision/recall/F1 heat map (support=True adds class counts).
fig,ax = plt.subplots(figsize=(12,8))
# NOTE(review): `ax` is created but not passed to the visualizer (the
# ax=ax variant appears in a later cell) — presumably yellowbrick falls
# back to the current axes; confirm the intended figure is drawn on.
visualizer = ClassificationReport(model, classes=classes, support=True)
viz_metrics(visualizer)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None. FutureWarning) /Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
<matplotlib.axes._subplots.AxesSubplot at 0x7fc584f4c090>
# Variant of viz_metrics that takes the data as default arguments and
# closes the figure after rendering instead of returning the axes.
# NOTE: the defaults are bound at DEFINITION time — re-running the
# data-prep cells does not update them unless this cell is re-run too.
def viz_metrics2(visualizer,outpath=None,
Xtr=X_train_text,Xtx=X_test_text,
ytr=y_train,ytx=y_test):
visualizer.fit(Xtr, ytr)
visualizer.score(Xtx, ytx)
# poof() renders the figure and, when outpath is given, saves it.
visualizer.poof(outpath=outpath)
plt.close()
fig,ax = plt.subplots(figsize=(12,8))
visualizer = ClassificationReport(model, classes=classes, support=True, ax=ax)
viz_metrics2(visualizer,outpath='a.png')
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None. FutureWarning) /Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
!rm -rf a.png
show_methods(visualizer)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | ax | estimator | macro | score_ |
1 | class_counts_ | fig | micro | set_params |
2 | classes | finalize | name | set_title |
3 | classes_ | fit | per_class | show |
4 | color | force_model | poof | size |
5 | colors | fpr | roc_auc | title |
6 | draw | get_params | score | tpr |
7 | encoder | is_fitted |
# visualizer?
# Re-render the classification report and save it to the images folder.
fig,ax = plt.subplots(figsize=(12,8))
visualizer = ClassificationReport(model, classes=classes, support=True)
out = viz_metrics(visualizer)  # returns the rendered Axes
out.figure.savefig('../images/classification_report.png',dpi=300)
plt.close()
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None. FutureWarning) /Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
type(out)
matplotlib.axes._subplots.AxesSubplot
# Confusion matrix (percent=True shows row-normalized percentages).
fig,ax = plt.subplots(figsize=(12,8))
visualizer = ConfusionMatrix(model, classes=classes,percent=True)
out = viz_metrics(visualizer)  # returns the rendered Axes
out.figure.savefig('../images/confusion_matrix.png',dpi=300)
plt.close()
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None. FutureWarning)
# One-vs-rest ROC curves with per-class AUC.
fig,ax = plt.subplots(figsize=(12,8))
visualizer = ROCAUC(model, classes=classes)
out = viz_metrics(visualizer)  # returns the rendered Axes
out.figure.savefig('../images/roc_auc.png',dpi=300)
plt.close()
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None. FutureWarning)
# Per-class precision-recall curves (no iso-F1 lines, no fill, no
# micro-average curve — one clean curve per class).
fig,ax = plt.subplots(figsize=(12,8))
visualizer = PrecisionRecallCurve(model,classes=classes,per_class=True,
iso_f1_curves=False,fill_area=False, micro=False)
out = viz_metrics(visualizer)
out.figure.savefig('../images/precision_recall.png',dpi=100)
plt.close()
# Class prediction error: stacked bars of predicted class per true class.
fig,ax = plt.subplots(figsize=(12,8))
visualizer = ClassPredictionError(model, classes=classes)
out = viz_metrics(visualizer)
out.figure.savefig('../images/class_prediction_error.png',dpi=300)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None. FutureWarning)
%%writefile c01_model_eval_yellowbrick.py
# time
import time
time_start_notebook = time.time()
# local scripts
import util
import config
ifile = config.clean_data_path
model_linsvc_tfidf_path = config.model_linsvc_tfidf_path
tfidf_fitted_vec_path = config.tfidf_fitted_vec_path
compression= config.compression
SEED = config.SEED
N_SAMPLES = config.N_SAMPLES
png_clf_report = config.png_clf_report
png_conf_mat = config.png_conf_mat
png_auc_roc = config.png_auc_roc # area under curve receiver operating characteristics
png_pr = config.png_pr # precision-recall
png_cpe = config.png_cpe # class prediction error
# usual imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib
# Visualizers
import yellowbrick
from yellowbrick import classifier as yclf
#===========================================================================
# Load the data
df = pd.read_csv('../data/complaints_2019_clean.csv.zip',compression='zip')
df = df.sample(n=N_SAMPLES, random_state=SEED)
#============================================================================
# Data preparation
df['product_id'] = df['product'].astype('category').cat.codes
X = df['complaint_clean'] # documents
y = df['product_id'] # target
classes = df['product'].unique()
X_train, X_test, y_train, y_test = train_test_split(X, y,
train_size=config.train_size,
random_state=config.SEED)
y_train = np.array(y_train).flatten()
y_test = np.array(y_test).flatten()
model = joblib.load(model_linsvc_tfidf_path)
fitted_vectorizer = joblib.load(tfidf_fitted_vec_path)
X_train = fitted_vectorizer.transform(X_train)
X_test = fitted_vectorizer.transform(X_test)
#=============================================================================
def viz_metrics(visualizer,outpath=None,
Xtr=X_train,Xtx=X_test,
ytr=y_train,ytx=y_test):
visualizer.fit(Xtr, ytr)
visualizer.score(Xtx, ytx)
visualizer.poof(outpath=outpath)
plt.close()
# classification report
fig,ax = plt.subplots(figsize=(12,8))
visualizer = yclf.ClassificationReport(model, classes=classes, support=True, ax=ax)
viz_metrics(visualizer,png_clf_report)
# confusion matrix
fig,ax = plt.subplots(figsize=(12,8))
visualizer = yclf.ConfusionMatrix(model, classes=classes,percent=True, ax=ax)
viz_metrics(visualizer, png_conf_mat)
# roc auc
fig,ax = plt.subplots(figsize=(12,8))
visualizer = yclf.ROCAUC(model, classes=classes, ax=ax)
viz_metrics(visualizer, png_auc_roc)
# precision-recall
fig,ax = plt.subplots(figsize=(12,8))
visualizer = yclf.PrecisionRecallCurve(model,classes=classes,per_class=True,
iso_f1_curves=False,fill_area=False, micro=False, ax=ax)
viz_metrics(visualizer, png_pr)
# class prediction error
fig,ax = plt.subplots(figsize=(12,8))
visualizer = yclf.ClassPredictionError(model, classes=classes, ax=ax)
viz_metrics(visualizer, png_cpe)
Writing c01_model_eval_yellowbrick.py
# Report total notebook runtime as hours / minutes / seconds.
time_taken = time.time() - time_start_notebook
hrs, rem_secs = divmod(time_taken, 60*60)   # whole hours, leftover seconds
mins, secs = divmod(rem_secs, 60)           # whole minutes, leftover seconds
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(hrs, mins, secs))
Time taken to run whole notebook: 0 hr 7 min 10 secs