import time
time_start_notebook = time.time()

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import tqdm

import matplotlib.pyplot as plt

# settings
# Render matplotlib figures inline in the notebook, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot') 
# Single seed reused for sampling, splitting and seeded estimators.
SEED=100
# Show up to 200 characters per cell so complaint text is readable in tables.
pd.options.display.max_colwidth = 200

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import joblib

#Visualizers
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import PrecisionRecallCurve

# versions
# Record the environment the notebook was run in: author/date, Python and
# IPython versions and machine info first, then the versions of the
# imported packages.
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/utils/deprecation.py:143: FutureWarning: The sklearn.metrics.classification module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
  warnings.warn(message, FutureWarning)

Bhishan Poudel 2020-10-23 

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

joblib    0.17.0
tqdm      4.50.0
pandas    1.1.0
watermark 2.0.2
seaborn   0.11.0
numpy     1.18.4
sklearn   0.23.1


def show_methods(obj, ncols=4):
    """Tabulate the public attribute names of ``obj``.

    Parameters
    ----------
    obj : object
        Anything accepted by ``dir()``.
    ncols : int, default 4
        Number of columns the names are spread over.

    Returns
    -------
    pandas.DataFrame
        One column per chunk of names (in ``dir()`` order); shorter columns
        are padded with empty strings.
    """
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    # array_split tolerates a name count that is not a multiple of ncols
    chunks = np.array_split(public_names, ncols)
    table = pd.DataFrame(chunks).T
    return table.fillna('')


# List the files available in the data directory (IPython shell escape).
!ls ../data

complaints_2019.csv.zip       complaints_2019_clean.csv.zip orig_data_head_tail.csv


# Load the cleaned 2019 complaints data from the zipped CSV.
df = pd.read_csv('../data/complaints_2019_clean.csv.zip',compression='zip')

# make data small
df = df.sample(n=2_000, random_state=SEED)

# Preview the first and last two rows. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
pd.concat([df.head(2), df.tail(2)])


# Column names used by the notebook.
maincol = 'complaint'
mc = '{}_clean'.format(maincol)
target = 'product'


# Bar chart of the number of complaints per product.
product_counts = df['product'].value_counts()
ax = product_counts.plot(kind='bar')


%%time 
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')

# create vectors
features = tfidf.fit_transform(df['complaint_clean']).toarray()
labels = df['product'].astype('category').cat.codes

CPU times: user 270 ms, sys: 17.8 ms, total: 288 ms
Wall time: 325 ms


%%time

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test,indices_train,indices_test = \
    train_test_split(features, labels, df.index,
                     test_size=0.25, random_state=SEED,stratify=labels)

y_train[:2]

CPU times: user 26.9 ms, sys: 18.9 ms, total: 45.8 ms
Wall time: 72.5 ms

24678     1
102166    3
dtype: int8


# Check that the same set of class codes appears in both splits.
sorted(y_train.unique()), sorted(y_test.unique())

([0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8])


from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


# ?RandomForestClassifier


# Candidate classifiers keyed by their display name.
dic_models = {
    'LogisticRegression': LogisticRegression(random_state=SEED),
    'SVC': LinearSVC(),
    'MultinomialNB': MultinomialNB(),
}

# Parallel lists kept for the cells below.
model_names = list(dic_models.keys())
models = list(dic_models.values())

for model_name, model in dic_models.items():
    print(model_name)

LogisticRegression
SVC
MultinomialNB


%%time

from sklearn.model_selection import cross_val_score

accs = []

for model in models:
    acc = cross_val_score(model, X_train, y_train,scoring='accuracy').mean()
    accs.append(acc)
    
df_models = pd.DataFrame({'Model': model_names, 'Accuracy': accs})
df_models

CPU times: user 22.2 s, sys: 827 ms, total: 23.1 s
Wall time: 8.65 s


# Train a linear SVM on the training split, then predict the test split.
model = LinearSVC().fit(X_train, y_train)
y_pred = model.predict(X_test)


# Classification report
# The targets are the integer codes from astype('category').cat.codes, and
# code i refers to the i-th entry of .cat.categories (sorted category
# order). df['product'].unique() is in first-appearance order instead, so
# using it as target_names attaches the wrong product name to each row.
class_names = list(df['product'].astype('category').cat.categories)
print(metrics.classification_report(y_test, y_pred,
    target_names=class_names))

                                                                              precision    recall  f1-score   support

                                                                Student loan       0.67      0.67      0.67        30
Credit reporting, credit repair services, or other personal consumer reports       0.66      0.57      0.61        54
                                                                    Mortgage       0.79      0.89      0.84       234
                                                             Debt collection       0.73      0.75      0.74        92
                          Money transfer, virtual currency, or money service       0.75      0.25      0.38        12
                                                       Vehicle loan or lease       0.88      0.83      0.85        42
                                                 Credit card or prepaid card       0.67      0.25      0.36         8
                                                 Checking or savings account       1.00      0.59      0.74        17
                                   Payday loan, title loan, or personal loan       0.71      0.45      0.56        11

                                                                    accuracy                           0.77       500
                                                                   macro avg       0.76      0.58      0.64       500
                                                                weighted avg       0.77      0.77      0.76       500


# Confusion matrix of the test predictions; it is indexed later as
# conf_mat[actual, predicted].
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[ 20,   5,   4,   0,   0,   1,   0,   0,   0],
       [  5,  31,  12,   5,   1,   0,   0,   0,   0],
       [  1,   6, 209,  16,   0,   0,   0,   0,   2],
       [  1,   0,  21,  69,   0,   0,   1,   0,   0],
       [  2,   5,   2,   0,   3,   0,   0,   0,   0],
       [  0,   0,   6,   1,   0,  35,   0,   0,   0],
       [  1,   0,   2,   1,   0,   2,   2,   0,   0],
       [  0,   0,   5,   2,   0,   0,   0,  10,   0],
       [  0,   0,   3,   1,   0,   2,   0,   0,   5]])


from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import PrecisionRecallCurve

# select classifiers
classifiers = [
    LogisticRegression(random_state=SEED),
    LinearSVC(),
]

# Unfitted estimator handed to the single-model visualizers below.
model = LinearSVC()

# Display names for the classes. The targets are category codes, which
# follow the sorted order of .cat.categories; df[target].unique() is in
# first-appearance order and would mislabel every class in the plots.
classes = df[target].astype('category').cat.categories.to_numpy()


# ?ClassificationReport


def viz_metrics(visualizer):
    """Fit, score and render a Yellowbrick visualizer.

    The visualizer is fitted on the notebook-level ``X_train``/``y_train``
    and scored on ``X_test``/``y_test``.

    Parameters
    ----------
    visualizer : object
        A visualizer exposing ``fit``, ``score`` and ``show`` (or the older
        ``poof``).

    Returns
    -------
    Whatever the visualizer's render method returns (the notebook uses it
    as a matplotlib axes to save the figure).
    """
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    # `poof()` is the deprecated name of `show()`; prefer `show()` and fall
    # back to `poof()` only when `show()` is not available.
    render = getattr(visualizer, 'show', None)
    if render is None:
        render = visualizer.poof
    return render()

# One classification-report heatmap per selected classifier.
for estimator in classifiers:
    plt.close()  # drop the current figure before drawing the next one
    fig, ax = plt.subplots(figsize=(12, 8))
    visualizer = ClassificationReport(
        estimator,
        classes=classes,
        support=True,
    )
    viz_metrics(visualizer)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
  FutureWarning)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
  FutureWarning)


# Classification report for `model`, also written to the images folder.
fig, ax = plt.subplots(figsize=(12, 8))
visualizer = ClassificationReport(
    model,
    classes=classes,
    support=True,
)
out = viz_metrics(visualizer)
report_png = '../images/classification_report.png'
out.figure.savefig(report_png, dpi=100)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
  FutureWarning)


# Inspect what viz_metrics() returned (used above via `out.figure`).
type(out)

matplotlib.axes._subplots.AxesSubplot


# Class prediction error chart for a fresh LinearSVC, also saved to disk.
fig, ax = plt.subplots(figsize=(12, 8))
svc_for_errors = LinearSVC()
visualizer = ClassPredictionError(svc_for_errors, classes=classes)
out = viz_metrics(visualizer)
error_png = '../images/class_prediction_error.png'
out.figure.savefig(error_png, dpi=100)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
  FutureWarning)


# Confusion matrix shown as percentages, also saved to disk.
fig, ax = plt.subplots(figsize=(12, 8))
visualizer = ConfusionMatrix(
    model,
    classes=classes,
    percent=True,
)
out = viz_metrics(visualizer)
confusion_png = '../images/confusion_matrix.png'
out.figure.savefig(confusion_png, dpi=100)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
  FutureWarning)


# ROC-AUC
# ROC curves for `model`, also saved to disk.
fig, ax = plt.subplots(figsize=(12, 8))
visualizer = ROCAUC(model, classes=classes)
out = viz_metrics(visualizer)
roc_png = '../images/roc_auc.png'
out.figure.savefig(roc_png, dpi=100)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
  FutureWarning)


# Precision Recall Curve
# Per-class curves only: no micro-average, iso-F1 lines or filled area.
fig, ax = plt.subplots(figsize=(12, 8))
pr_options = dict(
    classes=classes,
    per_class=True,
    iso_f1_curves=False,
    fill_area=False,
    micro=False,
)
visualizer = PrecisionRecallCurve(model, **pr_options)
out = viz_metrics(visualizer)
pr_png = '../images/precision_recall.png'
out.figure.savefig(pr_png, dpi=100)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/base.py:213: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
  FutureWarning)


# Map integer class codes <-> product names.
# The codes used as targets come from astype('category').cat.codes, where
# code i is the i-th entry of .cat.categories (sorted category order).
# Enumerating df['product'].unique() (first-appearance order) pairs the
# codes with the wrong product names.
product_categories = df['product'].astype('category').cat.categories
dic_id_to_product = dict(enumerate(product_categories))
dic_product_to_id = {v: k for k, v in dic_id_to_product.items()}

ser_id_to_product = pd.Series(dic_id_to_product)
ser_product_to_id = pd.Series(dic_product_to_id)


# For every (actual, predicted) pair of different classes confused at least
# 20 times on the test split, print the count and show the complaints.
for predicted in ser_id_to_product.index:
    for actual in ser_id_to_product.index:
        n_confused = conf_mat[actual, predicted]
        if predicted == actual or n_confused < 20:
            continue

        message = "'{}' predicted as '{}' : {} examples."
        print(message.format(dic_id_to_product[actual],
                             dic_id_to_product[predicted],
                             n_confused))

        # indices_test is from train-test split
        is_confused = (y_test == actual) & (y_pred == predicted)
        confused_rows = df.loc[indices_test[is_confused]]
        display(confused_rows[['product', 'complaint']])
        print('')

'Debt collection' predicted as 'Mortgage' : 21 examples.


# Refit a LinearSVC on the full (un-split) data. `features`, `labels` and
# the fitted `tfidf` vectorizer are reused from the vectorization cell
# earlier in the notebook, so they are not recomputed here. The
# coefficients of this model are read by get_top_N_correlated().

model = LinearSVC()
model.fit(features, labels)

LinearSVC()


def get_top_N_correlated(N=4,ser_id_to_product=ser_id_to_product):
    """List the N highest-weighted unigrams and bigrams for each product.

    Terms are ranked by the coefficients of the notebook-level linear
    ``model`` (row ``model.coef_[category_id]``), with feature names taken
    from the notebook-level fitted ``tfidf`` vectorizer.

    Parameters
    ----------
    N : int, default 4
        Number of unigrams and of bigrams to keep per product.
    ser_id_to_product : pandas.Series
        Maps the integer category id (index) to the product name (value).
        The default is bound when this function is defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``product``, ``unigram`` and ``bigram``; the term columns
        are comma-separated strings ordered from highest coefficient down.
    """
    # scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names()
    # was removed in 1.2, so use it only as a fallback on old releases.
    get_names = getattr(tfidf, 'get_feature_names_out', None)
    if get_names is None:
        get_names = tfidf.get_feature_names
    # The vocabulary does not depend on the category: build it once.
    all_names = np.asarray(get_names())

    products, top_uni, top_bi = [], [], []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for category_id, product in ser_id_to_product.items():
        # argsort is ascending; reverse it to get the largest weights first
        descending = np.argsort(model.coef_[category_id])[::-1]
        ranked_terms = all_names[descending]
        unigrams = [t for t in ranked_terms if len(t.split(' ')) == 1][:N]
        bigrams = [t for t in ranked_terms if len(t.split(' ')) == 2][:N]
        products.append(product)
        top_uni.append(', '.join(unigrams))
        top_bi.append(', '.join(bigrams))

    # dataframe
    df_top_corr = pd.DataFrame({'product': products,
                                'unigram': top_uni,
                                'bigram': top_bi})

    return df_top_corr

# Build the per-product table of top terms and display it with a caption.
df_top_corr = get_top_N_correlated(N=4)
df_top_corr.style.set_caption('Top Correlated Terms per Category')


# Raw documents and integer-coded targets for the prediction section.
X = df['complaint_clean'] # documents
y = df['product'].astype('category').cat.codes # target

# Stratify on the target, as in the earlier feature-level split: some
# products have very few complaints, and an unstratified split can leave
# them badly under-represented in one of the two sets.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=SEED,
                                                    stratify=y)


# Vectorizer fitted on the training documents only.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english',
)

# fit_transform learns the vocabulary and vectorizes in one call; the
# fitted vectorizer is the same `tfidf` object.
tfidf_vectorizer_vectors = tfidf.fit_transform(X_train)
fitted_vectorizer = tfidf

# Linear SVM trained on the TF-IDF vectors of the training documents.
model = LinearSVC().fit(tfidf_vectorizer_vectors, y_train)


# Save the fitted model (model persistence)
# NOTE(review): despite the 'tfidf.pkl' file name, only the LinearSVC
# classifier is written here. The fitted TfidfVectorizer is not persisted,
# so a fresh session cannot vectorize new text for this model — consider
# saving both (e.g. as a single Pipeline).
joblib.dump(model, '../models/tfidf.pkl')

['../models/tfidf.pkl']


new_complaint = """Hello : ditech.com is my mortgagecompany.
They placed an automatic forbearance on my account
and removed my auto payment after
Hurricane Irma. 
I called about a week after the storm
to ask that they remove the forbearance
and return the auto payment.
This was confirm by the agent
and recorded by them. 
I received a letter just a few 
weeks ago stating that my auto payment
was never returned and the agent who
I spoke with after I received the
letter actually read back the notes
confirming that I called and asked 
to have forbearance removed and auto
payment reinstated.
So I asked again the agent 
to remove the forbearance and install auto payment.
\n\nI called this past week to check 
if this was done yet, and the agent
at that time said I still have 
a forbearance and no auto payment.
\n\nAs I right this complaint,
I spoke with an agent today that
informs me that I dont have auto 
payment and forbearance is still active.
She placed me on hold, which has lasted an hour.
\n\nDitech is not responsive,
and it is purposely choosing 
to keep my in forbearance when
I have asked countless times to remove me.
I also have asked countless times 
to reinstate auto payment and yet 
they choose not to listen.
\n\nPlease help XXXX XXXX, XXXX"""


# Reload the persisted classifier and classify the new complaint.
model_loaded = joblib.load('../models/tfidf.pkl')
# NOTE(review): the vectorizer was fitted on df['complaint_clean'], while
# `new_complaint` is raw text — presumably it should go through the same
# cleaning first; confirm.
new_comp_vec = fitted_vectorizer.transform([new_complaint])
pred = model_loaded.predict(new_comp_vec)

# `pred` holds the integer category code, not the product name.
print(pred)

[2]


# Report the wall-clock time since the first cell ran.
time_taken = time.time() - time_start_notebook

# Round to whole seconds before splitting into h/m/s. Formatting the float
# remainder with {:.0f} could round 59.6 s up and print "60 secs".
total_secs = int(round(time_taken))
h, rem_secs = divmod(total_secs, 60 * 60)
m, s = divmod(rem_secs, 60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, m, s))

Time taken to run whole notebook: 0 hr 7 min 10 secs

	product	complaint	complaint_lst_clean	complaint_clean	total_length	num_words	num_sent	num_unique_words	avg_word_len	avg_unique
82392	Student loan	On XX/XX/2019 I sent a dispute letter to Fed Loan Servicing about the student loans they claim I owe. I asked them to send me verifiable information for the accounts and the information that they ...	['sent', 'dispute', 'letter', 'fed', 'loan', 'servicing', 'student', 'loan', 'claim', 'owe', 'asked', 'send', 'verifiable', 'information', 'account', 'information', 'sent', 'constitute', 'sent', '...	sent dispute letter fed loan servicing student loan claim owe asked send verifiable information account information sent constitute sent promissory note school lot information redacted supposed do...	970	172	1	97	4.645349	0.563953
1435	Credit reporting, credit repair services, or other personal consumer reports	Someone applied for a vehicle in my name and now it is reflecting on my credit report and this is not my account	['someone', 'applied', 'vehicle', 'name', 'reflecting', 'credit', 'report', 'account']	someone applied vehicle name reflecting credit report account	112	23	1	19	3.913043	0.826087
13448	Credit reporting, credit repair services, or other personal consumer reports	My exwife opened a XXXX Credit card in 2009 ( 3 years before we ever met ). Shortly after we met, she added me as an authorized user and I never even had a card. The three credit reporting agencie...	['exwife', 'opened', 'credit', 'card', 'year', 'ever', 'met', 'shortly', 'met', 'added', 'authorized', 'user', 'never', 'even', 'card', 'three', 'credit', 'reporting', 'agency', 'claiming', 'joint...	exwife opened credit card year ever met shortly met added authorized user never even card three credit reporting agency claiming jointly owned account filed bankruptcy im responsible debt card nev...	601	117	1	79	4.145299	0.675214
61809	Credit reporting, credit repair services, or other personal consumer reports	AFTER RECEIVING A CURRENT COPY OF MY CREDIT REPORT, I DISCOVERED SOME ENTRIES THAT WERE IDENITIFIED AS INQUIRIES WHICH QUALIFIED FOR DELETION FROM MY REPORT.	['receiving', 'current', 'copy', 'credit', 'report', 'discovered', 'entry', 'idenitified', 'inquiry', 'qualified', 'deletion', 'report']	receiving current copy credit report discovered entry idenitified inquiry qualified deletion report	157	25	1	24	5.320000	0.960000

	product	complaint
23968	Debt collection	XXXX XXXX XXXX believes I owe them {$7800.00} for terminating a lease in 2015. I gave them more than enough notice that I would have to leave the apartment due t the lack of affordability because ...
23699	Debt collection	In response to a denial of an extension of credit this consumer checked with the consumer reporting agencies and found the following : 1. Your company has furnished negative information about this...
11400	Debt collection	I was Evicted from my home in XX/XX/2017 I paid all my debt from the landlord and then a year later the name Hunter Warfield showed on my credit report. I was never given a notice about this charg...
39758	Debt collection	I've lived in P.R all my life, never in the U.S. Since XX/XX/2017 I have received collection notifications from different creditors of the U.S. I already reported to the P.R. Police Department, at...
89226	Debt collection	This letter is to inform you that Lending Club has failed to respond to my credit dispute letter and failed to verify that this account belongs to me that I sent certified mail on XX/XX/2019. This...
51947	Debt collection	On XX/XX/2018 I have contacted a agency called Credit Collection Services and XXXX XXXX XXXX advising them that I discovered a account that has been opened as a result of fraud. This agency failed...
30341	Debt collection	I am a single mother. I recently tried to purchase a home for my family and was denied. I than reviewed my own credit report and seen a lot of unauthorized credit inquires on my credit report that...
14577	Debt collection	I have called this company and told them this is not my account, they continue to refuse to accept it. I asked for proof to be provided, they sent me a letter with an address that I do not recogni...
13315	Debt collection	FCO has reported a collection on my credit report this year. I had no idea that I had a collection because I pay all my bills on time. I reached out to FCO who collected all my personal info, conf...
25393	Debt collection	The following Hard inquiries were made on my credit : XXXX XXXX XXXX XX/XX/XXXX XXXX XXXX XXXX XXXX XX/XX/2018 XXXX XXXX XXXX XXXX XX/XX/XXXX XXXX XXXX XXXX XX/XX/XXXX In XX/XX/2018, I Applied cre...
90015	Debt collection	I recently reviewed my XXXX credit report and I was totally shocked to find Capital One Bank is still reporting these fraudulent accounts on my credit report. I am requesting for Capital One Bank ...
58634	Debt collection	I received several emails from Bank of America about settling an outstanding debt for {$42.00}. I reached out to the company on XX/XX/2019. I spoke to a collection agent regarding the email and sh...
57907	Debt collection	KINGS CREDIT SERVICE XXXX XXXX XXXX XXXX XXXX, CA XXXX ( XXXX ) XXXX Kings Credit Service Opened XX/XX/2018 {$46.00} Original creditor : XXXX XXXX XXXX XXXX
80607	Debt collection	I applied to rent an apartment at XXXX in XXXX XXXX while it was still under construction in XXXX of 2019. My application was denied and I never moved in. A few months later I noticed that I had a...
85527	Debt collection	NOTICE OF PENDING LITIGATION SEEKING RELIEF AND MONETARY DAMAGES UNDER FCRA SECTION 616 & SECTION 617/// TRIDENT ASST MANAG IS REPORTING AN ACCOUNT ON MY CREDIT THAT IS NOT MINE/INACCURATE.FRAUD. ...
42457	Debt collection	SOUTHERN MANAGEMENT SYSTEMS IS REPORTING FALSE INFORMATION ON MY CREDIT REPORT! REMOVE ALL NEGATIVE ITEMS ON CREDIT REPORT.
8244	Debt collection	I was an XXXX customer. I had a complaint filed with the XXXX for unfair sales practices and fraud. ( they added services specifically XXXX XXXX/XXXX XXXX that I did not request. They also chang...
27520	Debt collection	Again this XXXX XXXX XXXX has sent nothing other than a generic letter. They responded to your company saying it will be removed soon but nothing explaining when and this is past the statue of lim...
5846	Debt collection	The Mini Van was reported stolen to the Police but resolved that it was retrieve by XXXX XXXX as repossess without notice.
99224	Debt collection	Transworld apparently purchased an account from XXXX or took over an account from XXXX for which I had an open dispute, they have reported it to credit reporting agencies negatively impacting my ...
13818	Debt collection	XXXX XXXX XXXX XXXX XXXX XXXX XXXX, GA XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX XXXX XXXX # XXXX To Whom It May Concern : This letter is being sent to you in response to notic...

	product	unigram	bigram
0	Student loan	branch, bank, deposit, overdraft	saving account, called bank, checking account, card payment
1	Credit reporting, credit repair services, or other personal consumer reports	card, capital, express, statement	credit card, american express, card account, fraudulent charge
2	Mortgage	experian, report, equifax, reporting	credit bureau, xxxx reporting, fraud alert, victim identity
3	Debt collection	debt, collection, calling, phone	account credit, certified mail, time day, funding llc
4	Money transfer, virtual currency, or money service	paypal, transfer, ticket, transaction	money order, money account, transfer fund, account said
5	Vehicle loan or lease	mortgage, escrow, home, foreclosure	loan modification, escrow account, short sale, loan officer
6	Credit card or prepaid card	loan, lending, title, lied	received loan, loan told, called asked, loan agreement
7	Checking or savings account	navient, university, loan, owned	loan forgiveness, fed loan, student loan, thank time
8	Payday loan, title loan, or personal loan	car, vehicle, leased, gm	gm financial, auto loan, fee payment, auto finance

Table of Contents

Description

Data Description

Business Problem

Imports

Useful Scripts

Load the data

Get vectorized data from text

Train Test Split

Fit various Models

Model Evaluation

Mis-classified Cases

Most correlated terms with each category

Predictions

Total Time Taken

	Model	Accuracy
0	LogisticRegression	0.724667
1	SVC	0.772000
2	MultinomialNB	0.546000