import time
time_start_notebook = time.time()

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# settings
# Render matplotlib figures inline in the notebook, at high-DPI ('retina') resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Use the ggplot look for every figure created below.
plt.style.use('ggplot') 
# Seed passed as random_state when the data is sampled, for reproducibility.
SEED=100
# None disables column-width truncation so long complaint texts display in full.
pd.options.display.max_colwidth = None

import re
import string
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

import sklearn
import tqdm

# Record the library versions this notebook was run with.
print([
    (module.__name__, module.__version__)
    for module in (np, pd, sns, sklearn, tqdm, nltk)
])

[('numpy', '1.18.4'), ('pandas', '1.1.0'), ('seaborn', '0.11.0'), ('sklearn', '0.23.1'), ('tqdm', '4.50.0'), ('nltk', '3.5')]


def add_text_barplot(ax, decimals=4, rot=30,percent=False,comma=False):
    """Annotate every bar of a bar plot with its height.

    Parameters
    ----------
    ax : object with ``patches`` and ``annotate``
        Typically the matplotlib Axes returned by a bar-plot call.
    decimals : int
        Number of decimals the bar height is rounded to before labelling.
    rot : int
        Rotation of the label text, passed to ``ax.annotate`` as ``rotation``.
    percent : bool
        If True, append ``'%'`` to the label.
    comma : bool
        If True, format the label with thousands separators: whole numbers
        without decimals, everything else with two decimals.

    Returns
    -------
    None. The labels are drawn on ``ax``.
    """
    assert hasattr(ax,'patches')
    for p in ax.patches:
        height = p.get_height()
        txt = np.round(height, decimals=decimals)

        if comma:
            # float.is_integer() is False for NaN/inf, so non-finite bar
            # heights fall through to the float format instead of raising
            # inside int().
            if float(txt).is_integer():
                txt = "{:,}".format(int(txt))
            else:
                txt = "{:,.2f}".format(txt)

        txt = str(txt) + '%' if percent else txt
        # Centre the label horizontally on the bar; xytext lifts it 10 points
        # above the bar top.
        x = p.get_x()+p.get_width()/2.
        ax.annotate(txt, (x, height), ha='center', va='center',
                    xytext=(0, 10), rotation=rot, textcoords='offset points')


# Load the cleaned 2019 complaints from the zip-compressed CSV.
df = pd.read_csv('../data/complaints_2019_clean.csv.zip',compression='zip')
# Preview the first and last row. pd.concat is used because DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0.
pd.concat([df.head(1), df.tail(1)])


# Number of complaints per product, most frequent first.
df['product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    58529
Debt collection                                                                 23981
Credit card or prepaid card                                                     12531
Mortgage                                                                        10036
Checking or savings account                                                      7354
Student loan                                                                     4282
Vehicle loan or lease                                                            2930
Money transfer, virtual currency, or money service                               2835
Payday loan, title loan, or personal loan                                        2429
Name: product, dtype: int64


# Number of distinct products, i.e. the number of target classes.
df['product'].nunique()

9


# Bar chart of complaint counts per product, with each bar labelled by its
# height, saved to the images folder at 300 dpi.
ax = df['product'].value_counts().plot.bar()
add_text_barplot(ax)
plt.savefig('../images/category_distribution.png',dpi=300)


# Encode each product name as a small integer class label. The codes are the
# positions of the names in the categorical's category list.
df['category_id'] = pd.Categorical(df['product']).codes

df['category_id'].head()

0    3
1    3
2    3
3    2
4    2
Name: category_id, dtype: int8


# Build the id -> product lookup from the same categorical encoding that
# produced df['category_id'], so that ids and product names always agree.
# Enumerating df['product'].unique() would number products by order of first
# appearance, which does not match the category order behind .cat.codes and
# silently attaches the wrong product name to every class id.
dic_id_to_product = dict(enumerate(df['product'].astype('category').cat.categories))
dic_id_to_product

{0: 'Debt collection',
 1: 'Credit reporting, credit repair services, or other personal consumer reports',
 2: 'Money transfer, virtual currency, or money service',
 3: 'Mortgage',
 4: 'Student loan',
 5: 'Credit card or prepaid card',
 6: 'Checking or savings account',
 7: 'Payday loan, title loan, or personal loan',
 8: 'Vehicle loan or lease'}


# Reverse lookup: product name -> integer id.
dic_product_to_id = {
    product_name: product_id
    for product_id, product_name in dic_id_to_product.items()
}
dic_product_to_id

{'Debt collection': 0,
 'Credit reporting, credit repair services, or other personal consumer reports': 1,
 'Money transfer, virtual currency, or money service': 2,
 'Mortgage': 3,
 'Student loan': 4,
 'Credit card or prepaid card': 5,
 'Checking or savings account': 6,
 'Payday loan, title loan, or personal loan': 7,
 'Vehicle loan or lease': 8}


# make data small for eda
# Replace df with a 2,000-row random sample; random_state=SEED makes the
# sample the same on every run.
df = df.sample(n=2000,random_state=SEED)


from sklearn.feature_extraction.text import TfidfVectorizer

# help(TfidfVectorizer())


# TF-IDF vectorizer configuration:
#   sublinear_tf=True    -> scale term frequency as 1 + log(tf)
#   min_df=5             -> ignore terms found in fewer than 5 complaints
#   ngram_range=(1, 2)   -> extract both unigrams and bigrams
#   stop_words='english' -> drop scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(sublinear_tf=True,
                        min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')
tfidf

TfidfVectorizer(min_df=5, ngram_range=(1, 2), stop_words='english',
                sublinear_tf=True)


# transform each complaint into a vector
# fit_transform learns the vocabulary from the cleaned complaint text and
# returns the TF-IDF matrix; .toarray() converts it to a dense NumPy array.
features = tfidf.fit_transform(df['complaint_clean']).toarray()

# Integer class label of every sampled complaint.
labels = df['category_id']

# features.shape is (number of complaints, number of features) and fills the
# two %d placeholders.
print("Each of the %d complaints is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features.shape))

Each of the 2000 complaints is represented by 5204 features (TF-IDF score of unigrams and bigrams)


from sklearn.feature_selection import chi2
from tqdm import tqdm

def get_top_N_correlated(N=4,dic_product_to_id=dic_product_to_id):
    """Return the N unigrams and bigrams most associated with each product.

    For every product, a chi-squared test is run between each TF-IDF feature
    and the one-vs-rest indicator ``labels == category_id``; the terms with
    the highest chi-squared statistic are reported.

    Reads the module-level ``features``, ``labels`` and ``tfidf`` objects.

    Parameters
    ----------
    N : int
        Number of unigrams and of bigrams to report per product.
    dic_product_to_id : dict
        Maps product name to its integer class id. The ids must be the same
        codes that are stored in ``labels``; otherwise terms are attributed
        to the wrong product.

    Returns
    -------
    pandas.DataFrame
        One row per product (sorted by product name) with columns
        ``product``, ``unigram`` and ``bigram``. Terms are comma-joined in
        ascending order of chi-squared score, so the strongest term is last.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when the installed version provides it.
    get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

    # The vocabulary and each term's n-gram size do not depend on the
    # product, so compute them once instead of once per loop iteration.
    feature_names = np.array(get_names())
    ngram_sizes = np.array([len(name.split(' ')) for name in feature_names])

    def _last(terms, count):
        # terms[-count:] would return everything for count == 0, so slice
        # from an explicit non-negative start index instead.
        return terms[max(len(terms) - count, 0):]

    products,top_uni,top_bi = [],[],[]
    for product, category_id in sorted(dic_product_to_id.items()):
        # chi2 returns (statistics, p-values); only the statistics are used.
        scores = chi2(features, labels == category_id)[0]
        order = np.argsort(scores)
        ranked_names = feature_names[order]
        ranked_sizes = ngram_sizes[order]
        unigrams = ranked_names[ranked_sizes == 1]
        bigrams = ranked_names[ranked_sizes == 2]
        products.append(product)
        top_uni.append(', '.join(_last(unigrams, N)))
        top_bi.append(', '.join(_last(bigrams, N)))

    df_top_corr = pd.DataFrame({'product': products,
                                'unigram': top_uni,
                                'bigram': top_bi})

    return df_top_corr

# Top 4 unigrams and bigrams per product, displayed as a captioned table.
df_top_corr = get_top_N_correlated(N=4)
df_top_corr.style.set_caption('Top Correlated Terms per Category')


# Report the wall-clock time since the notebook started, split into
# hours, minutes and seconds.
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 3600)   # m holds the seconds left after whole hours
minutes, seconds = divmod(m, 60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, minutes, seconds))

Time taken to run whole notebook: 0 hr 0 min 8 secs

	product	complaint	complaint_lst_clean	complaint_clean	total_length	num_words	num_sent	num_unique_words	avg_word_len	avg_unique
0	Debt collection	transworld systems inc. \nis trying to collect a debt that is not mine, not owed and is inaccurate.	['transworld', 'system', 'inc', 'trying', 'collect', 'debt', 'mine', 'owed', 'inaccurate']	transworld system inc trying collect debt mine owed inaccurate	98	18	2	15	4.444444	0.833333
124906	Debt collection	I had a unwritten contract with XXXX XXXX XXXX XXXX ( XXXX ) in XXXX for a property I rented in with several other tenants and my name was on the utilities. When I moved away from this rental I found out that I was on a budget billing program and me and the other tenants owed more money than we were paying due to them billing us less than our usage. The tenants left me with the bills so I consolidated my debt to pay off ( XXXX ) and the other utilities companies. When I consolidated this debt I called XXXX to have them send me a final bill that I never received. I then called ( XXXX ) XX/XX/XXXX to request the amount owed and mailed ( XXXX ) a check for {$120.00} via USPS the same day due to never receiving a final bill. On XX/XX/XXXX Municipal Collections of America, Inc. sent me a bill in via USPS for {$120.00} authorized by Iowa Code Section 8A.504. This same statement from ( MCOA ) stated the I had 30 days after receiving this notice that I could dispute the validity of the debt or any portion thereof, this ( MCOA ) would assume the debt was valid. After receiving this billI called ( MCOA ) the same night but they were closed. I called ( MCOA ) the following day and ( MCOA ) explained that I never paid the utility bill from 6/24/2010. I stated I wanted to dispute the debt and I requested to have all the information on this debt mailed to me at the same address they already had listed. I also asked ( MCOA ) how the dispute works and how much time this gives me to pay the bill or dispute it and they MCOA employee would not answer my question. I read the notice they sent me that said I 30 days and he said that since I'm disputing it " I don't have to worry about the time frame now ''. I explained that want to dispute this but want the time to exceed the amount given and have it hurt my credit. 
( MCOA ) employee said that the time doesn't matter now that I'm in a dispute and refused to give me any information on the dispute or the time frame it gives me. The same day I called ( XXXX ) and asked if this was a scam or real debt and they responded that the debt was real. I asked for the dates and debt amount. They said they would send me the information via mail. I then asked why it took 9 years to bill me and she responded that they got a new system and the debt was found. After thinking it an hour later I called ( XXXX ) gain and requested my past billing history, my contract with them, and for an itemized bill for the debt I owed. I asked the why it took 9 years for them to bill me and the lady responded that they didn't have the resources to bill me earlier but now they do. A week later the only information ( XXXX ) they sent me was the final bill and the only information ( MCOA ) sent me was the same bill. After an estimated two to three weeks passed without ( MCOA ) calling me back about the dispute on XX/XX/XXXX I Called ( MCOA ) to request and update on the dispute. The ( MCOA ) employee asked me if I had a receipt from a bill I payed 9 years earlier and acted as if that was all the dispute was and they weren't going to get back to me on the subject. I requested the information on the debt such as my billing contract, and past billing history over the phone and the employee stated I couldn't request it over the phone and that I would request it through an outside source, I then asked why I couldn't request it verbally right now when the ( MCOA ) employee hung up on me. I called back and that ( MCOA ) employee forwarded me to the same person where he dropped the call again. I called back for a 3rd time and this employee at least attempted to try to help me. He said he would have someone contact me on the subject. 
I brought up the iowa code they used to send solve this debt section 8A.504 and how Iowa code 614.1 states that a unwritten contract statue of limitations is 5 years and I want to know how 9 years later collections is calling me and he couldn't explain. I requested the time I had left on the dispute before it was in collections and hurting my credit and he said it was all ready turned into collections even though that their employee told me that the 30 day time frame didn't matter when I started the dispute. Immediately after called ( XXXX ) and asked to get a copy of my contract again where the employee stated there is no copy because its a verbal contract and not a written contract. I asked how they can legally turn me into collections with a verbal contract debt after 9 years of them not contacting me even though Iowa code 614.1 clearly states under unwritten contacts that they have 5 years to confront me. The IRS typiclaly goes back 7 years but a utility bill expects me to go back 9? Nobody saves their recipe that the utility company doesn't give you 9 years to defend themselves over a {$120.00} dollar debt. Considering they employee at ( MCOA ) stated I'm now in collections and its damaging my credit I had to make the choice of paying a fraudulent debt or letting my credit be destroyed. I have since payed ( XXXX ) using their XXXX billing platform without the chance to properly dispute the fraudulent debt so it no long destroys my credit. There isn't a chance in XXXX I'm trusting Municipal Collections of America Inc. with my payment considering they hung up on me twice and never made a realistic attempt to dispute by debt but instead gave me the run around, false information, and couldn't support any information on the debt outside of a computer generated bill from 9 years earlier. Both of these companies need to be deeply audited considering their demanding payment without a legit chance to dispute the debt. 
They are granting a form of credit to people under verbal contracts that they can't support and then turning the debt into a collection agency thats most likely in the City of XXXX XXXX pocket.	['unwritten', 'contract', 'property', 'rented', 'several', 'tenant', 'name', 'utility', 'moved', 'away', 'rental', 'found', 'budget', 'billing', 'program', 'tenant', 'owed', 'money', 'paying', 'due', 'billing', 'u', 'le', 'usage', 'tenant', 'left', 'bill', 'consolidated', 'debt', 'pay', 'utility', 'company', 'consolidated', 'debt', 'called', 'send', 'final', 'bill', 'never', 'received', 'called', 'request', 'amount', 'owed', 'mailed', 'check', 'via', 'usps', 'day', 'due', 'never', 'receiving', 'final', 'bill', 'municipal', 'collection', 'america', 'inc', 'sent', 'bill', 'via', 'usps', 'authorized', 'iowa', 'code', 'section', 'statement', 'mcoa', 'stated', 'day', 'receiving', 'notice', 'could', 'dispute', 'validity', 'debt', 'portion', 'thereof', 'mcoa', 'would', 'assume', 'debt', 'valid', 'receiving', 'billi', 'called', 'mcoa', 'night', 'closed', 'called', 'mcoa', 'following', 'day', 'mcoa', 'explained', 'never', 'paid', 'utility', 'bill', 'stated', 'wanted', 'dispute', 'debt', 'requested', 'information', 'debt', 'mailed', 'address', 'already', 'listed', 'also', 'asked', 'mcoa', 'dispute', 'work', 'much', 'time', 'give', 'pay', 'bill', 'dispute', 'mcoa', 'employee', 'would', 'answer', 'question', 'read', 'notice', 'sent', 'said', 'day', 'said', 'since', 'disputing', 'worry', 'time', 'frame', 'explained', 'want', 'dispute', 'want', 'time', 'exceed', 'amount', 'given', 'hurt', 'credit', 'mcoa', 'employee', 'said', 'time', 'matter', 'dispute', 'refused', 'give', 'information', 'dispute', 'time', 'frame', 'give', 'day', 'called', 'asked', 'scam', 'real', 'debt', 'responded', 'debt', 'real', 'asked', 'date', 'debt', 'amount', 'said', 'would', 'send', 'information', 'via', 'mail', 'asked', 'took', 'year', 'bill', 'responded', 'got', 'new', 'system', 'debt', 'found', 'thinking', 
'hour', 'later', 'called', 'gain', 'requested', 'past', 'billing', 'history', 'contract', 'itemized', 'bill', 'debt', 'owed', 'asked', 'took', 'year', 'bill', 'lady', 'responded', 'resource', 'bill', 'earlier', 'week', 'later', 'information', 'sent', 'final', 'bill', 'information', 'mcoa', 'sent', 'bill', 'estimated', 'two', 'three', 'week', 'passed', 'without', 'mcoa', 'calling', 'back', 'dispute', 'called', 'mcoa', 'request', 'update', 'dispute', 'mcoa', 'employee', 'asked', 'receipt', 'bill', 'payed', 'year', 'earlier', 'acted', 'dispute', 'going', 'get', 'back', 'subject', 'requested', 'information', 'debt', 'billing', 'contract', 'past', 'billing', 'history', 'phone', 'employee', 'stated', 'could', 'request', 'phone', 'would', 'request', 'outside', 'source', 'asked', 'could', 'request', 'verbally', 'right', 'mcoa', 'employee', 'hung', 'called', 'back', 'mcoa', 'employee', 'forwarded', 'person', 'dropped', 'call', 'called', 'back', 'rd', 'time', 'employee', 'least', 'attempted', 'try', 'help', 'said', 'would', 'someone', 'contact', 'subject', 'brought', 'iowa', 'code', 'used', 'send', 'solve', 'debt', 'section', 'iowa', 'code', 'state', 'unwritten', 'contract', 'statue', 'limitation', 'year', 'want', 'know', 'year', 'later', 'collection', 'calling', 'could', 'explain', 'requested', 'time', 'left', 'dispute', 'collection', 'hurting', 'credit', 'said', 'ready', 'turned', 'collection', 'even', 'though', 'employee', 'told', 'day', 'time', 'frame', 'matter', 'started', 'dispute', 'immediately', 'called', 'asked', 'get', 'copy', 'contract', 'employee', 'stated', 'copy', 'verbal', 'contract', 'written', 'contract', 'asked', 'legally', 'turn', 'collection', 'verbal', 'contract', 'debt', 'year', 'contacting', 'even', 'though', 'iowa', 'code', 'clearly', 'state', 'unwritten', 'contact', 'year', 'confront', 'irs', 'typiclaly', 'go', 'back', 'year', 'utility', 'bill', 'expects', 'go', 'back', 'nobody', 'save', 'recipe', 'utility', 'company', 'give', 'year', 'defend', 
'dollar', 'debt', 'considering', 'employee', 'mcoa', 'stated', 'collection', 'damaging', 'credit', 'make', 'choice', 'paying', 'fraudulent', 'debt', 'letting', 'credit', 'destroyed', 'since', 'payed', 'using', 'billing', 'platform', 'without', 'chance', 'properly', 'dispute', 'fraudulent', 'debt', 'long', 'destroys', 'credit', 'chance', 'trusting', 'municipal', 'collection', 'america', 'inc', 'payment', 'considering', 'hung', 'twice', 'never', 'made', 'realistic', 'attempt', 'dispute', 'debt', 'instead', 'gave', 'run', 'around', 'false', 'information', 'could', 'support', 'information', 'debt', 'outside', 'computer', 'generated', 'bill', 'year', 'earlier', 'company', 'need', 'deeply', 'audited', 'considering', 'demanding', 'payment', 'without', 'legit', 'chance', 'dispute', 'debt', 'granting', 'form', 'credit', 'people', 'verbal', 'contract', 'support', 'turning', 'debt', 'collection', 'agency', 'thats', 'likely', 'city', 'pocket']	unwritten contract property rented several tenant name utility moved away rental found budget billing program tenant owed money paying due billing u le usage tenant left bill consolidated debt pay utility company consolidated debt called send final bill never received called request amount owed mailed check via usps day due never receiving final bill municipal collection america inc sent bill via usps authorized iowa code section statement mcoa stated day receiving notice could dispute validity debt portion thereof mcoa would assume debt valid receiving billi called mcoa night closed called mcoa following day mcoa explained never paid utility bill stated wanted dispute debt requested information debt mailed address already listed also asked mcoa dispute work much time give pay bill dispute mcoa employee would answer question read notice sent said day said since disputing worry time frame explained want dispute want time exceed amount given hurt credit mcoa employee said time matter dispute refused give information dispute time frame give 
day called asked scam real debt responded debt real asked date debt amount said would send information via mail asked took year bill responded got new system debt found thinking hour later called gain requested past billing history contract itemized bill debt owed asked took year bill lady responded resource bill earlier week later information sent final bill information mcoa sent bill estimated two three week passed without mcoa calling back dispute called mcoa request update dispute mcoa employee asked receipt bill payed year earlier acted dispute going get back subject requested information debt billing contract past billing history phone employee stated could request phone would request outside source asked could request verbally right mcoa employee hung called back mcoa employee forwarded person dropped call called back rd time employee least attempted try help said would someone contact subject brought iowa code used send solve debt section iowa code state unwritten contract statue limitation year want know year later collection calling could explain requested time left dispute collection hurting credit said ready turned collection even though employee told day time frame matter started dispute immediately called asked get copy contract employee stated copy verbal contract written contract asked legally turn collection verbal contract debt year contacting even though iowa code clearly state unwritten contact year confront irs typiclaly go back year utility bill expects go back nobody save recipe utility company give year defend dollar debt considering employee mcoa stated collection damaging credit make choice paying fraudulent debt letting credit destroyed since payed using billing platform without chance properly dispute fraudulent debt long destroys credit chance trusting municipal collection america inc payment considering hung twice never made realistic attempt dispute debt instead gave run around false information could support information debt outside 
computer generated bill year earlier company need deeply audited considering demanding payment without legit chance dispute debt granting form credit people verbal contract support turning debt collection agency thats likely city pocket	5827	1131	1	365	4.152078	0.322723

	product	unigram	bigram
0	Checking or savings account	stressful, payday, lending, loan	loan agreement, loan day, predatory lending, received loan
1	Credit card or prepaid card	modification, foreclosure, escrow, mortgage	mortgage company, loan modification, escrow account, mortgage payment
2	Credit reporting, credit repair services, or other personal consumer reports	citi, express, reward, card	balance transfer, fraudulent charge, american express, credit card
3	Debt collection	overdraft, deposit, deposited, branch	direct deposit, overdraft fee, checking account, saving account
4	Money transfer, virtual currency, or money service	inquiry, reporting, equifax, report	credit file, fraudulent account, credit bureau, credit report
5	Mortgage	midland, collect, collection, debt	belong identity, collection agency, debt belong, collect debt
6	Payday loan, title loan, or personal loan	university, student, forgiveness, navient	federal loan, loan navient, student loan, loan forgiveness
7	Student loan	seller, transfer, ticket, paypal	money account, transfer fund, money order, paypal account
8	Vehicle loan or lease	gm, car, vehicle, leased	auto loan, vehicle returned, pay car, gm financial

Description

Data Description

Purpose

Business Problem

Imports

Useful Scripts

Load the data

Class Distribution

EDA for Text Data

Total Time Taken