The data is taken from the CFPB consumer complaint database.
The original dataset looks like this:
Date received : 2019-09-24
Product : Debt collection
Sub-product : I do not know
Issue : Attempts to collect debt not owed
Sub-issue : Debt is not yours
Consumer complaint narrative : transworld systems inc. \nis trying to collect a debt that is not mine, not owed and is inaccurate.
Company public response : NaN
Company : TRANSWORLD SYSTEMS INC
State : FL
ZIP code : 335XX
Tags : NaN
Consumer consent provided? : Consent provided
Submitted via : Web
Date sent to company : 2019-09-24
Company response to consumer : Closed with explanation
Timely response? : Yes
Consumer disputed? : NaN
Complaint ID : 3384392
In this dataset I am only interested in two columns: Product
and Consumer complaint narrative.
The full dataset is large, so I will keep only the 2019 complaints and rename the columns to product
and complaint.
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',1000)
pd.set_option('max_columns',100)
pd.set_option('max_rows',50)
import os
#========= NLP
import re
import string
import unidecode
import wordninja
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords
#=======OTHERS
import scipy
import multiprocessing as mp
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-10-23

CPython 3.7.7
IPython 7.16.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

nltk      3.5
gensim    3.8.3
watermark 2.0.2
textblob  0.15.3
numpy     1.17.5
wordninja 2.0.0
scipy     1.5.0
spacy     2.2.3
pandas    1.0.5
re        2.2.1
# !wget https://files.consumerfinance.gov/ccdb/complaints.csv.zip
%%time
create_data = False
if create_data:
    ifile = os.path.expanduser("~/Datasets/projects/consumer_complaints/complaints.csv.zip")
    df = pd.read_csv(ifile, compression='zip')

    # take only 2019 data
    df['Date received'] = pd.to_datetime(df['Date received'])
    df = df[df['Date received'].dt.year==2019]

    cols = ['Product','Consumer complaint narrative']
    df = df[cols]
    df = df.dropna()
    df.columns = ['product','complaint']
    df.to_csv('../data/complaints_2019.csv.zip',compression='zip',index=False)
CPU times: user 4 µs, sys: 0 ns, total: 4 µs Wall time: 8.11 µs
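The year filter above relies on the `.dt` accessor of a datetime column. A minimal sketch with toy dates (the frame and values here are made up for illustration):

```python
import pandas as pd

# toy frame standing in for the full complaints data
df_toy = pd.DataFrame({'Date received': ['2018-12-31', '2019-06-15', '2020-01-01']})
df_toy['Date received'] = pd.to_datetime(df_toy['Date received'])

# keep only rows received in 2019
mask = df_toy['Date received'].dt.year == 2019
print(df_toy[mask].shape[0])  # 1
```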
df = pd.read_csv('../data/complaints_2019.csv.zip', compression='zip')
print(f'data shape: {df.shape}')
display(df.head(2).append(df.tail(2)))
data shape: (124907, 2)
 | product | complaint |
---|---|---|
0 | Debt collection | transworld systems inc. \nis trying to collect a debt that is not mine, not owed and is inaccurate. |
1 | Debt collection | Over the past 2 weeks, I have been receiving excessive amounts of telephone calls from the company listed in this complaint. The calls occur between XXXX XXXX and XXXX XXXX to my cell and at my job. The company does not have the right to harass me at work and I want this to stop. It is extremely distracting to be told 5 times a day that I have a call from this collection agency while at work. |
124905 | Mortgage | Every 6 months or so, every since loan was sold to XXXX, and serviced by first XXXX ( from XXXX XXXX to XX/XX/XXXX ), and then transferred to Fay Servicing in XX/XX/XXXX until present, we are threatened with foreclosure proceedings and told we have missed several months in payments. After providing proof ( Bank statements ) of the so-called missed payments, the motion for relief of automatic stay ( we were in bankruptcy for the first 3 threats of foreclosure ), the motions are dismissed but attorneys fees are added to our loan balance ( several thousand dollars in attorney fees thus far ). However, this last time ( beginning in XX/XX/XXXX ), we were actually served with foreclosure papers, as we have no attorney, and had our bankruptcy dismissed this past XXXX ( XXXX ), in hopes of at least attempting to save our home. We are once again being told that we have not made a payment since XX/XX/XXXX, despite bank records proving that we have made the payments. Included in the paperwork... |
124906 | Debt collection | I had a unwritten contract with XXXX XXXX XXXX XXXX ( XXXX ) in XXXX for a property I rented in with several other tenants and my name was on the utilities. When I moved away from this rental I found out that I was on a budget billing program and me and the other tenants owed more money than we were paying due to them billing us less than our usage. The tenants left me with the bills so I consolidated my debt to pay off ( XXXX ) and the other utilities companies. When I consolidated this debt I called XXXX to have them send me a final bill that I never received. I then called ( XXXX ) XX/XX/XXXX to request the amount owed and mailed ( XXXX ) a check for {$120.00} via USPS the same day due to never receiving a final bill. On XX/XX/XXXX Municipal Collections of America, Inc. sent me a bill in via USPS for {$120.00} authorized by Iowa Code Section 8A.504. This same statement from ( MCOA ) stated the I had 30 days after receiving this notice that I could dispute the validity of the deb... |
# params
target = 'product'
maincol = 'complaint'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
import multiprocessing as mp
def parallelize_dataframe(df, func):
    ncores = mp.cpu_count()
    df_split = np.array_split(df, ncores)

    pool = mp.Pool(ncores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
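The idea behind parallelize_dataframe is split-apply-concat: cut the frame into near-equal row chunks (one per core), map a function over them, and glue the results back together. A minimal sketch of the chunk-and-concat part, using index positions rather than a process pool:

```python
import numpy as np
import pandas as pd

df_demo = pd.DataFrame({'a': range(10)})

# np.array_split happily produces near-equal chunks even when
# the row count is not divisible by the number of cores
chunks = [df_demo.iloc[idx] for idx in np.array_split(np.arange(len(df_demo)), 4)]
print([len(c) for c in chunks])  # [3, 3, 2, 2]

# concatenating the chunks recovers the original frame
roundtrip = pd.concat(chunks)
print(roundtrip.equals(df_demo))  # True
```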
from urllib.parse import urlparse
def is_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False
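As a quick sanity check: urlparse only reports both a scheme and a netloc for fully qualified URLs, so bare hosts like www.xy.com slip through this filter (which is why one survives into the cleaned text in the example further below):

```python
from urllib.parse import urlparse

def is_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

print(is_url('https://example.com/page'))  # True
print(is_url('www.xy.com'))                # False: no scheme, so it is kept as a word
print(is_url('hello'))                     # False
```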
def process_text(text):
    """
    Do basic text processing.

    Parameters
    ----------
    text : string

    Returns
    -------
    list of str
        The cleaned text as a list of words.

    Steps:
    - decode unicode
    - lowercase
    - remove ellipses
    - remove urls
    - expand apostrophes
    - expand shortcuts
    - remove punctuation
    - remove newlines and digits
    - remove anonymized words x xx xxx etc
    - remove stop words
    - lemmatize

    Example:
    ========
    import re
    import string
    from nltk.corpus import stopwords
    import nltk

    text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
    process_text(text)
    # ['typing', 'textnum', 'areyou', 'yesyes', 'say', 'yes', 'pal']
    """
    s = pd.Series([text])

    # step: split combined words areYou ==> are You (disabled)
    #s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])', r'\1 \2', x))

    # step: decode unicode characters
    s = s.apply(unidecode.unidecode)

    # step: lowercase
    s = s.str.lower()

    # step: remove ellipsis
    #s = s.str.replace(r'(\w)\u2026+', r'\1', regex=True)
    s = s.str.replace(r'…+', '', regex=True)

    # step: remove url
    #s = s.str.replace(r'http\S+|www.\S+', '', case=False)
    s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])

    # step: expand apostrophes
    map_apos = {
        "you're": 'you are',
        "i'm": 'i am',
        "he's": 'he is',
        "she's": 'she is',
        "it's": 'it is',
        "they're": 'they are',
        "can't": 'can not',
        "couldn't": 'could not',
        "don't": 'do not',
        "don;t": 'do not',
        "didn't": 'did not',
        "doesn't": 'does not',
        "isn't": 'is not',
        "wasn't": 'was not',
        "aren't": 'are not',
        "weren't": 'were not',
        "won't": 'will not',
        "wouldn't": 'would not',
        "hasn't": 'has not',
        "haven't": 'have not',
        "what's": 'what is',
        "that's": 'that is',
    }
    sa = pd.Series(s.str.split()[0])
    sb = sa.map(map_apos).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])

    # step: expand shortcuts
    shortcuts = {'<3': 'love',
                 'awsm': 'awesome',
                 'b4': 'before',
                 'bc': 'because',
                 'bday': 'birthday',
                 'dm': 'direct message',
                 'doin': 'doing',
                 'gr8': 'great',
                 'gud': 'good',
                 'h8': 'hate',
                 'hw': 'how',
                 'idc': 'i do not care',
                 'idgaf': 'hate',
                 'irl': 'in real life',
                 'k': 'okay',
                 'lv': 'love',
                 'm': 'am',
                 'r': 'are',
                 'rt': 'retweet',
                 'ttyl': 'talk to you later',
                 'ty': 'thank you',
                 'u': 'you',
                 'wlcm': 'welcome',
                 'wtf': 'hate',
                 'xoxo': 'love',
                 'y': 'why',
                 'yolo': 'you only live once'}
    sa = pd.Series(s.str.split()[0])
    sb = sa.map(shortcuts).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])

    # step: remove punctuation
    s = s.str.translate(str.maketrans(' ', ' ', string.punctuation))

    # step: remove newlines and digits
    s = s.str.translate(str.maketrans(' ', ' ', '\n'))
    s = s.str.translate(str.maketrans(' ', ' ', string.digits))

    # step: remove xx xxx xxxx etc
    s = s.str.replace(r'\b[x\s]+\b', ' ', regex=True)

    # step: remove stop words
    stop = set(stopwords.words('english'))
    extra_stop_words = ['...']
    stop.update(extra_stop_words)  # inplace operation
    s = s.str.split()
    s = s.apply(lambda x: [i for i in x if i not in stop])

    # step: convert word to base form or lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
                             for word in lst])
    return s.to_numpy()[0]
text = "rt text2num! yesyes gud www.xy.com amazing"
process_text(text)
['retweet', 'textnum', 'yesyes', 'good', 'wwwxycom', 'amazing']
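Both the apostrophe and shortcut expansions above use the same pandas idiom: map each token through a dict, then fill the unmapped tokens back in with their original values. A minimal sketch with a hypothetical mini-dictionary (the notebook's map_apos and shortcuts dicts work the same way):

```python
import pandas as pd

# hypothetical mini-dictionary for illustration
expansions = {'ty': 'thank you', 'u': 'you', 'gr8': 'great'}

tokens = pd.Series("ty u gr8 day".split())
expanded = tokens.map(expansions).fillna(tokens)  # unmapped tokens ('day') keep their value
print(expanded.str.cat(sep=' '))  # thank you you great day
```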
def add_features(df):
    df[mcl] = df[maincol].apply(process_text)
    df[mc] = df[mcl].str.join(' ')
    return df
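add_features turns the token list back into a plain string with Series.str.join, which works element-wise on a column of lists:

```python
import pandas as pd

# toy column of already-cleaned token lists
df_toy = pd.DataFrame({'complaint_lst_clean': [['debt', 'not', 'mine'],
                                               ['stop', 'calling']]})
# .str.join concatenates each row's list into one string
df_toy['complaint_clean'] = df_toy['complaint_lst_clean'].str.join(' ')
print(df_toy['complaint_clean'].tolist())  # ['debt not mine', 'stop calling']
```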
%%time
df = parallelize_dataframe(df, add_features)
# Wall time: 6min 8s
CPU times: user 6.68 s, sys: 2.57 s, total: 9.25 s Wall time: 5min 53s
df.head()
 | product | complaint | complaint_lst_clean | complaint_clean |
---|---|---|---|---|
0 | Debt collection | transworld systems inc. \nis trying to collect a debt that is not mine, not owed and is inaccurate. | [transworld, system, inc, trying, collect, debt, mine, owed, inaccurate] | transworld system inc trying collect debt mine owed inaccurate |
1 | Debt collection | Over the past 2 weeks, I have been receiving excessive amounts of telephone calls from the company listed in this complaint. The calls occur between XXXX XXXX and XXXX XXXX to my cell and at my job. The company does not have the right to harass me at work and I want this to stop. It is extremely distracting to be told 5 times a day that I have a call from this collection agency while at work. | [past, week, receiving, excessive, amount, telephone, call, company, listed, complaint, call, occur, cell, job, company, right, harass, work, want, stop, extremely, distracting, told, time, day, call, collection, agency, work] | past week receiving excessive amount telephone call company listed complaint call occur cell job company right harass work want stop extremely distracting told time day call collection agency work |
2 | Debt collection | Pioneer has committed several federal violations against me, a Private law abiding Federally Protected Consumer. Each violation is a statutory cost of {$1000.00} each, which does not include my personal cost and fees which shall be determined for taking time to address these issues. Violations committed against me include but not limited to : ( 1 ) Violated 15 USC 1692c ( a ) ; Communication without prior consent, expressed permission. ( 2 ) Violated 15 USC 1692d ; Harass and oppressive use of intercourse about an alleged debt. ( 3 ) Violated 15 USC 1692d ( l ) ; Attacking my reputation, accusing me of owing an alleged debt to you. ( 4 ) Violated 15 USC 1692e ( 9 ) ; Use/distribution of communication with authorization or approval. ( 5 ) Violated 15 USC 1692f ( l ) ; Attempting to collect a debt unauthorized by an agreement between parties. | [pioneer, committed, several, federal, violation, private, law, abiding, federally, protected, consumer, violation, statutory, cost, include, personal, cost, fee, shall, determined, taking, time, address, issue, violation, committed, include, limited, violated, usc, c, communication, without, prior, consent, expressed, permission, violated, usc, harass, oppressive, use, intercourse, alleged, debt, violated, usc, l, attacking, reputation, accusing, owing, alleged, debt, violated, usc, e, usedistribution, communication, authorization, approval, violated, usc, f, l, attempting, collect, debt, unauthorized, agreement, party] | pioneer committed several federal violation private law abiding federally protected consumer violation statutory cost include personal cost fee shall determined taking time address issue violation committed include limited violated usc c communication without prior consent expressed permission violated usc harass oppressive use intercourse alleged debt violated usc l attacking reputation accusing owing alleged debt violated usc e usedistribution communication authorization approval violated usc f l attempting collect debt unauthorized agreement party |
3 | Credit reporting, credit repair services, or other personal consumer reports | Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/XX/XXXX I requested that Experian send me a copy of the verifiable proof they have on file showing that the XXXX account they have listed on my credit report is actually mine. On XX/XX/XXXX and XX/XX/XXXX, instead of sending me a copy of the verifiable proof that I requested, Experian sent me a statement which reads, " The information you disputed has been verified as accurate. '' Experian also failed to provide me with the method of " verification. '' Since Experian neither provided me with a copy of the verifiable proof, nor did they delete the unverified information, I believe they are in violation of the Fair Credit Reporting Act and I have been harmed as a result. I have again, today, sent my fourth and final written request that they verify the account, and send me verifiable proof that this account is mine, or that they delete the unverified account. If they do not, my next step is to pursue a remedy through litigation. | [previously, requested, experian, send, copy, verifiable, proof, file, showing, account, listed, credit, report, actually, mine, instead, sending, copy, verifiable, proof, requested, experian, sent, statement, read, information, disputed, verified, accurate, experian, also, failed, provide, method, verification, since, experian, neither, provided, copy, verifiable, proof, delete, unverified, information, believe, violation, fair, credit, reporting, act, harmed, result, today, sent, fourth, final, written, request, verify, account, send, verifiable, proof, account, mine, delete, unverified, account, next, step, pursue, remedy, litigation] | previously requested experian send copy verifiable proof file showing account listed credit report actually mine instead sending copy verifiable proof requested experian sent statement read information disputed verified accurate experian also failed provide method verification since experian neither provided copy verifiable proof delete unverified information believe violation fair credit reporting act harmed result today sent fourth final written request verify account send verifiable proof account mine delete unverified account next step pursue remedy litigation |
4 | Credit reporting, credit repair services, or other personal consumer reports | Hello This complaint is against the three credit reporting companies. XXXX, Trans Union and XXXX. I noticed some discrepencies on my credit report so I put a credit freeze with XXXX.on XX/XX/2019. I then notified the three credit agencies previously stated with a writtent letter dated XX/XX/2019 requesting them to verifiy certain accounts showing on my report They were a Bankruptcy and a bank account from XXXX XXXX XXXX. \nThe response from XXXX and XXXX was that it was verified through their third partner XXXX. That can not be correct because I have a Freeze on my XXXX XXXX account since XX/XX/XXXX. which no one can obtain my report for anything until I unfreeze it. \nI wrote XXXX and XXXX a second letter and mailed them on XX/XX/2019 telling them that they have lied when they said that they verified the two accounts in question using XXXX because I have a freeze on my account and when I called XXXX on XX/XX/2019 they stated that no one has requested or attempted access to my acc... | [hello, complaint, three, credit, reporting, company, trans, union, noticed, discrepencies, credit, report, put, credit, freeze, xxxxon, notified, three, credit, agency, previously, stated, writtent, letter, dated, requesting, verifiy, certain, account, showing, report, bankruptcy, bank, account, response, verified, third, partner, correct, freeze, account, since, one, obtain, report, anything, unfreeze, wrote, second, letter, mailed, telling, lied, said, verified, two, account, question, using, freeze, account, called, stated, one, requested, attempted, access, account, year, told, violation, consumer, right, demand, two, account, removed, report, immediately, wrote, back, letter, dated, stating, verified, account, talk, complaint, trans, union, never, responded, letter, dated, verifiy, two, account, question, since, plus, ...] | hello complaint three credit reporting company trans union noticed discrepencies credit report put credit freeze xxxxon notified three credit agency previously stated writtent letter dated requesting verifiy certain account showing report bankruptcy bank account response verified third partner correct freeze account since one obtain report anything unfreeze wrote second letter mailed telling lied said verified two account question using freeze account called stated one requested attempted access account year told violation consumer right demand two account removed report immediately wrote back letter dated stating verified account talk complaint trans union never responded letter dated verifiy two account question since plus day investigate claim expired checked credit report transunion state us verify two account question also wrote second letter mailed stating violation consumer right demand two account removed report immediately please investigate compnaies wrong using third par... |
note = """
Look at the clean column carefully:
- look for url links
- look for ellipses e.g. #sonyexperias…
- look at anonymous words x xx xxx etc
""";
def create_text_features(df):
    # total
    df['total_length'] = df[maincol].apply(len)

    # num of words and sentences
    df['num_words'] = df[maincol].apply(lambda x: len(x.split()))
    df['num_sent'] = df[maincol].apply(lambda x:
                        len(re.findall("\n", str(x))) + 1)
    df['num_unique_words'] = df[maincol].apply(
        lambda x: len(set(w for w in x.split())))

    # average
    df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df['avg_unique'] = df['num_unique_words'] / df['num_words']

    return df
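The same feature arithmetic, unrolled for a single toy complaint so each quantity is easy to check by hand (the sentence count is really a newline count plus one, so it is only a rough proxy):

```python
import re
import numpy as np

text = "Please stop.\nStop calling me."

total_length = len(text)                    # characters, including spaces and newlines
num_words = len(text.split())               # whitespace-separated tokens
num_sent = len(re.findall("\n", text)) + 1  # newline-delimited "sentences"
num_unique_words = len(set(text.split()))   # 'stop.' and 'Stop' count as distinct
avg_word_len = np.mean([len(w) for w in text.split()])
avg_unique = num_unique_words / num_words

print(total_length, num_words, num_sent, num_unique_words, avg_word_len, avg_unique)
# 29 5 2 5 5.0 1.0
```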
%%time
df = parallelize_dataframe(df, create_text_features)
CPU times: user 6.48 s, sys: 2.68 s, total: 9.15 s Wall time: 18.2 s
print(df.shape)
df.head(2)
(124907, 18)
 | product | complaint | complaint_lst_clean | complaint_clean | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Debt collection | transworld systems inc. \nis trying to collect a debt that is not mine, not owed and is inaccurate. | [transworld, system, inc, trying, collect, debt, mine, owed, inaccurate] | transworld system inc trying collect debt mine owed inaccurate | 98 | 18 | 2 | 15 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 4.444444 | 0.000000 | 0.833333 |
1 | Debt collection | Over the past 2 weeks, I have been receiving excessive amounts of telephone calls from the company listed in this complaint. The calls occur between XXXX XXXX and XXXX XXXX to my cell and at my job. The company does not have the right to harass me at work and I want this to stop. It is extremely distracting to be told 5 times a day that I have a call from this collection agency while at work. | [past, week, receiving, excessive, amount, telephone, call, company, listed, complaint, call, occur, cell, job, company, right, harass, work, want, stop, extremely, distracting, told, time, day, call, collection, agency, work] | past week receiving excessive amount telephone call company listed complaint call occur cell job company right harass work want stop extremely distracting told time day call collection agency work | 395 | 78 | 1 | 54 | 7 | 23 | 0 | 0 | 5 | 0 | 2 | 4.076923 | 0.058228 | 0.692308 |
%%writefile a01_data_processing.py
# load the path
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
# load the libraries
import numpy as np
import pandas as pd
import time
import re
import string
from urllib.parse import urlparse
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords
import unidecode
import wordninja
time_start = time.time()
# Load the data
df = pd.read_csv('../data/complaints_2019.csv.zip', compression='zip')
# Variables
target = 'product'
maincol = 'complaint'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
# ==================== Useful functions ==============
def parallelize_dataframe(df, func):
    ncores = mp.cpu_count()
    df_split = np.array_split(df, ncores)

    pool = mp.Pool(ncores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def is_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False
#================== Text processing =================
def process_text(text):
    """
    Do basic text processing.

    Parameters
    ----------
    text : string

    Returns
    -------
    list of str
        The cleaned text as a list of words.

    Steps:
    - decode unicode
    - lowercase
    - remove ellipses
    - remove urls
    - expand apostrophes
    - remove punctuation
    - remove newlines and digits
    - remove anonymized words x xx xxx etc
    - remove stop words
    - lemmatize

    Example:
    ========
    import re
    import string
    from nltk.corpus import stopwords
    import nltk

    text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
    process_text(text)
    # ['typing', 'textnum', 'areyou', 'yesyes', 'say', 'yes', 'pal']
    """
    s = pd.Series([text])

    # step: decode unicode characters
    s = s.apply(unidecode.unidecode)

    # step: lowercase
    s = s.str.lower()

    # step: remove ellipsis
    #s = s.str.replace(r'(\w)\u2026+', r'\1', regex=True)
    s = s.str.replace(r'…+', '', regex=True)

    # step: remove url
    #s = s.str.replace(r'http\S+|www.\S+', '', case=False)
    s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])

    # step: expand apostrophes
    map_apos = {
        "you're": 'you are',
        "i'm": 'i am',
        "he's": 'he is',
        "she's": 'she is',
        "it's": 'it is',
        "they're": 'they are',
        "can't": 'can not',
        "couldn't": 'could not',
        "don't": 'do not',
        "don;t": 'do not',
        "didn't": 'did not',
        "doesn't": 'does not',
        "isn't": 'is not',
        "wasn't": 'was not',
        "aren't": 'are not',
        "weren't": 'were not',
        "won't": 'will not',
        "wouldn't": 'would not',
        "hasn't": 'has not',
        "haven't": 'have not',
        "what's": 'what is',
        "that's": 'that is',
    }
    sa = pd.Series(s.str.split()[0])
    sb = sa.map(map_apos).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])

    # step: remove punctuation
    s = s.str.translate(str.maketrans(' ', ' ', string.punctuation))

    # step: remove newlines and digits
    s = s.str.translate(str.maketrans(' ', ' ', '\n'))
    s = s.str.translate(str.maketrans(' ', ' ', string.digits))

    # step: remove xx xxx xxxx etc
    s = s.str.replace(r'(\sx+)+\s', r' ', regex=True)

    # step: remove stop words
    stop = set(stopwords.words('english'))
    extra_stop_words = ['...']
    stop.update(extra_stop_words)  # inplace operation
    s = s.str.split()
    s = s.apply(lambda x: [i for i in x if i not in stop])

    # step: convert word to base form or lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
                             for word in lst])
    return s.to_numpy()[0]
def add_features(df):
    df[mcl] = df[maincol].apply(process_text)
    df[mc] = df[mcl].str.join(' ')
    return df
print("Creating clean data ...")
df = parallelize_dataframe(df, add_features)
#======================= Text Feature Generation =====
def create_text_features(df):
    # total
    df['total_length'] = df[maincol].apply(len)

    # num of words and sentences
    df['num_words'] = df[maincol].apply(lambda x: len(x.split()))
    df['num_sent'] = df[maincol].apply(lambda x:
                        len(re.findall("\n", str(x))) + 1)
    df['num_unique_words'] = df[maincol].apply(
        lambda x: len(set(w for w in x.split())))

    # average
    df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df['avg_unique'] = df['num_unique_words'] / df['num_words']

    return df
print("Adding Text features ...")
df = parallelize_dataframe(df, create_text_features)
#===================== Save clean data =========================
df.to_csv('../data/complaints_2019_clean.csv.zip',compression='zip', index=False)
time_taken = time.time() - time_start
m, s = divmod(time_taken, 60)
print(f"Data cleaning finished in {m:.0f} min {s:.2f} sec.")
Writing a01_data_processing.py