Twitter sentiment analysis: data cleaning and text feature engineering.
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',1000)
pd.set_option('max_columns',100)
pd.set_option('max_rows',50)
#========= NLP
import re
import string
import unidecode
import wordninja
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords
print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])
#=======OTHERS
import scipy
import multiprocessing as mp
[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]
df_train_raw = pd.read_csv('../data/raw/train.csv',nrows=50)
df_test_raw = pd.read_csv('../data/raw/test.csv',nrows=50)
df = df_train_raw.append(df_test_raw)
df = df.reset_index()
print(f"shape df_train_raw: {df_train_raw.shape}")
print(f"shape df_test_raw: {df_test_raw.shape}")
df.head(2).append(df.tail(2))
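Note: DataFrame.append is deprecated in later pandas versions (and removed in 2.0). A forward-compatible way to combine the two frames:
df = pd.concat([df_train_raw, df_test_raw]).reset_index()
Here reset_index() keeps the old index as the extra 'index' column seen in the tables below.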
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
shape df_train_raw: (50, 3)
shape df_test_raw: (50, 2)
import multiprocessing as mp
def parallelize_dataframe(df, func):
ncores = mp.cpu_count()
df_split = np.array_split(df, ncores)
pool = mp.Pool(ncores)
df = pd.concat(pool.map(func, df_split))
pool.close()
pool.join()
return df
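A minimal usage sketch (toy_add_len is a hypothetical example; any function that takes and returns a DataFrame chunk works, as long as it is picklable, i.e. defined at module top level):
def toy_add_len(chunk):
    # add a character-count column to each chunk
    chunk['tweet_len'] = chunk['tweet'].str.len()
    return chunk
# df = parallelize_dataframe(df, toy_add_len)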
from urllib.parse import urlparse
def is_url(url):
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False
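Note that is_url only treats a token as a url when it has both a scheme and a netloc, so bare domains like www.xy.com are not caught and survive cleaning (a later example shows this):
is_url('https://goo.gl/h1MfQV') # True: has scheme and netloc
is_url('www.xy.com')            # False: no scheme, so the token is kept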
def process_text(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: decode unicode characters
s = s.apply(unidecode.unidecode)
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'<3': 'love',
'awsm': 'awesome',
'b4': 'before',
'bc': 'because',
'bday': 'birthday',
'dm': 'direct message',
'doin': 'doing',
'gr8': 'great',
'gud': 'good',
'h8': 'hate',
'hw': 'how',
'idc': 'i do not care',
'idgaf': 'hate',
'irl': 'in real life',
'k': 'okay',
'lv': 'love',
'm': 'am',
'r': 'are',
'rt': 'retweet',
'ttyl': 'talk to you later',
'ty': 'thank you',
'u': 'you',
'wlcm': 'welcome',
'wtf': 'hate',
'xoxo': 'love',
'y': 'why',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
text = "rt text2num! yesyes gud www.xy.com amazing"
process_text(text)
['retwet', 'textnum', 'yes', 'god', 'wwxycom', 'amazing']
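The repeated-substring regex r'(\w+)\1' also collapses doubled letters, which is why 'rt' ==> 'retweet' ==> 'retwet' and 'gud' ==> 'good' ==> 'god' above:
re.sub(r'(\w+)\1', r'\1', 'retweet') # 'retwet' ('ee' collapses to 'e')
re.sub(r'(\w+)\1', r'\1', 'good')    # 'god' ('oo' collapses to 'o')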
def add_features(df):
df[mcl] = df[maincol].apply(process_text)
df[mc] = df[mcl].str.join(' ')
df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
#df['hashtags'] = df[mc].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df['hashtags'] = df['hashtags_lst'].str.join(' ')
return df
%%time
df = parallelize_dataframe(df, add_features)
CPU times: user 17.5 ms, sys: 20.8 ms, total: 38.3 ms Wall time: 491 ms
df.head()
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | [#fingerprint, #Pregnancy, #android, #apps, #beautiful, #cute, #health, #igers, #iphoneonly, #iphonesia, #iphone] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | [#yay, #Sony, #Xperia, #S, #sonyexperias…] | #yay #Sony #Xperia #S #sonyexperias… |
2 | 2 | 3 | 0.0 | We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu | [love, would, go, talk, makemories, unplug, relax, iphone, smartphone, wifi, conect] | love would go talk makemories unplug relax iphone smartphone wifi conect | [#talk, #makememories, #unplug, #relax, #iphone, #smartphone, #wifi, #connect...] | #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... |
3 | 3 | 4 | 0.0 | I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/ | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home | [#iphone, #cute, #daventry, #home] | #iphone #cute #daventry #home |
4 | 4 | 5 | 1.0 | What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support! | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport | [] |
%run emoticons.py
%run emojis.py
def convert_emoticons(text):
for emot in EMOTICONS:
text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
return text
text1 = "Hello :-) :-)"
text2 = "Thanks to my uncle :) #yay"
print(convert_emoticons(text1))
print(convert_emoticons(text2))
Hello Happy_face_smiley Happy_face_smiley Thanks to my uncle Happy_face_or_smiley #yay
def convert_emojis(text):
for emot in UNICODE_EMO:
text = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
return text
text = "game is on 🔥"
convert_emojis(text)
'game is on fire'
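The helper files emoticons.py and emojis.py are not shown here; a minimal sketch of the dictionary structure the two converters assume (keys are regex-escaped patterns, values are descriptions), consistent with the outputs above:
EMOTICONS = {u":\\)": "Happy face or smiley", u":\\-\\)": "Happy face smiley"}
UNICODE_EMO = {u"🔥": ":fire:"}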
def process_text_emoji(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: decode unicode characters
s = s.apply(unidecode.unidecode)
# step: howareyou ==> how are you
s = s.apply(lambda x: ' '.join(wordninja.split(x)))
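# (note: wordninja also strips punctuation, so emoticons like ':)' are
# already gone before the converters below run; the tables further down
# confirm no Happy_face tokens appear. running convert_emoticons before
# wordninja would preserve them)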
# step: expand emoticons and emojis
s = s.apply(convert_emoticons)
s = s.apply(convert_emojis)
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'<3': 'love',
'awsm': 'awesome',
'b4': 'before',
'bc': 'because',
'bday': 'birthday',
'dm': 'direct message',
'doin': 'doing',
'gr8': 'great',
'gud': 'good',
'h8': 'hate',
'hw': 'how',
'idc': 'i do not care',
'idgaf': 'hate',
'irl': 'in real life',
'k': 'okay',
'lv': 'love',
'm': 'am',
'r': 'are',
'rt': 'retweet',
'ttyl': 'talk to you later',
'ty': 'thank you',
'u': 'you',
'wlcm': 'welcome',
'wtf': 'hate',
'xoxo': 'love',
'y': 'why',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
text = "rt text2num! yesyes gud www.xy.com amazing"
process_text(text)
['retwet', 'textnum', 'yes', 'god', 'wwxycom', 'amazing']
def add_features_emoji(df):
# we need to remove url first
df[mcle] = df[maincol].str.replace(r'http\S+|www\.\S+', '', case=False)
df[mcle] = df[mcle].apply(process_text_emoji)
df[mce] = df[mcle].str.join(' ')
return df
add_features_emoji(df.head().copy())
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | [#fingerprint, #Pregnancy, #android, #apps, #beautiful, #cute, #health, #igers, #iphoneonly, #iphonesia, #iphone] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | [#yay, #Sony, #Xperia, #S, #sonyexperias…] | #yay #Sony #Xperia #S #sonyexperias… | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
2 | 2 | 3 | 0.0 | We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu | [love, would, go, talk, makemories, unplug, relax, iphone, smartphone, wifi, conect] | love would go talk makemories unplug relax iphone smartphone wifi conect | [#talk, #makememories, #unplug, #relax, #iphone, #smartphone, #wifi, #connect...] | #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... | [love, would, go, talk, make, memory, unplug, relax, iphone, smartphone, wi, fi, conect] | love would go talk make memory unplug relax iphone smartphone wi fi conect |
3 | 3 | 4 | 0.0 | I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/ | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home | [#iphone, #cute, #daventry, #home] | #iphone #cute #daventry #home | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home |
4 | 4 | 5 | 1.0 | What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support! | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport | [] | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport |
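wordninja is what splits glued-together words in the emoji pipeline, which is why compound hashtags come apart in tweet_lst_clean_emoji; it can also over-split unfamiliar tokens (xperia ==> x peri above). A quick check of the expected behaviour:
wordninja.split('makememories') # ['make', 'memories'], later lemmatized to 'memory'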
%%time
# This takes a long time on the full data
df = parallelize_dataframe(df, add_features_emoji)
CPU times: user 18.9 ms, sys: 18.4 ms, total: 37.3 ms Wall time: 5.26 s
df.head()
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | [#fingerprint, #Pregnancy, #android, #apps, #beautiful, #cute, #health, #igers, #iphoneonly, #iphonesia, #iphone] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | [#yay, #Sony, #Xperia, #S, #sonyexperias…] | #yay #Sony #Xperia #S #sonyexperias… | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
2 | 2 | 3 | 0.0 | We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu | [love, would, go, talk, makemories, unplug, relax, iphone, smartphone, wifi, conect] | love would go talk makemories unplug relax iphone smartphone wifi conect | [#talk, #makememories, #unplug, #relax, #iphone, #smartphone, #wifi, #connect...] | #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... | [love, would, go, talk, make, memory, unplug, relax, iphone, smartphone, wi, fi, conect] | love would go talk make memory unplug relax iphone smartphone wi fi conect |
3 | 3 | 4 | 0.0 | I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/ | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home | [#iphone, #cute, #daventry, #home] | #iphone #cute #daventry #home | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home |
4 | 4 | 5 | 1.0 | What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support! | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport | [] | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport |
note = """
Look the clean tweet properly:
- look for url links
- look for ellipsis e.g. #sonyexperias…
- convert emoji and emoticons eg. :) with library emot
""";
def create_text_features(df):
# total
df['total_length'] = df[maincol].apply(len)
# num of word and sentence
df['num_words'] = df[maincol].apply(lambda x: len(x.split()))
df['num_sent']=df[maincol].apply(lambda x:
len(re.findall("\n",str(x)))+1)
df['num_unique_words'] = df[maincol].apply(
lambda x: len(set(w for w in x.split())))
df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df['num_uppercase'] = df[maincol].apply(
lambda x: sum(1 for c in x if c.isupper()))
# num of certain characters ! ? . @
df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))
df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))
df['num_punctuation'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '.,;:'))
df['num_symbols'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '*&$%'))
df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
# average
df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df['avg_uppercase'] = df.apply(
lambda row: float(row['num_uppercase'])/float(row['total_length']),
axis=1)
df['avg_unique'] = df['num_unique_words'] / df['num_words']
return df
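A quick sanity check of these counts on one made-up tweet:
demo = pd.DataFrame({maincol: ["What amazing service! I pay $19.95?"]})
demo = create_text_features(demo)
# total_length=35, num_words=6, num_exclamation_marks=1,
# num_question_marks=1, num_symbols=1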
%%time
df = parallelize_dataframe(df, create_text_features)
CPU times: user 25.9 ms, sys: 22.1 ms, total: 48 ms Wall time: 244 ms
df[[maincol,mcl,mcle]].tail()
tweet | tweet_lst_clean | tweet_lst_clean_emoji | |
---|---|---|---|
95 | #organic #farming #orange and #apple #snack *for more high quality photos… https://www.instagram.com/p/BHPAdCnDYtz/ | [organic, farming, orange, aple, snack, high, quality, photo] | [organic, farming, orange, aple, snack, high, quality, photo] |
96 | Fellow passenger owns @oneplus One introduced myself and happen to speak how amazing the big boy is! #OnePlus growing strong! | [felow, pasenger, owns, oneplus, one, introduced, hapen, speak, amazing, big, boy, oneplus, growing, strong] | [felow, pasenger, owns, one, plus, one, introduced, hapen, speak, amazing, big, boy, one, plus, growing, strong] |
97 | New dress, thanks to momma #apple #dress #littleblackdress #dresses #bestmomma #cute #beautiful ... http://dlvr.it/5PJzxv | [new, dres, thanks, moma, aple, dres, litleblackdres, dreses, bestmoma, cute, beautiful] | [new, dres, thanks, moma, aple, dres, litle, black, dres, dreses, best, moma, cute, beautiful] |
98 | Finally got the android 2.1 update for the moto milestone :) happy but came 2 weeks late :S #android #motorola | [finaly, got, android, update, moto, milestone, hapy, came, weks, late, android, motorola] | [finaly, got, android, update, moto, milestone, hapy, came, weks, late, android, motorola] |
99 | ok. second time in seven days that my #apple #magicmouse batteries need to be replaced. #thisaintright #apple xoxo, antigirl | [ok, second, time, seven, day, aple, magicmouse, bateries, ned, replaced, thisaintright, aple, xo, antigirl] | [ok, second, time, seven, day, aple, magic, mouse, bateries, ned, replaced, right, aple, x, oxo, anti, girl] |
arr_all_words = df[mcle].sum()
least_freq_words = np.unique(arr_all_words,return_counts=True)[0][-100:] # NOTE: alphabetical tail, not truly least frequent (see below)
least_freq_words
array(['towards', 'trans', 'travel', 'tre', 'trending', 'trip', 'trols', 'truth', 'tt', 'tv', 'twe', 'twet', 'tweted', 'twitch', 'twiter', 'two', 'type', 'u', 'ude', 'ugh', 'ugo', 'uk', 'um', 'uncle', 'unles', 'unplug', 'unresponsive', 'update', 'upgrade', 'ur', 'urban', 'useles', 'v', 'vacation', 'verification', 'vertical', 'vet', 'vg', 'vgr', 'vi', 'via', 'vibe', 'vidal', 'video', 'vien', 'view', 'vocation', 'w', 'wag', 'waiter', 'waiting', 'wal', 'walpaper', 'walpapers', 'want', 'wanting', 'watch', 'water', 'way', 'weather', 'wek', 'weks', 'white', 'wi', 'wife', 'wil', 'wild', 'window', 'wired', 'without', 'wods', 'woman', 'work', 'worldwide', 'would', 'wq', 'writing', 'wtc', 'x', 'xbox', 'xf', 'xp', 'yad', 'yay', 'year', 'yec', 'yelow', 'yes', 'young', 'youtube', 'z', 'za', 'zand', 'zar', 'zeland', 'zi', 'zn', 'zp', 'zu', 'zw'], dtype='<U13')
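Since np.unique returns the words sorted alphabetically, the slice above is really the last 100 unique words in alphabetical order. A sketch that sorts by count to get the actual least-frequent words:
words, counts = np.unique(arr_all_words, return_counts=True)
least_freq_words = words[np.argsort(counts)][:100] # the 100 smallest counts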
%%writefile sentiment_analysis_data_processing.py
# load the path
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
# load the libraries
import numpy as np
import pandas as pd
import time
import re
import string
from urllib.parse import urlparse
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords
import unidecode
import wordninja
time_start = time.time()
# Load the data
df_train_raw = pd.read_csv('../data/raw/train.csv')
df_test_raw = pd.read_csv('../data/raw/test.csv')
df = df_train_raw.append(df_test_raw)
df = df.reset_index()
# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
# ==================== Useful functions ==============
def parallelize_dataframe(df, func):
ncores = mp.cpu_count()
df_split = np.array_split(df, ncores)
pool = mp.Pool(ncores)
df = pd.concat(pool.map(func, df_split))
pool.close()
pool.join()
return df
def is_url(url):
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False
#================== Text processing =================
def process_text(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'<3': 'love',
'awsm': 'awesome',
'b4': 'before',
'bc': 'because',
'bday': 'birthday',
'dm': 'direct message',
'doin': 'doing',
'gr8': 'great',
'gud': 'good',
'h8': 'hate',
'hw': 'how',
'idc': 'i do not care',
'idgaf': 'hate',
'irl': 'in real life',
'k': 'okay',
'lv': 'love',
'm': 'am',
'r': 'are',
'rt': 'retweet',
'ttyl': 'talk to you later',
'ty': 'thank you',
'u': 'you',
'wlcm': 'welcome',
'wtf': 'hate',
'xoxo': 'love',
'y': 'why',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
def add_features(df):
df[mcl] = df[maincol].apply(process_text)
df[mc] = df[mcl].str.join(' ')
df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
#df['hashtags'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df['hashtags'] = df['hashtags_lst'].str.join(' ')
return df
print("Creating clean tweet and hashtags ...")
df = parallelize_dataframe(df, add_features)
#======================= Text Feature Generation =====
def create_text_features(df):
# total
df['total_length'] = df[maincol].apply(len)
# num of word and sentence
df['num_words'] = df[maincol].apply(lambda x: len(x.split()))
df['num_sent']=df[maincol].apply(lambda x:
len(re.findall("\n",str(x)))+1)
df['num_unique_words'] = df[maincol].apply(
lambda x: len(set(w for w in x.split())))
df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df['num_uppercase'] = df[maincol].apply(
lambda x: sum(1 for c in x if c.isupper()))
# num of certain characters ! ? . @
df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))
df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))
df['num_punctuation'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '.,;:'))
df['num_symbols'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '*&$%'))
df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
# average
df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df['avg_uppercase'] = df.apply(
lambda row: float(row['num_uppercase'])/float(row['total_length']),
axis=1)
df['avg_unique'] = df['num_unique_words'] / df['num_words']
return df
print("Adding Text features ...")
df = parallelize_dataframe(df, create_text_features)
#===================== Emoticons =====================
from emoticons import *
def convert_emoticons(text):
for emot in EMOTICONS:
text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
return text
#===================== Save clean data =========================
df.to_csv('../data/processed/df_combined_clean.csv',index=False)
time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
print(f"Data cleaning finished in {m} min {s:.2f} sec.")
Overwriting sentiment_analysis_data_processing.py
%%writefile sentiment_analysis_data_processing.py
# load the path
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
# load the libraries
import numpy as np
import pandas as pd
import time
import re
import string
from urllib.parse import urlparse
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords
import unidecode
import wordninja
time_start = time.time()
# Load the data
df_train_raw = pd.read_csv('../data/raw/train.csv')
df_test_raw = pd.read_csv('../data/raw/test.csv')
df = df_train_raw.append(df_test_raw)
df = df.reset_index()
# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
# ==================== Useful functions ==============
def parallelize_dataframe(df, func):
ncores = mp.cpu_count()
df_split = np.array_split(df, ncores)
pool = mp.Pool(ncores)
df = pd.concat(pool.map(func, df_split))
pool.close()
pool.join()
return df
def is_url(url):
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False
#================== Text processing =================
def process_text(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'<3': 'love',
'awsm': 'awesome',
'b4': 'before',
'bc': 'because',
'bday': 'birthday',
'dm': 'direct message',
'doin': 'doing',
'gr8': 'great',
'gud': 'good',
'h8': 'hate',
'hw': 'how',
'idc': 'i do not care',
'idgaf': 'hate',
'irl': 'in real life',
'k': 'okay',
'lv': 'love',
'm': 'am',
'r': 'are',
'rt': 'retweet',
'ttyl': 'talk to you later',
'ty': 'thank you',
'u': 'you',
'wlcm': 'welcome',
'wtf': 'hate',
'xoxo': 'love',
'y': 'why',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
def add_features(df):
df[mcl] = df[maincol].apply(process_text)
df[mc] = df[mcl].str.join(' ')
df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
#df['hashtags'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df['hashtags'] = df['hashtags_lst'].str.join(' ')
return df
print("Creating clean tweet and hashtags ...")
df = parallelize_dataframe(df, add_features)
#======================= Text Feature Generation =====
def create_text_features(df):
# total
df['total_length'] = df[maincol].apply(len)
# num of word and sentence
df['num_words'] = df[maincol].apply(lambda x: len(x.split()))
df['num_sent']=df[maincol].apply(lambda x:
len(re.findall("\n",str(x)))+1)
df['num_unique_words'] = df[maincol].apply(
lambda x: len(set(w for w in x.split())))
df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df['num_uppercase'] = df[maincol].apply(
lambda x: sum(1 for c in x if c.isupper()))
# num of certain characters ! ? . @
df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))
df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))
df['num_punctuation'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '.,;:'))
df['num_symbols'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '*&$%'))
df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
# average
df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df['avg_uppercase'] = df.apply(
lambda row: float(row['num_uppercase'])/float(row['total_length']),
axis=1)
df['avg_unique'] = df['num_unique_words'] / df['num_words']
return df
print("Adding Text features ...")
df = parallelize_dataframe(df, create_text_features)
#===================== Manipulating emoticons and emojis
from emojis import *
from emoticons import *
def convert_emoticons(text):
for emot in EMOTICONS:
text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
return text
def convert_emojis(text):
for emot in UNICODE_EMO:
text = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
return text
def process_text_emoji(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: decode unicode characters
s = s.apply(unidecode.unidecode)
# step: howareyou ==> how are you
s = s.apply(lambda x: ' '.join(wordninja.split(x)))
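# (note: wordninja also strips punctuation, so emoticons like ':)' are
# already gone before the converters below run)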
# step: expand emoticons and emojis
s = s.apply(convert_emoticons)
s = s.apply(convert_emojis)
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'u': 'you', 'y': 'why', 'r': 'are',
'doin': 'doing', 'hw': 'how',
'k': 'okay', 'm': 'am', 'b4': 'before',
'idc': "i do not care", 'ty': 'thankyou',
'wlcm': 'welcome', 'bc': 'because',
'<3': 'love', 'xoxo': 'love',
'ttyl': 'talk to you later', 'gr8': 'great',
'bday': 'birthday', 'awsm': 'awesome',
'gud': 'good', 'h8': 'hate',
'lv': 'love', 'dm': 'direct message',
'rt': 'retweet', 'wtf': 'hate',
'idgaf': 'hate','irl': 'in real life',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
def add_features_emoji(df):
# we need to remove url first
df[mcle] = df[maincol].str.replace(r'http\S+|www\.\S+', '', case=False)
df[mcle] = df[mcle].apply(process_text_emoji)
df[mce] = df[mcle].str.join(' ')
return df
print("Adding Emoticons and emoji features ...")
df = parallelize_dataframe(df, add_features_emoji)
#===================== Save clean data =========================
df.to_csv('../data/processed/df_combined_clean.csv',index=False)
time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
print(f"Data cleaning finished in {m} min {s:.2f} sec.")
# Data cleaning finished in 13.0 min 28.34 sec.
Overwriting sentiment_analysis_data_processing.py
# !open .
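The saved script can then be run from a shell or directly from the notebook:
# !python sentiment_analysis_data_processing.py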