Twitter sentiment analysis.
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import mlxtend
import plotly_express as px
pd.options.plotting.backend = "plotly"
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',1000)
import time,os,json,sys
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print([(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,mlxtend,px]])
#=========Visualization
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])
#=======OTHERS
import ast
import scipy
import multiprocessing as mp
import gc
import operator
from collections import defaultdict
#=====Warnings
import warnings
warnings.simplefilter("ignore")
# plotting warnings
[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('sklearn', '0.23.1'), ('mlxtend', '0.17.0'), ('plotly_express', '0.4.1')]
[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import ast
# Combined (train + test) tweets, already cleaned by an upstream notebook.
df_combined = pd.read_csv('../data/processed/df_combined_clean.csv')
# Variables
# Column-name shortcuts used throughout the notebook:
#   mc   = 'tweet_clean'            cleaned text
#   mcl  = 'tweet_lst_clean'        cleaned token list
#   mce  = 'tweet_clean_emoji'      cleaned text incl. emoji handling
#   mcle = 'tweet_lst_clean_emoji'  token list incl. emoji handling
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
# CSV stores the token lists as strings like "['a', 'b']"; parse them
# back into real Python lists (literal_eval is safe, unlike eval).
df_combined[mcl] = df_combined[mcl].apply(ast.literal_eval)
df_combined[mcle] = df_combined[mcle].apply(ast.literal_eval)
# Rows with a label are the training set; unlabeled rows are the test set.
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"shape df_train: {df_train.shape}")
print(f"shape df_test: {df_test.shape}")
df_train.head(2).append(df_train.tail(2))
shape df_train: (7920, 24) shape df_test: (1953, 24)
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
7918 | 7918 | 7919 | 0.0 | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ | [finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget] | finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget | ['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#'] | #smart #pocket #wifi #ipad #samsung #s3 #gadget # | 133 | 16 | 1 | 16 | 1 | 5 | 1 | 0 | 3 | 0 | 0 | 7.375000 | 0.037594 | 1.0 | [finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget] | finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget |
7919 | 7919 | 7920 | 0.0 | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ | [aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew] | aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew | ['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…'] | #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… | 129 | 13 | 1 | 13 | 5 | 12 | 3 | 0 | 2 | 0 | 0 | 9.000000 | 0.093023 | 1.0 | [aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew] | aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew |
# Work on the labelled training split only from here on.
df = df_train
# NOTE: in this dataset label 0.0 = positive tweet, 1.0 = negative tweet.
df_pos = df[df['label']==0.0] # it's 0 NOT 1
df_neg = df[df['label']==1.0]
# Visualize the class balance of the target.
sns.countplot(df[target])
<matplotlib.axes._subplots.AxesSubplot at 0x7f8fe4062f10>
# Class balance via the plotly pandas backend.
df[target].value_counts().plot.bar()
# Take a reproducible 1000-tweet sample for the quick tfidf/pca/kmeans view.
# random_state=SEED makes the sample (and hence the plot) repeatable.
df1 = df.sample(1000, random_state=SEED)
# FIX: tfidf was computed on the full `df` but assigned into the sample
# `df1` (pandas silently aligned on index). Compute it on the sample so the
# tfidf, pca and kmeans steps below all see the same corpus.
df1['tfidf'] = df1[mce].pipe(hero.tfidf)
df1['pca'] = df1['tfidf'].pipe(hero.pca)
df1['kmeans_labels'] = df1['tfidf'].pipe(hero.kmeans,n_clusters=2)
# 2-D scatter of the sample, colored by unsupervised cluster.
hero.scatterplot(df1, 'pca', color='kmeans_labels')
df.head(2).T
0 | 1 | |
---|---|---|
index | 0 | 1 |
id | 1 | 2 |
label | 0 | 0 |
tweet | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ |
tweet_lst_clean | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] |
tweet_clean | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias |
hashtags_lst | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] |
hashtags | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | #yay #Sony #Xperia #S #sonyexperias… |
total_length | 128 | 131 |
num_words | 13 | 17 |
num_sent | 1 | 1 |
num_unique_words | 13 | 17 |
num_words_title | 2 | 5 |
num_uppercase | 5 | 12 |
num_exclamation_marks | 0 | 0 |
num_question_marks | 0 | 0 |
num_punctuation | 2 | 3 |
num_symbols | 0 | 0 |
num_digits | 0 | 0 |
avg_word_len | 8.92308 | 6.76471 |
avg_uppercase | 0.0390625 | 0.0916031 |
avg_unique | 1 | 1 |
tweet_lst_clean_emoji | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] |
tweet_clean_emoji | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
df[mcle].head(2)[0]
['fingerprint', 'pregnancy', 'test', 'android', 'aps', 'beautiful', 'cute', 'health', 'iger', 'iphone', 'iphones', 'iphone']
# Flatten the token-list column into one flat word list per split.
# FIX: Series.sum() on a column of lists repeatedly concatenates Python
# lists (accidental O(n^2)); itertools.chain.from_iterable produces the
# identical flat list in a single linear pass.
from itertools import chain
arr_all_words = list(chain.from_iterable(df[mcle]))
arr_pos_words = list(chain.from_iterable(df[df[target]==0.0][mcle]))
arr_neg_words = list(chain.from_iterable(df[df[target]==1.0][mcle]))
print(f"len arr_all_words: {len(arr_all_words)}")
print(f"len arr_pos_words: {len(arr_pos_words)}")
print(f"len arr_neg_words: {len(arr_neg_words)}")
len arr_all_words: 112290 len arr_pos_words: 89487 len arr_neg_words: 22803
from collections import Counter
# Word-frequency tables (word, count) sorted most-common-first,
# overall and per sentiment class.
df_freq = pd.DataFrame(Counter(arr_all_words).most_common())
df_freq_pos = pd.DataFrame(Counter(arr_pos_words).most_common())
df_freq_neg = pd.DataFrame(Counter(arr_neg_words).most_common())
df_freq_pos.head()
0 | 1 | |
---|---|---|
0 | iphone | 4153 |
1 | aple | 1737 |
2 | samsung | 1424 |
3 | pic | 1253 |
4 | insta | 1247 |
# Alternative frequency table via numpy: unique words with their counts,
# ordered alphabetically (unlike Counter.most_common, which orders by count).
df_freq = pd.DataFrame(np.unique(arr_all_words,return_counts=True)).T
df_freq.head(2).append(df_freq.tail(2))
0 | 1 | |
---|---|---|
0 | aa | 7 |
1 | aaa | 2 |
10603 | zy | 13 |
10604 | zz | 1 |
# NLTK frequency distribution over all tokens (Counter-like, with extras
# such as .plot and .hapaxes).
fdist = nltk.FreqDist(arr_all_words)
# Peek at FreqDist's public API.
print([i for i in dir(fdist) if i[0]!='_'])
# (captured notebook output from the print above)
['B', 'N', 'Nr', 'clear', 'copy', 'elements', 'freq', 'fromkeys', 'get', 'hapaxes', 'items', 'keys', 'max', 'most_common', 'pformat', 'plot', 'pop', 'popitem', 'pprint', 'r_Nr', 'setdefault', 'subtract', 'tabulate', 'unicode_repr', 'update', 'values']
# Top-20 words as a DataFrame.
df_freq = pd.DataFrame(fdist.most_common(20))
df_freq.head()
0 | 1 | |
---|---|---|
0 | iphone | 4831 |
1 | aple | 3183 |
2 | samsung | 1574 |
3 | new | 1381 |
4 | pic | 1305 |
fdist.plot(20)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8f33605f50>
df_freq = hero.top_words(df[mce]).to_frame()
df_freq.head(2)
tweet_clean_emoji | |
---|---|
iphone | 4831 |
aple | 3183 |
hero.top_words(df[mce],normalize=True).to_frame().head().mul(100)
tweet_clean_emoji | |
---|---|
iphone | 4.302253 |
aple | 2.834625 |
samsung | 1.401728 |
new | 1.229851 |
pic | 1.162169 |
# texthero one-liner wordcloud over all cleaned tweets.
hero.wordcloud(df[mce])
from wordcloud import WordCloud
# Side-by-side wordclouds: positive vocabulary (left) vs negative (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[30, 30])
wordcloud1 = WordCloud( background_color='white',
                        width=800,
                        height=600
                      ).generate(' '.join(arr_pos_words))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Positive Tweets',fontsize=40);
wordcloud2 = WordCloud( background_color='white',
                        width=800,
                        height=600
                      ).generate(' '.join(arr_neg_words))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Negative Tweets',fontsize=40);
# Interactive (plotly) wordcloud of the positive tweets.
from plotly_wordcloud import plotly_wordcloud
text = " ".join(arr_pos_words)
fig = plotly_wordcloud(text)
fig['layout']['title'] = 'Wordcloud for +Ve Tweets'
fig['layout']['height'] = 800
fig['layout']['width'] = 800
py.iplot(fig)
import inspect
# inspect.getsourcelines(plotly_wordcloud)
fig = px.treemap(df_pos_uni.head(20),
path=['Word'],values='Count',
title='Top +Ve Twitter Words')
fig['layout']['title']['x'] = 0.5
fig.show()
fig = px.treemap(df_neg_uni.head(20),path=['Word'],values='Count')
fig.update_layout(
title={
'text': "Top -Ve Twitter Words",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
df.columns
Index(['index', 'id', 'label', 'tweet', 'tweet_lst_clean', 'tweet_clean', 'hashtags_lst', 'hashtags', 'total_length', 'num_words', 'num_sent', 'num_unique_words', 'num_words_title', 'num_uppercase', 'num_exclamation_marks', 'num_question_marks', 'num_punctuation', 'num_symbols', 'num_digits', 'avg_word_len', 'avg_uppercase', 'avg_unique', 'tweet_lst_clean_emoji', 'tweet_clean_emoji'], dtype='object')
import warnings
warnings.simplefilter("ignore")
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# Boolean mask over df_train: True where the tweet is negative (label == 1).
NEG_TWEETS = df_train[target] == 1
def compare_distplots(df_train, features, test_df=None, neg_mask=None):
    """Plot per-feature distribution comparisons, one row of axes per feature.

    Left column: positive vs negative class distributions within the
    training set. Right column: training vs test distribution (a quick
    data-drift check).

    Parameters
    ----------
    df_train : pd.DataFrame
        Labelled training data containing every column in `features`.
    features : list of str
        Numeric column names to plot.
    test_df : pd.DataFrame, optional
        Test set for the right-hand plots. Defaults to the module-level
        ``df_test`` (kept for backward compatibility with existing calls).
    neg_mask : pd.Series of bool, optional
        Mask of negative tweets within `df_train`. Defaults to the
        module-level ``NEG_TWEETS``.
    """
    # Fall back to the notebook globals the original version read implicitly.
    if neg_mask is None:
        neg_mask = NEG_TWEETS
    if test_df is None:
        test_df = df_test
    # squeeze=False keeps `axes` 2-D even when len(features) == 1,
    # so axes[i][j] indexing below never breaks.
    fig, axes = plt.subplots(ncols=2, nrows=len(features), figsize=(20, 50), dpi=100, squeeze=False)
    for i, feature in enumerate(features):
        sns.distplot(df_train.loc[~neg_mask][feature], label='Positive', ax=axes[i][0], color='green')
        sns.distplot(df_train.loc[neg_mask][feature], label='Negative', ax=axes[i][0], color='red')
        sns.distplot(df_train[feature], label='Training', ax=axes[i][1])
        sns.distplot(test_df[feature], label='Test', ax=axes[i][1])
        for j in range(2):
            axes[i][j].set_xlabel('')
            axes[i][j].tick_params(axis='x', labelsize=20)
            axes[i][j].tick_params(axis='y', labelsize=20)
            axes[i][j].legend(fontsize=20)
        axes[i][0].set_title(f'{feature} Target Distribution in Training Set', fontsize=20)
        axes[i][1].set_title(f'{feature} Training & Test Set Distribution', fontsize=20)
    plt.show()
# Count-style features: class separation vs train/test drift.
features = ['total_length', 'num_words', 'num_sent',
'num_unique_words', 'num_words_title', 'num_uppercase',
'num_exclamation_marks' ]
compare_distplots(df_train,features)
# Punctuation/ratio features, same comparison.
features = ['num_question_marks',
'num_punctuation','num_symbols', 'num_digits',
'avg_word_len', 'avg_uppercase']
compare_distplots(df_train,features)
# Interpretation reminder for the distribution plots above.
note = """
NOTE:
The distribution must be different between label +ve and -ve
but must be similar between train and test sets.
"""
from wordcloud import STOPWORDS
def generate_ngrams(text, n_gram=1):
    """Return the n-grams of `text` as space-joined strings.

    The text is lower-cased and split on single spaces; empty tokens and
    wordcloud STOPWORDS are dropped before the n-grams are formed.
    """
    tokens = []
    for word in text.lower().split(' '):
        # '' can appear when the text contains consecutive spaces.
        if word != '' and word not in STOPWORDS:
            tokens.append(word)
    # zip n shifted copies of the token list to slide an n-wide window.
    shifted = [tokens[i:] for i in range(n_gram)]
    return [' '.join(gram) for gram in zip(*shifted)]
def get_ngram_dfs(df_train, NEG_TWEETS, col, n_gram=1):
    """Build per-class n-gram frequency tables from `df_train[col]`.

    Returns [df_neg_ngrams, df_pos_ngrams]; each DataFrame has column 0 =
    n-gram text and column 1 = count, ordered most-frequent first.
    """
    def _ngram_counts(texts):
        # Accumulate n-gram counts across all tweets in `texts`.
        counts = defaultdict(int)
        for tweet in texts:
            for gram in generate_ngrams(tweet, n_gram=n_gram):
                counts[gram] += 1
        return counts

    neg_counts = _ngram_counts(df_train[NEG_TWEETS][col])
    pos_counts = _ngram_counts(df_train[~NEG_TWEETS][col])

    def _to_frame(counts):
        # Sort ascending by count, then reverse -- kept exactly as the
        # original so that tie ordering is unchanged.
        ordered = sorted(counts.items(), key=lambda kv: kv[1])[::-1]
        return pd.DataFrame(ordered)

    return [_to_frame(neg_counts), _to_frame(pos_counts)]
def plot_neg_pos_ngrams(n_gram_name,
                        df_neg_ngrams, df_pos_ngrams, N=20):
    """Plot the top-N n-grams per class as side-by-side horizontal bars.

    `df_neg_ngrams` / `df_pos_ngrams` are frequency DataFrames from
    get_ngram_dfs (column 0 = n-gram, column 1 = count); negative tweets
    are drawn on the left in red, positive on the right in green.
    """
    fontsize = 25
    fig, axes = plt.subplots(ncols=2, figsize=(18, 20), dpi=100)
    plt.tight_layout()
    sns.barplot(y=df_neg_ngrams[0].values[:N], x=df_neg_ngrams[1].values[:N], ax=axes[0], color='red')
    sns.barplot(y=df_pos_ngrams[0].values[:N], x=df_pos_ngrams[1].values[:N], ax=axes[1], color='green')
    for ax in axes:
        ax.spines['right'].set_visible(False)
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.tick_params(axis='x', labelsize=fontsize)
        ax.tick_params(axis='y', labelsize=fontsize)
    axes[0].set_title(f'Top {N} most common {n_gram_name} in -Ve Tweets', fontsize=fontsize)
    axes[1].set_title(f'Top {N} most common {n_gram_name} in +Ve Tweets', fontsize=fontsize)
    plt.show()
# Top unigrams / bigrams / trigrams per class.
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=1)
# FIX: this n_gram=1 plot was mislabeled 'Bigrams'.
plot_neg_pos_ngrams('Unigrams',df1,df2,N=20)
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=2)
plot_neg_pos_ngrams('Bigrams',df1,df2,N=20)
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=3)
plot_neg_pos_ngrams('Trigrams',df1,df2,N=20)
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent words in `corpus` as (word, count) pairs.

    Uses CountVectorizer with English stop words removed; with n=None the
    full vocabulary is returned, sorted by descending count.
    """
    vectorizer = CountVectorizer(stop_words = 'english').fit(corpus)
    # Column-wise sums of the document-term matrix = corpus-wide counts.
    counts = vectorizer.transform(corpus).sum(axis=0)
    freq = [(word, counts[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    freq.sort(key=operator.itemgetter(1), reverse=True)
    return freq[:n]
df1.head()
0 | 1 | |
---|---|---|
0 | aple pic twiter | 10 |
1 | fuck aple aple | 7 |
2 | hate aple retwet | 7 |
3 | aple suck aple | 7 |
4 | aple aple iphone | 6 |
# df1 currently holds the trigram table from the last get_ngram_dfs call
# (column 0 = n-gram, column 1 = count). Reverse the top 20 so the most
# frequent ends up on top, and plot via the plotly pandas backend.
df1.head(20)[::-1].plot.bar(x=1,y=0)
# Re-derive the class splits (same as earlier; label 0.0 = positive tweet).
df_pos = df[df[target]==0.0]
df_neg = df[df[target]==1.0]
df_pos.head(2)
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
# Top-20 unigrams per class via the CountVectorizer helper; reverse the
# rows so the most frequent word is drawn last (topmost bar).
pos_uni = get_top_n_words(df_pos[mce],20)
neg_uni = get_top_n_words(df_neg[mce],20)
df_pos_uni = pd.DataFrame(pos_uni,columns=['Word','Count'])[::-1]
df_neg_uni = pd.DataFrame(neg_uni,columns=['Word','Count'])[::-1]
# pandas' plotting backend is plotly (set at the top of the file),
# so .plot.bar returns a plotly Figure that we can restyle.
fig = df_pos_uni.plot.bar(x='Count',y='Word')
fig.update_traces(marker_color='green', opacity=0.6)
fig.update_layout(title_text='Most frequent +Ve **Unigrams** ')
fig.show()
fig = df_neg_uni.plot.bar(x='Count',y='Word')
fig.update_traces(marker_color='red', opacity=0.6)
fig.update_layout(title_text='Most frequent -Ve **Unigrams** ')
fig.show()