Twitter sentiment analysis.
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import mlxtend
import plotly_express as px
pd.options.plotting.backend = "plotly"
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',1000)
import time,os,json,sys
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print([(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,mlxtend,px]])
#=========Visualization
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])
#=======OTHERS
import ast
import scipy
import multiprocessing as mp
import gc
import operator
from collections import defaultdict
#=====Warnings
import warnings
warnings.simplefilter("ignore")
# plotting warnings
[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('sklearn', '0.23.1'), ('mlxtend', '0.17.0'), ('plotly_express', '0.4.1')]
[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import ast
# Combined (train + test) tweets, already cleaned by an upstream notebook.
df_combined = pd.read_csv('../data/processed/df_combined_clean.csv')
# Variables
# Column-name shortcuts used throughout the notebook:
#   mc   = 'tweet_clean'            cleaned text
#   mcl  = 'tweet_lst_clean'        cleaned token list
#   mce  = 'tweet_clean_emoji'      cleaned text incl. emoji handling
#   mcle = 'tweet_lst_clean_emoji'  token list incl. emoji handling
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
# CSV stores the token lists as strings like "['a', 'b']"; parse them
# back into real Python lists (literal_eval is safe, unlike eval).
df_combined[mcl] = df_combined[mcl].apply(ast.literal_eval)
df_combined[mcle] = df_combined[mcle].apply(ast.literal_eval)
# Rows with a label are the training set; unlabeled rows are the test set.
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"shape df_train: {df_train.shape}")
print(f"shape df_test: {df_test.shape}")
df_train.head(2).append(df_train.tail(2))
shape df_train: (7920, 24) shape df_test: (1953, 24)
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
7918 | 7918 | 7919 | 0.0 | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ | [finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget] | finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget | ['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#'] | #smart #pocket #wifi #ipad #samsung #s3 #gadget # | 133 | 16 | 1 | 16 | 1 | 5 | 1 | 0 | 3 | 0 | 0 | 7.375000 | 0.037594 | 1.0 | [finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget] | finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget |
7919 | 7919 | 7920 | 0.0 | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ | [aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew] | aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew | ['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…'] | #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… | 129 | 13 | 1 | 13 | 5 | 12 | 3 | 0 | 2 | 0 | 0 | 9.000000 | 0.093023 | 1.0 | [aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew] | aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew |
# Work on the labelled training split only from here on.
df = df_train
# NOTE: in this dataset label 0.0 = positive tweet, 1.0 = negative tweet.
df_pos = df[df['label']==0.0] # it's 0 NOT 1
df_neg = df[df['label']==1.0]
# Visualize the class balance of the target.
sns.countplot(df[target])
<matplotlib.axes._subplots.AxesSubplot at 0x7f8fe4062f10>
# Class balance via the plotly pandas backend.
df[target].value_counts().plot.bar()
# Take a reproducible 1000-tweet sample for the quick tfidf/pca/kmeans view.
# random_state=SEED makes the sample (and hence the plot) repeatable.
df1 = df.sample(1000, random_state=SEED)
# FIX: tfidf was computed on the full `df` but assigned into the sample
# `df1` (pandas silently aligned on index). Compute it on the sample so the
# tfidf, pca and kmeans steps below all see the same corpus.
df1['tfidf'] = df1[mce].pipe(hero.tfidf)
df1['pca'] = df1['tfidf'].pipe(hero.pca)
df1['kmeans_labels'] = df1['tfidf'].pipe(hero.kmeans,n_clusters=2)
# 2-D scatter of the sample, colored by unsupervised cluster.
hero.scatterplot(df1, 'pca', color='kmeans_labels')
df.head(2).T
0 | 1 | |
---|---|---|
index | 0 | 1 |
id | 1 | 2 |
label | 0 | 0 |
tweet | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ |
tweet_lst_clean | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] |
tweet_clean | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias |
hashtags_lst | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] |
hashtags | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | #yay #Sony #Xperia #S #sonyexperias… |
total_length | 128 | 131 |
num_words | 13 | 17 |
num_sent | 1 | 1 |
num_unique_words | 13 | 17 |
num_words_title | 2 | 5 |
num_uppercase | 5 | 12 |
num_exclamation_marks | 0 | 0 |
num_question_marks | 0 | 0 |
num_punctuation | 2 | 3 |
num_symbols | 0 | 0 |
num_digits | 0 | 0 |
avg_word_len | 8.92308 | 6.76471 |
avg_uppercase | 0.0390625 | 0.0916031 |
avg_unique | 1 | 1 |
tweet_lst_clean_emoji | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] |
tweet_clean_emoji | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
df[mcle].head(2)[0]
['fingerprint', 'pregnancy', 'test', 'android', 'aps', 'beautiful', 'cute', 'health', 'iger', 'iphone', 'iphones', 'iphone']
# Flatten the token-list column into one flat word list per split.
# FIX: Series.sum() on a column of lists repeatedly concatenates Python
# lists (accidental O(n^2)); itertools.chain.from_iterable produces the
# identical flat list in a single linear pass.
from itertools import chain
arr_all_words = list(chain.from_iterable(df[mcle]))
arr_pos_words = list(chain.from_iterable(df[df[target]==0.0][mcle]))
arr_neg_words = list(chain.from_iterable(df[df[target]==1.0][mcle]))
print(f"len arr_all_words: {len(arr_all_words)}")
print(f"len arr_pos_words: {len(arr_pos_words)}")
print(f"len arr_neg_words: {len(arr_neg_words)}")
len arr_all_words: 112290 len arr_pos_words: 89487 len arr_neg_words: 22803
from collections import Counter
# Word-frequency tables (word, count) sorted most-common-first,
# overall and per sentiment class.
df_freq = pd.DataFrame(Counter(arr_all_words).most_common())
df_freq_pos = pd.DataFrame(Counter(arr_pos_words).most_common())
df_freq_neg = pd.DataFrame(Counter(arr_neg_words).most_common())
df_freq_pos.head()
0 | 1 | |
---|---|---|
0 | iphone | 4153 |
1 | aple | 1737 |
2 | samsung | 1424 |
3 | pic | 1253 |
4 | insta | 1247 |
# Alternative frequency table via numpy: unique words with their counts,
# ordered alphabetically (unlike Counter.most_common, which orders by count).
df_freq = pd.DataFrame(np.unique(arr_all_words,return_counts=True)).T
df_freq.head(2).append(df_freq.tail(2))
0 | 1 | |
---|---|---|
0 | aa | 7 |
1 | aaa | 2 |
10603 | zy | 13 |
10604 | zz | 1 |
# NLTK frequency distribution over all tokens (Counter-like, with extras
# such as .plot and .hapaxes).
fdist = nltk.FreqDist(arr_all_words)
# Peek at FreqDist's public API.
print([i for i in dir(fdist) if i[0]!='_'])
# (captured notebook output from the print above)
['B', 'N', 'Nr', 'clear', 'copy', 'elements', 'freq', 'fromkeys', 'get', 'hapaxes', 'items', 'keys', 'max', 'most_common', 'pformat', 'plot', 'pop', 'popitem', 'pprint', 'r_Nr', 'setdefault', 'subtract', 'tabulate', 'unicode_repr', 'update', 'values']
# Top-20 words as a DataFrame.
df_freq = pd.DataFrame(fdist.most_common(20))
df_freq.head()
0 | 1 | |
---|---|---|
0 | iphone | 4831 |
1 | aple | 3183 |
2 | samsung | 1574 |
3 | new | 1381 |
4 | pic | 1305 |
fdist.plot(20)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8f33605f50>
df_freq = hero.top_words(df[mce]).to_frame()
df_freq.head(2)
tweet_clean_emoji | |
---|---|
iphone | 4831 |
aple | 3183 |
hero.top_words(df[mce],normalize=True).to_frame().head().mul(100)
tweet_clean_emoji | |
---|---|
iphone | 4.302253 |
aple | 2.834625 |
samsung | 1.401728 |
new | 1.229851 |
pic | 1.162169 |
# texthero one-liner wordcloud over all cleaned tweets.
hero.wordcloud(df[mce])
from wordcloud import WordCloud
# Side-by-side wordclouds: positive vocabulary (left) vs negative (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[30, 30])
wordcloud1 = WordCloud( background_color='white',
                        width=800,
                        height=600
                      ).generate(' '.join(arr_pos_words))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Positive Tweets',fontsize=40);
wordcloud2 = WordCloud( background_color='white',
                        width=800,
                        height=600
                      ).generate(' '.join(arr_neg_words))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Negative Tweets',fontsize=40);
# Interactive (plotly) wordcloud of the positive tweets.
from plotly_wordcloud import plotly_wordcloud
text = " ".join(arr_pos_words)
fig = plotly_wordcloud(text)
fig['layout']['title'] = 'Wordcloud for +Ve Tweets'
fig['layout']['height'] = 800
fig['layout']['width'] = 800
py.iplot(fig)
import inspect
# inspect.getsourcelines(plotly_wordcloud)
fig = px.treemap(df_pos_uni.head(20),
path=['Word'],values='Count',
title='Top +Ve Twitter Words')
fig['layout']['title']['x'] = 0.5
fig.show()
fig = px.treemap(df_neg_uni.head(20),path=['Word'],values='Count')
fig.update_layout(
title={
'text': "Top -Ve Twitter Words",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
df.columns
Index(['index', 'id', 'label', 'tweet', 'tweet_lst_clean', 'tweet_clean', 'hashtags_lst', 'hashtags', 'total_length', 'num_words', 'num_sent', 'num_unique_words', 'num_words_title', 'num_uppercase', 'num_exclamation_marks', 'num_question_marks', 'num_punctuation', 'num_symbols', 'num_digits', 'avg_word_len', 'avg_uppercase', 'avg_unique', 'tweet_lst_clean_emoji', 'tweet_clean_emoji'], dtype='object')
import warnings
warnings.simplefilter("ignore")
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# Boolean mask over df_train: True where the tweet is negative (label == 1).
NEG_TWEETS = df_train[target] == 1
def compare_distplots(df_train, features, test_df=None, neg_mask=None):
    """Plot per-feature distribution comparisons, one row of axes per feature.

    Left column: positive vs negative class distributions within the
    training set. Right column: training vs test distribution (a quick
    data-drift check).

    Parameters
    ----------
    df_train : pd.DataFrame
        Labelled training data containing every column in `features`.
    features : list of str
        Numeric column names to plot.
    test_df : pd.DataFrame, optional
        Test set for the right-hand plots. Defaults to the module-level
        ``df_test`` (kept for backward compatibility with existing calls).
    neg_mask : pd.Series of bool, optional
        Mask of negative tweets within `df_train`. Defaults to the
        module-level ``NEG_TWEETS``.
    """
    # Fall back to the notebook globals the original version read implicitly.
    if neg_mask is None:
        neg_mask = NEG_TWEETS
    if test_df is None:
        test_df = df_test
    # squeeze=False keeps `axes` 2-D even when len(features) == 1,
    # so axes[i][j] indexing below never breaks.
    fig, axes = plt.subplots(ncols=2, nrows=len(features), figsize=(20, 50), dpi=100, squeeze=False)
    for i, feature in enumerate(features):
        sns.distplot(df_train.loc[~neg_mask][feature], label='Positive', ax=axes[i][0], color='green')
        sns.distplot(df_train.loc[neg_mask][feature], label='Negative', ax=axes[i][0], color='red')
        sns.distplot(df_train[feature], label='Training', ax=axes[i][1])
        sns.distplot(test_df[feature], label='Test', ax=axes[i][1])
        for j in range(2):
            axes[i][j].set_xlabel('')
            axes[i][j].tick_params(axis='x', labelsize=20)
            axes[i][j].tick_params(axis='y', labelsize=20)
            axes[i][j].legend(fontsize=20)
        axes[i][0].set_title(f'{feature} Target Distribution in Training Set', fontsize=20)
        axes[i][1].set_title(f'{feature} Training & Test Set Distribution', fontsize=20)
    plt.show()
# Count-style features: class separation vs train/test drift.
features = ['total_length', 'num_words', 'num_sent',
'num_unique_words', 'num_words_title', 'num_uppercase',
'num_exclamation_marks' ]
compare_distplots(df_train,features)
# Punctuation/ratio features, same comparison.
features = ['num_question_marks',
'num_punctuation','num_symbols', 'num_digits',
'avg_word_len', 'avg_uppercase']
compare_distplots(df_train,features)
# Interpretation reminder for the distribution plots above.
note = """
NOTE:
The distribution must be different between label +ve and -ve
but must be similar between train and test sets.
"""
from wordcloud import STOPWORDS
def generate_ngrams(text, n_gram=1):
    """Return the n-grams of `text` as space-joined strings.

    The text is lower-cased and split on single spaces; empty tokens and
    wordcloud STOPWORDS are dropped before the n-grams are formed.
    """
    tokens = []
    for word in text.lower().split(' '):
        # '' can appear when the text contains consecutive spaces.
        if word != '' and word not in STOPWORDS:
            tokens.append(word)
    # zip n shifted copies of the token list to slide an n-wide window.
    shifted = [tokens[i:] for i in range(n_gram)]
    return [' '.join(gram) for gram in zip(*shifted)]
def get_ngram_dfs(df_train, NEG_TWEETS, col, n_gram=1):
    """Build per-class n-gram frequency tables from `df_train[col]`.

    Returns [df_neg_ngrams, df_pos_ngrams]; each DataFrame has column 0 =
    n-gram text and column 1 = count, ordered most-frequent first.
    """
    def _ngram_counts(texts):
        # Accumulate n-gram counts across all tweets in `texts`.
        counts = defaultdict(int)
        for tweet in texts:
            for gram in generate_ngrams(tweet, n_gram=n_gram):
                counts[gram] += 1
        return counts

    neg_counts = _ngram_counts(df_train[NEG_TWEETS][col])
    pos_counts = _ngram_counts(df_train[~NEG_TWEETS][col])

    def _to_frame(counts):
        # Sort ascending by count, then reverse -- kept exactly as the
        # original so that tie ordering is unchanged.
        ordered = sorted(counts.items(), key=lambda kv: kv[1])[::-1]
        return pd.DataFrame(ordered)

    return [_to_frame(neg_counts), _to_frame(pos_counts)]
def plot_neg_pos_ngrams(n_gram_name,
                        df_neg_ngrams, df_pos_ngrams, N=20):
    """Plot the top-N n-grams per class as side-by-side horizontal bars.

    `df_neg_ngrams` / `df_pos_ngrams` are frequency DataFrames from
    get_ngram_dfs (column 0 = n-gram, column 1 = count); negative tweets
    are drawn on the left in red, positive on the right in green.
    """
    fontsize = 25
    fig, axes = plt.subplots(ncols=2, figsize=(18, 20), dpi=100)
    plt.tight_layout()
    sns.barplot(y=df_neg_ngrams[0].values[:N], x=df_neg_ngrams[1].values[:N], ax=axes[0], color='red')
    sns.barplot(y=df_pos_ngrams[0].values[:N], x=df_pos_ngrams[1].values[:N], ax=axes[1], color='green')
    for ax in axes:
        ax.spines['right'].set_visible(False)
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.tick_params(axis='x', labelsize=fontsize)
        ax.tick_params(axis='y', labelsize=fontsize)
    axes[0].set_title(f'Top {N} most common {n_gram_name} in -Ve Tweets', fontsize=fontsize)
    axes[1].set_title(f'Top {N} most common {n_gram_name} in +Ve Tweets', fontsize=fontsize)
    plt.show()
# Top unigrams / bigrams / trigrams per class.
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=1)
# FIX: this n_gram=1 plot was mislabeled 'Bigrams'.
plot_neg_pos_ngrams('Unigrams',df1,df2,N=20)
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=2)
plot_neg_pos_ngrams('Bigrams',df1,df2,N=20)
df1, df2 = get_ngram_dfs(df_train,NEG_TWEETS,mce,n_gram=3)
plot_neg_pos_ngrams('Trigrams',df1,df2,N=20)
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent words in `corpus` as (word, count) pairs.

    Uses CountVectorizer with English stop words removed; with n=None the
    full vocabulary is returned, sorted by descending count.
    """
    vectorizer = CountVectorizer(stop_words = 'english').fit(corpus)
    # Column-wise sums of the document-term matrix = corpus-wide counts.
    counts = vectorizer.transform(corpus).sum(axis=0)
    freq = [(word, counts[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    freq.sort(key=operator.itemgetter(1), reverse=True)
    return freq[:n]
df1.head()
0 | 1 | |
---|---|---|
0 | aple pic twiter | 10 |
1 | fuck aple aple | 7 |
2 | hate aple retwet | 7 |
3 | aple suck aple | 7 |
4 | aple aple iphone | 6 |
# df1 currently holds the trigram table from the last get_ngram_dfs call
# (column 0 = n-gram, column 1 = count). Reverse the top 20 so the most
# frequent ends up on top, and plot via the plotly pandas backend.
df1.head(20)[::-1].plot.bar(x=1,y=0)
# Re-derive the class splits (same as earlier; label 0.0 = positive tweet).
df_pos = df[df[target]==0.0]
df_neg = df[df[target]==1.0]
df_pos.head(2)
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
# Top-20 unigrams per class via the CountVectorizer helper; reverse the
# rows so the most frequent word is drawn last (topmost bar).
pos_uni = get_top_n_words(df_pos[mce],20)
neg_uni = get_top_n_words(df_neg[mce],20)
df_pos_uni = pd.DataFrame(pos_uni,columns=['Word','Count'])[::-1]
df_neg_uni = pd.DataFrame(neg_uni,columns=['Word','Count'])[::-1]
# pandas' plotting backend is plotly (set at the top of the file),
# so .plot.bar returns a plotly Figure that we can restyle.
fig = df_pos_uni.plot.bar(x='Count',y='Word')
fig.update_traces(marker_color='green', opacity=0.6)
fig.update_layout(title_text='Most frequent +Ve **Unigrams** ')
fig.show()
fig = df_neg_uni.plot.bar(x='Count',y='Word')
fig.update_traces(marker_color='red', opacity=0.6)
fig.update_layout(title_text='Most frequent -Ve **Unigrams** ')
fig.show()