import time

# Wall-clock start of the notebook; the last cell subtracts this from
# time.time() to report the total run time.
time_start_notebook = time.time()


%%capture
# %%capture suppresses this cell's output in the notebook
# (the magic has to stay on the first line of the cell).

import os
import sys
# True when the google.colab module is already loaded in this interpreter.
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules (scattertext is imported further down)
    !pip install scattertext

    ## print (captured by %%capture above, so it is not displayed)
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.


import numpy as np
import pandas as pd
import swifter

import scattertext as st

# versions
import watermark
%load_ext watermark
# author, date, Python/IPython versions and machine information
%watermark -a "Bhishan Poudel" -d -v -m
print()
# versions of the modules imported in this notebook
%watermark -iv

Author: Bhishan Poudel

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.19.0

Compiler    : Clang 10.0.0 
OS          : Darwin
Release     : 19.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit


autopep8   : 1.5.4
pandas     : 1.1.4
scattertext: 0.0.2.75
json       : 2.0.9
numpy      : 1.19.4
swifter    : 1.0.6
sys        : 3.7.9 (default, Aug 31 2020, 07:22:35) 
[Clang 10.0.0 ]
watermark  : 2.1.0


def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Parameters
    ----------
    obj : object
        Anything accepted by ``dir()``.
    ncols : int, default 4
        Number of columns in the returned table.
    contains : str, optional
        When given, keep only the names that include this substring.

    Returns
    -------
    pandas.DataFrame
        Names from ``dir(obj)`` that do not start with an underscore,
        laid out column by column. When the names do not divide evenly,
        the leading columns are one row longer and the shorter columns
        are padded with empty strings.
    """
    names = []
    for name in dir(obj):
        if name[0] == '_':
            continue
        if contains is not None and contains not in name:
            continue
        names.append(name)

    # Each chunk becomes one row here, hence the transpose to get columns.
    chunks = np.array_split(names, ncols)
    table = pd.DataFrame(chunks).transpose()
    return table.fillna('')


target_name = 'label'
mc = 'tweet_clean' # main column cleaned

# Read only the id, target and cleaned-text columns from the processed CSV.
p = 'https://github.com/bhishanpdl/Datasets/blob/master/AV_Hackathons/sentiment_analysis/processed/'
df = pd.read_csv(p + 'df_combined_clean.csv?raw=true',
                 usecols=['id', target_name,mc])

# Keep only rows that have a label. The .copy() makes the filtered frame
# independent, so the assignment below does not raise SettingWithCopyWarning.
df = df[df[target_name].notnull()].copy()

# Replace the numeric labels with category names.
# NOTE(review): confirm against the dataset's label definition that 1.0
# really means 'positive'.
df[target_name] = df[target_name].map({1.0: 'positive',0.0: 'negative'})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same first-two / last-two preview.
display(pd.concat([df.head(2), df.tail(2)]))


parsed_col = 'parse'
# Parse each cleaned tweet with scattertext's whitespace parser, applied
# through the swifter accessor.
df[parsed_col] = df[mc].swifter.apply(st.whitespace_nlp_with_sentences)


# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same first-two / last-two preview.
pd.concat([df.head(2), df.tail(2)])


# Build the corpus from the pre-parsed tweets (categories come from the
# target column), keep the unigram corpus, then reduce size with
# AssociationCompactor(2000).
corpus = (
    st.CorpusFromParsedDocuments(
        df,
        category_col=target_name,
        parsed_col=parsed_col,
    )
    .build()
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)


# Options for the negative-vs-positive explorer; 'negative' is the
# category being contrasted against everything else.
explorer_options = dict(
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['id'],
    transform=st.Scalers.dense_rank,
)

# The returned value is an HTML string (it is written to disk next).
html = st.produce_scattertext_explorer(corpus, **explorer_options)


# Write the explorer page. The encoding is set explicitly so that non-ASCII
# characters in the HTML are written the same way on every platform instead
# of depending on the locale's default encoding.
with open('../outputs/tweets_sentiment.html', 'w', encoding='utf-8') as fo:
    fo.write(html)


# Preview the first two rows (the frame now carries the parse column).
df.head(2)


import spacy


# 'en' is a spaCy v2 shortcut link; spaCy v3 removed shortcuts and raises
# OSError for it. Try the original name first so existing v2 setups behave
# exactly as before, then fall back to the full package name.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')


# %%time

## add new column (or replace) called parse
# df = df.assign(parse=lambda row: row[mc].apply(nlp))

## Wall time: 48 s
## This is fast, but it does not show a progress bar.


%%time

# Run the spaCy pipeline on every cleaned tweet; this overwrites the
# 'parse' column produced earlier by the whitespace parser.
df['parse'] = df[mc].swifter.apply(nlp)

CPU times: user 52 s, sys: 595 ms, total: 52.6 s
Wall time: 54.9 s


# Preview the first two rows after re-parsing.
df.head(2)


# Inspect the parsed value stored for the row labelled 0.
df.loc[0,'parse']

fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone


%%time
# Rebuild the corpus from the spaCy-parsed column, this time passing
# st.PyTextRankPhrases() as the feature extractor for each parsed doc.
# As with the first corpus, it is then reduced with AssociationCompactor(2000),
# here with use_non_text_features=True.
# NOTE(review): presumably the phrases end up as the corpus "metadata"
# features queried in the following cells - confirm against scattertext docs.
corpus = st.CorpusFromParsedDocuments(
    df,
    category_col=target_name,
    parsed_col='parse',
    feats_from_spacy_doc=st.PyTextRankPhrases()
).build(
).compact(
    st.AssociationCompactor(2000, use_non_text_features=True)
)

/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages/scattertext/termscoring/ScaledFScore.py:129: RuntimeWarning: invalid value encountered in true_divide
  precision = (cat_word_counts * 1. / (cat_word_counts + not_cat_word_counts))
/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages/scattertext/termscoring/ScaledFScore.py:129: RuntimeWarning: invalid value encountered in true_divide
  precision = (cat_word_counts * 1. / (cat_word_counts + not_cat_word_counts))

CPU times: user 13.3 s, sys: 85.4 ms, total: 13.4 s
Wall time: 13.7 s


# One row per metadata term, one column per category.
# NOTE(review): the '' argument is presumably the suffix appended to each
# category column name; the displayed columns are plain 'negative'/'positive'.
df_term_category_scores = corpus.get_metadata_freq_df('')
df_term_category_scores.head(10)


# Get the rank of each term in each category.
# Negating the scores makes the largest score sort first; the argsort of an
# argsort turns sort order into per-term positions, and +1 makes them 1-based.
term_ranks = np.argsort(np.argsort(-df_term_category_scores, axis=0), axis=0) + 1

# These two values are the same for every term, so fetch them once instead
# of once per (term, category) pair inside the comprehension.
categories = corpus.get_categories()
num_metadata = corpus.get_num_metadata()

# Text displayed when a term is clicked: one "<category> rank/total" line
# per category, separated by HTML line breaks.
metadata_descriptions = {
    term: '<br/>' + '<br/>'.join(
        '<b>%s</b> TextRank score rank: %s/%s' % (cat, term_ranks.loc[term, cat], num_metadata)
        for cat in categories)
    for term in corpus.get_metadata()
}


def _signed_prominence(row):
    """Return the negative score when it is larger, else minus the positive score."""
    if row.negative > row.positive:
        return row.negative
    return -row.positive


# One signed value per term: above zero when the negative category dominates,
# zero or below otherwise.
category_specific_prominence = df_term_category_scores.apply(
    _signed_prominence, axis=1
)


# Every metadata term is registered as its own single-item term list.
single_term_topics = {term: [term] for term in corpus.get_metadata()}
# Tweet ids, passed as the per-document metadata.
doc_ids = corpus.get_df()['id']

# Explorer over the non-text (phrase) features, scored by the signed
# per-category prominence computed earlier.
html = st.produce_scattertext_explorer(
    corpus,
    category='negative',
    not_category_name='positive',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    transform=st.dense_rank,
    metadata=doc_ids,
    scores=category_specific_prominence,
    sort_by_dist=False,
    use_non_text_features=True,
    topic_model_term_lists=single_term_topics,
    topic_model_preview_size=0,
    metadata_descriptions=metadata_descriptions,
    use_full_doc=True,
)


# Write the explorer page. The encoding is set explicitly so that non-ASCII
# characters in the HTML are written the same way on every platform instead
# of depending on the locale's default encoding.
with open('../outputs/tweets_sentiment_textrank.html', 'w', encoding='utf-8') as fo:
    fo.write(html)


# Report the wall-clock time elapsed since the notebook's first cell.
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 3600)  # whole hours, leftover seconds
mins, secs = divmod(m, 60)       # whole minutes, leftover seconds
print(f'Time taken to run whole notebook: {h:.0f} hr {mins:.0f} min {secs:.0f} secs')

Time taken to run whole notebook: 0 hr 1 min 18 secs

	id	label	tweet_clean
0	1	negative	fingerprint pregnancy test android aps beautif...
1	2	negative	finaly transparant silicon case thanks uncle y...
7918	7919	negative	finaly got smart pocket wifi stay conected any...
7919	7920	negative	aple barcelona aple store bcn barcelona travel...

	id	label	tweet_clean	parse
0	1	negative	fingerprint pregnancy test android aps beautif...	(fingerprint, pregnancy, test, android, aps, b...
1	2	negative	finaly transparant silicon case thanks uncle y...	(finaly, transparant, silicon, case, thanks, u...
7918	7919	negative	finaly got smart pocket wifi stay conected any...	(finaly, got, smart, pocket, wifi, stay, conec...
7919	7920	negative	aple barcelona aple store bcn barcelona travel...	(aple, barcelona, aple, store, bcn, barcelona,...

	negative	positive
term
iphonesia	8.928064	0.154059
love	5.211916	0.831196
way	0.449481	0.128470
phone big time	0.000000	0.251077
new year	4.943526	0.000000
sun instagod boy	0.403745	0.000000
picoftheday	14.308220	0.000000
new ipod	0.000000	1.126512
saturday	5.385685	0.126068
phone case	0.867800	0.000000

Table of Contents

Descriptions¶

Google Colab¶

Load the libraries¶

Useful Functions¶

Data Processing¶

Load the data¶

parse the text: whitespace nlp with sentence¶

Create corpus¶

Positive and Negative Tweets¶

Visualizing Phrase associations¶

Time Taken¶