import time
# Wall-clock start of the notebook; the last cell subtracts this from the
# current time to report the total runtime.
time_start_notebook = time.time()
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
## install modules
!pip install scattertext
## print
print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
import numpy as np
import pandas as pd
import swifter
import scattertext as st
# versions
import watermark
%load_ext watermark
# Author plus interpreter and machine details of the current session.
%watermark -a "Bhishan Poudel" -d -v -m
print()
# Versions of the modules imported so far in this notebook.
%watermark -iv
Author: Bhishan Poudel Python implementation: CPython Python version : 3.7.9 IPython version : 7.19.0 Compiler : Clang 10.0.0 OS : Darwin Release : 19.6.0 Machine : x86_64 Processor : i386 CPU cores : 4 Architecture: 64bit autopep8 : 1.5.4 pandas : 1.1.4 scattertext: 0.0.2.75 json : 2.0.9 numpy : 1.19.4 swifter : 1.0.6 sys : 3.7.9 (default, Aug 31 2020, 07:22:35) [Clang 10.0.0 ] watermark : 2.1.0
def show_methods(obj, ncols=4, contains=None):
    """Return the public attribute names of *obj* laid out as a table.

    Names come from ``dir(obj)``; anything starting with an underscore is
    dropped. The remaining names are split into ``ncols`` nearly equal
    chunks, one chunk per column, filled top to bottom.

    Parameters
    ----------
    obj : object
        Object whose attributes are listed.
    ncols : int, default 4
        Number of columns in the returned table.
    contains : str, optional
        When given, keep only names that contain this substring
        (case-sensitive).

    Returns
    -------
    pandas.DataFrame
        Table of attribute names; cells of shorter columns are padded
        with empty strings.
    """
    names = [name for name in dir(obj) if not name.startswith('_')]
    if contains is not None:
        names = [name for name in names if contains in name]

    # Each chunk becomes one row here; transposing turns chunks into columns,
    # and the gaps left by unequal chunk lengths are blanked out.
    chunks = np.array_split(names, ncols)
    return pd.DataFrame(chunks).T.fillna('')
target_name = 'label'
mc = 'tweet_clean'  # main column cleaned

# Base URL of the processed dataset; '?raw=true' is appended to the file name
# so that the CSV itself is requested.
p = 'https://github.com/bhishanpdl/Datasets/blob/master/AV_Hackathons/sentiment_analysis/processed/'
df = pd.read_csv(p + 'df_combined_clean.csv?raw=true',
                 usecols=['id', target_name, mc])

# Keep only labelled rows. The explicit copy makes the column assignment below
# operate on an independent frame instead of a slice of the original
# (avoids SettingWithCopyWarning).
df = df[df[target_name].notnull()].copy()
df[target_name] = df[target_name].map({1.0: 'positive', 0.0: 'negative'})

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same
# head-plus-tail preview on every pandas version.
display(pd.concat([df.head(2), df.tail(2)]))
id | label | tweet_clean | |
---|---|---|---|
0 | 1 | negative | fingerprint pregnancy test android aps beautif... |
1 | 2 | negative | finaly transparant silicon case thanks uncle y... |
7918 | 7919 | negative | finaly got smart pocket wifi stay conected any... |
7919 | 7920 | negative | aple barcelona aple store bcn barcelona travel... |
# Column that holds the parsed form of each cleaned tweet.
parsed_col = 'parse'
df[parsed_col] = df[mc].swifter.apply(st.whitespace_nlp_with_sentences)

# Preview first and last rows. DataFrame.append was removed in pandas 2.0,
# so the two slices are joined with pd.concat instead.
pd.concat([df.head(2), df.tail(2)])
id | label | tweet_clean | parse | |
---|---|---|---|---|
0 | 1 | negative | fingerprint pregnancy test android aps beautif... | (fingerprint, pregnancy, test, android, aps, b... |
1 | 2 | negative | finaly transparant silicon case thanks uncle y... | (finaly, transparant, silicon, case, thanks, u... |
7918 | 7919 | negative | finaly got smart pocket wifi stay conected any... | (finaly, got, smart, pocket, wifi, stay, conec... |
7919 | 7920 | negative | aple barcelona aple store bcn barcelona travel... | (aple, barcelona, aple, store, bcn, barcelona,... |
# Build the scattertext corpus from the parsed tweets in one fluent chain:
# categories come from `target_name`, documents from `parsed_col`.
corpus = (
    st.CorpusFromParsedDocuments(
        df,
        category_col=target_name,
        parsed_col=parsed_col,
    )
    .build()
    .get_unigram_corpus()
    # reduce size
    .compact(st.AssociationCompactor(2000))
)
# Render the interactive explorer for the 'negative' category versus the rest;
# each document is labelled with its tweet id.
html = st.produce_scattertext_explorer(
    corpus,
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['id'],
    transform=st.Scalers.dense_rank
)

# Write as UTF-8 explicitly: without `encoding`, open() uses the platform
# default, which can fail on (or mangle) non-ASCII characters in the page.
with open('../outputs/tweets_sentiment.html', 'w', encoding='utf-8') as fo:
    fo.write(html)
First, download spaCy's English language model (run this once from a terminal):
/Users/poudel/opt/miniconda3/envs/nlp/bin/python -m spacy download en
# Preview the first two rows before re-parsing with spaCy.
df.head(2)
id | label | tweet_clean | parse | |
---|---|---|---|---|
0 | 1 | negative | fingerprint pregnancy test android aps beautif... | (fingerprint, pregnancy, test, android, aps, b... |
1 | 2 | negative | finaly transparant silicon case thanks uncle y... | (finaly, transparant, silicon, case, thanks, u... |
import spacy

# Load the English pipeline by its package name. The 'en' shortcut link only
# exists in spaCy v2 (it was removed in v3); 'en_core_web_sm' is the package
# that `spacy download en` installs, so this name loads on both versions.
nlp = spacy.load('en_core_web_sm')
# Alternative without swifter, kept for reference:
## adds (or replaces) a column called parse
# df = df.assign(parse=lambda row: row[mc].apply(nlp))
## Measured wall time: 48 s.
## It is fast, but it does not show a progress bar.
%%time
# Run the spaCy pipeline on every cleaned tweet via swifter and store the
# result in the 'parse' column.
df['parse'] = df[mc].swifter.apply(nlp)
CPU times: user 52 s, sys: 595 ms, total: 52.6 s Wall time: 54.9 s
# Preview the first two rows after the spaCy parse.
df.head(2)
id | label | tweet_clean | parse | |
---|---|---|---|---|
0 | 1 | negative | fingerprint pregnancy test android aps beautif... | (fingerprint, pregnancy, test, android, aps, b... |
1 | 2 | negative | finaly transparant silicon case thanks uncle y... | (finaly, transparant, silicon, case, thanks, u... |
# Inspect the parsed value stored for the row labelled 0.
df.loc[0,'parse']
fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone
%%time
corpus = st.CorpusFromParsedDocuments(
df,
category_col=target_name,
parsed_col='parse',
feats_from_spacy_doc=st.PyTextRankPhrases()
).build(
).compact(
st.AssociationCompactor(2000, use_non_text_features=True)
)
/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages/scattertext/termscoring/ScaledFScore.py:129: RuntimeWarning: invalid value encountered in true_divide precision = (cat_word_counts * 1. / (cat_word_counts + not_cat_word_counts)) /Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages/scattertext/termscoring/ScaledFScore.py:129: RuntimeWarning: invalid value encountered in true_divide precision = (cat_word_counts * 1. / (cat_word_counts + not_cat_word_counts))
CPU times: user 13.3 s, sys: 85.4 ms, total: 13.4 s Wall time: 13.7 s
# One row per metadata term, one column per category.
# NOTE(review): the '' argument presumably suppresses a suffix on the column
# labels (the displayed columns are plain 'negative' / 'positive') — confirm.
df_term_category_scores = corpus.get_metadata_freq_df('')
df_term_category_scores.head(10)
negative | positive | |
---|---|---|
term | ||
iphonesia | 8.928064 | 0.154059 |
love | 5.211916 | 0.831196 |
way | 0.449481 | 0.128470 |
phone big time | 0.000000 | 0.251077 |
new year | 4.943526 | 0.000000 |
sun instagod boy | 0.403745 | 0.000000 |
picoftheday | 14.308220 | 0.000000 |
new ipod | 0.000000 | 1.126512 |
saturday | 5.385685 | 0.126068 |
phone case | 0.867800 | 0.000000 |
# Get the rank of each term in each category (1 = highest score).
# Sorting the negated scores gives descending order; the argsort of that
# ordering gives every term's zero-based position, hence the + 1.
descending_order = np.argsort(-df_term_category_scores, axis=0)
term_ranks = np.argsort(descending_order, axis=0) + 1

# Text displayed when a term is clicked: one "<category> rank/total" line per
# category, each preceded by an HTML line break.
metadata_descriptions = {}
for term in corpus.get_metadata():
    rank_lines = [
        '<b>%s</b> TextRank score rank: %s/%s' % (cat, term_ranks.loc[term, cat], corpus.get_num_metadata())
        for cat in corpus.get_categories()
    ]
    metadata_descriptions[term] = '<br/>' + '<br/>'.join(rank_lines)


def _signed_prominence(row):
    """Return the larger category score, negated when 'positive' wins or ties."""
    if row.negative > row.positive:
        return row.negative
    return -row.positive


# Signed score per term: positive values lean 'negative', negative values
# lean 'positive'.
category_specific_prominence = df_term_category_scores.apply(
    _signed_prominence,
    axis=1,
)
# Render the explorer over the phrase (non-text) features. Points are scored
# with the signed prominence computed above, and every term is registered as
# its own single-term topic so that clicking it shows `metadata_descriptions`.
html = st.produce_scattertext_explorer(
    corpus,
    category='negative',
    not_category_name='positive',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    transform=st.dense_rank,
    metadata=corpus.get_df()['id'],
    scores=category_specific_prominence,
    sort_by_dist=False,
    use_non_text_features=True,
    topic_model_term_lists={term: [term] for term in corpus.get_metadata()},
    topic_model_preview_size=0,
    metadata_descriptions=metadata_descriptions,
    use_full_doc=True
)

# Write as UTF-8 explicitly: without `encoding`, open() uses the platform
# default, which can fail on (or mangle) non-ASCII characters in the page.
with open('../outputs/tweets_sentiment_textrank.html', 'w', encoding='utf-8') as fo:
    fo.write(html)
# Total wall-clock time since the first cell ran.
time_taken = time.time() - time_start_notebook

# Round to whole seconds once, then split with integer arithmetic. Formatting
# each float component with '{:.0f}' rounds them independently and can print
# impossible values such as "1 min 60 secs" (e.g. for 119.7 s).
hours, remainder = divmod(int(round(time_taken)), 60 * 60)
minutes, seconds = divmod(remainder, 60)
print('Time taken to run whole notebook: {:d} hr '
      '{:d} min {:d} secs'.format(hours, minutes, seconds))
Time taken to run whole notebook: 0 hr 1 min 18 secs