Ref:
BERT stands for Bidirectional Encoder Representations from Transformers
%%capture
# %%capture suppresses all output of this cell (including the print below)
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install -q ktrain
    !pip install -q git+https://github.com/amaiya/eli5@tfkeras_0_10_1
    !pip install -q neptune-client neptune-contrib
    !pip install -q scikit-plot
    ## print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in Google Colab, we need to restart the runtime.
import tensorflow as tf
import ktrain
from pprint import pprint
versions_dl = [(x.__name__,x.__version__) for x in [tf, ktrain]]
pprint(versions_dl)
[('tensorflow', '2.3.0'), ('ktrain', '0.21.2')]
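Fine-tuning BERT below is impractically slow on CPU, so it is worth confirming that TensorFlow can see a GPU before training. A quick check (standard TF 2.x call, not part of the original run):

# Sketch: confirm a GPU is visible to TensorFlow before fine-tuning BERT.
print(tf.config.list_physical_devices('GPU'))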
import neptune
from neptunecontrib.api import log_table
# use your real key and DELETE the cell
# neptune.init('bhishanpdl/twitter-sentiment-analysis','your_api_key')
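A safer pattern than pasting the key into a cell is to let neptune-client pick it up from the environment. A minimal sketch, assuming the token has been exported as NEPTUNE_API_TOKEN before the notebook starts:

# Sketch: initialize Neptune without hard-coding the API token in the notebook.
# neptune-client falls back to the NEPTUNE_API_TOKEN environment variable
# when api_token is not passed explicitly.
neptune.init('bhishanpdl/twitter-sentiment-analysis')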
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
import time
import sys
import re
from tqdm import tqdm
tqdm.pandas()
versions_ds = [(x.__name__,x.__version__) for x in [np,pd]]
pprint(versions_ds)
[('numpy', '1.18.5'), ('pandas', '1.0.5')]
from sklearn import metrics
df_eval = pd.DataFrame({
'Text Model': [],
'Params': [],
'Model': [],
'Description': [],
'Accuracy': [],
'Precision': [],
'Recall': [],
'F1': [],
})
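The rows of df_eval are appended after each evaluation further down; the same bookkeeping could be wrapped in a small helper like this (hypothetical convenience function, not used in the original cells):

# Hypothetical helper: compute the four metrics and append one row to df_eval.
def add_eval_row(df_eval, y_true, y_pred, text_model, params='', model='', desc=''):
    row = [text_model, str(params), model, desc,
           metrics.accuracy_score(y_true, y_pred),
           metrics.precision_score(y_true, y_pred),
           metrics.recall_score(y_true, y_pred),
           metrics.f1_score(y_true, y_pred, average='weighted')]
    df_eval.loc[len(df_eval)] = row
    return df_eval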
target = 'label'
maincol = 'tweet'
p = 'https://github.com/bhishanpdl/Datasets/blob/master/AV_Hackathons/sentiment_analysis/processed/'
df_combined = pd.read_csv(p + 'df_combined_clean.csv?raw=true')
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"train : {df_train.shape}")
print(f"test : {df_test.shape}")
display(df_train.head(2).append(df_train.tail(2)))
train : (7920, 24)
test  : (1953, 24)
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | ['fingerprint', 'pregnancy', 'test', 'android', 'aps', 'beautiful', 'cute', 'health', 'igers', 'iphoneonly', 'iphonesia', 'iphone'] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | ['fingerprint', 'pregnancy', 'test', 'android', 'aps', 'beautiful', 'cute', 'health', 'iger', 'iphone', 'iphones', 'iphone'] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | ['finaly', 'transparant', 'silicon', 'case', 'thanks', 'uncle', 'yay', 'sony', 'xperia', 'sonyexperias'] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | ['finaly', 'trans', 'paran', 'silicon', 'case', 'thanks', 'uncle', 'yay', 'sony', 'x', 'peri', 'sony', 'ex', 'peri'] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
7918 | 7918 | 7919 | 0.0 | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ | ['finaly', 'got', 'smart', 'pocket', 'wifi', 'stay', 'conected', 'anytimeanywhere', 'ipad', 'samsung', 'gadget'] | finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget | ['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#'] | #smart #pocket #wifi #ipad #samsung #s3 #gadget # | 133 | 16 | 1 | 16 | 1 | 5 | 1 | 0 | 3 | 0 | 0 | 7.375000 | 0.037594 | 1.0 | ['finaly', 'got', 'smart', 'pocket', 'wi', 'fi', 'stay', 'conected', 'anytime', 'anywhere', 'ipad', 'samsung', 'gadget'] | finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget |
7919 | 7919 | 7920 | 0.0 | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ | ['aple', 'barcelona', 'aple', 'store', 'bcn', 'barcelona', 'travel', 'iphone', 'selfie', 'fly', 'fun', 'cabincrew'] | aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew | ['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…'] | #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… | 129 | 13 | 1 | 13 | 5 | 12 | 3 | 0 | 2 | 0 | 0 | 9.000000 | 0.093023 | 1.0 | ['aple', 'barcelona', 'aple', 'store', 'n', 'barcelona', 'travel', 'iphone', 'self', 'ie', 'fly', 'fun', 'cabin', 'crew'] | aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew |
display(df_test.head(2).append(df_test.tail(2)))
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7920 | 0 | 7921 | NaN | I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks | ['hate', 'new', 'iphone', 'upgrade', 'wil', 'let', 'download', 'aps', 'ugh', 'aple', 'suck'] | hate new iphone upgrade wil let download aps ugh aple suck | ['#iphone', '#ugh', '#apple'] | #iphone #ugh #apple | 77 | 14 | 1 | 14 | 1 | 2 | 0 | 0 | 2 | 0 | 0 | 4.571429 | 0.025974 | 1.000000 | ['hate', 'new', 'iphone', 'upgrade', 'wil', 'let', 'download', 'aps', 'ugh', 'aple', 'suck'] | hate new iphone upgrade wil let download aps ugh aple suck |
7921 | 1 | 7922 | NaN | currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/ | ['curently', 'shiting', 'fucking', 'pant', 'aple', 'imac', 'cashmoney', 'radest', 'swagswag'] | curently shiting fucking pant aple imac cashmoney radest swagswag | ['#apple', '#iMac', '#cashmoney', '#raddest', '#swagswagswag'] | #apple #iMac #cashmoney #raddest #swagswagswag | 115 | 11 | 1 | 11 | 0 | 8 | 0 | 0 | 3 | 0 | 0 | 9.545455 | 0.069565 | 1.000000 | ['curently', 'shiting', 'fucking', 'pant', 'aple', 'imac', 'cash', 'money', 'rad', 'de', 'st', 'swag', 'wag', 'wag'] | curently shiting fucking pant aple imac cash money rad de st swag wag wag |
9871 | 1951 | 9872 | NaN | @codeofinterest as i said #Adobe big time we may well as include #apple to | ['codeofinterest', 'said', 'adobe', 'big', 'time', 'may', 'wel', 'include', 'aple'] | codeofinterest said adobe big time may wel include aple | ['#Adobe', '#apple'] | #Adobe #apple | 74 | 14 | 1 | 13 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 4.357143 | 0.013514 | 0.928571 | ['code', 'interest', 'said', 'adobe', 'big', 'time', 'may', 'wel', 'include', 'aple'] | code interest said adobe big time may wel include aple |
9872 | 1952 | 9873 | NaN | Finally I got it .. thanx my father .. #Samsung #galaxy #s3 #gift #father #phone #new http://instagr.am/p/NoxkiPE | ['finaly', 'got', 'thanx', 'father', 'samsung', 'galaxy', 'gift', 'father', 'phone', 'new'] | finaly got thanx father samsung galaxy gift father phone new | ['#Samsung', '#galaxy', '#s3', '#gift', '#father', '#phone', '#new'] | #Samsung #galaxy #s3 #gift #father #phone #new | 113 | 17 | 1 | 16 | 3 | 6 | 0 | 0 | 6 | 0 | 0 | 5.705882 | 0.053097 | 0.941176 | ['finaly', 'got', 'x', 'father', 'samsung', 'galaxy', 'gift', 'father', 'phone', 'new'] | finaly got x father samsung galaxy gift father phone new |
df_train['tweet_clean'].apply(len).min()
4
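The target is a binary 0/1 label, and sentiment datasets like this are often imbalanced, which matters when interpreting the precision/recall numbers later. A quick check (a sketch using the columns defined above):

# Sketch: inspect the class balance of the target before training.
print(df_train[target].value_counts(normalize=True))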
neptune.create_experiment(
name='ktrain', # keep the experiment name short
description='',
tags = ['ktrain', 'bert'],
upload_source_files=None
)
neptune.log_text('versions_dl', str(versions_dl))
neptune.log_text('versions_ds', str(versions_ds))
neptune.log_text('text processing', 'various maxlen and epochs')
maincol = 'tweet_clean' # best so far: 'tweet'
neptune.log_text('column_used', maincol)
https://ui.neptune.ai/bhishanpdl/twitter-sentiment-analysis/e/TWITSENT-15
%%time
time_start = time.time()
# 'bert' uses the uncased_L-12_H-768_A-12 checkpoint
# 'distilbert' preprocess_mode gave an error in this setup
# Parameters
MODEL_NAME = 'bert'
PARAMS = dict()
PARAMS['ngram_range'] = 1
PARAMS['max_features'] = 20000
PARAMS['maxlen'] = 300
neptune.log_text('model_name', MODEL_NAME)
for k,v in PARAMS.items():
neptune.log_metric(k,v)
(X_train, y_train), (X_valid, y_valid), preproc = \
ktrain.text.texts_from_df(df_train,
text_column = maincol,
label_columns = [target],
random_state = SEED,
ngram_range = PARAMS['ngram_range'] ,
max_features = PARAMS['max_features'],
val_df = None, # if None, 10% of train is held out for validation
maxlen = PARAMS['maxlen'], # it was 500
preprocess_mode = MODEL_NAME)
model = ktrain.text.text_classifier(name=MODEL_NAME,
train_data=(X_train, y_train),
metrics=['accuracy'],
preproc=preproc)
learner = ktrain.get_learner(model=model,
train_data=(X_train, y_train),
val_data=(X_valid, y_valid),
batch_size=6)
PARAMS_ONECYCLE = {
'lr' : 2e-5, # original value is 2e-5
'epochs': 5 # best so far was 5
}
for k,v in PARAMS_ONECYCLE.items():
neptune.log_metric(k,v)
learner.fit_onecycle(**PARAMS_ONECYCLE )
time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
neptune.log_metric('time_taken_min', m)
preprocessing train... language: en
Is Multi-Label? False
preprocessing test... language: en
Is Multi-Label? False
maxlen is 300
done.

begin training using onecycle policy with max lr of 2e-05...
Epoch 1/5
1188/1188 [==============================] - 797s 671ms/step - loss: 0.3380 - accuracy: 0.8450 - val_loss: 0.2213 - val_accuracy: 0.9053
Epoch 2/5
1188/1188 [==============================] - 790s 665ms/step - loss: 0.2325 - accuracy: 0.9003 - val_loss: 0.2186 - val_accuracy: 0.9091
Epoch 3/5
1188/1188 [==============================] - 787s 662ms/step - loss: 0.1784 - accuracy: 0.9324 - val_loss: 0.2130 - val_accuracy: 0.9179
Epoch 4/5
1188/1188 [==============================] - 785s 661ms/step - loss: 0.0885 - accuracy: 0.9689 - val_loss: 0.2753 - val_accuracy: 0.9116
Epoch 5/5
1188/1188 [==============================] - 784s 660ms/step - loss: 0.0288 - accuracy: 0.9907 - val_loss: 0.4086 - val_accuracy: 0.9104

CPU times: user 58min 50s, sys: 12min 28s, total: 1h 11min 19s
Wall time: 1h 6min 7s
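The validation loss is lowest after epoch 3 (0.2130) and then climbs while training accuracy approaches 99%, i.e. the last two epochs overfit. One option is to keep per-epoch checkpoints and reload the best one; a sketch assuming ktrain's checkpoint_folder argument to fit_onecycle (the weight filename pattern is an assumption):

# Sketch: write per-epoch checkpoints so the best epoch can be restored afterwards.
learner.fit_onecycle(PARAMS_ONECYCLE['lr'], PARAMS_ONECYCLE['epochs'],
                     checkpoint_folder='checkpoints')
# assumed filename pattern; pick the epoch with the lowest val_loss (epoch 3 here)
learner.model.load_weights('checkpoints/weights-03.hdf5')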
%%time
from neptunecontrib.api import log_table
time_start = time.time()
valid_probs2d = learner.predict(val_data=(X_valid,y_valid))
valid_preds = (valid_probs2d[:,1]>0.5).astype(int).tolist()
lst_y_valid = (y_valid[:,1].tolist())
text_model_name = "bert"
params = str(PARAMS)
model_name = ""
desc = ""
yvd = lst_y_valid
vd_preds = valid_preds
acc = metrics.accuracy_score(yvd,vd_preds)
pre = metrics.precision_score(yvd,vd_preds)
rec = metrics.recall_score(yvd,vd_preds)
f1 = metrics.f1_score(yvd,vd_preds,average='weighted')
row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
# log the validation results
log_table('df_eval', df_eval)
dic_results = {'acc':acc,
'precision':pre,
'recall':rec,
'f1': f1}
for k,v in dic_results.items():
print('valid_'+k, v)
neptune.log_metric('valid_'+k, v)
time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
neptune.log_metric('time_taken_validation_min', m)
display(df_eval)
valid_acc 0.9103535353535354
valid_precision 0.7904761904761904
valid_recall 0.8601036269430051
valid_f1 0.9115991300375775
 | Text Model | Params | Model | Description | Accuracy | Precision | Recall | F1 |
---|---|---|---|---|---|---|---|---|
0 | bert | {'lr': 2e-05, 'epochs': 5} |  |  | 0.916667 | 0.812808 | 0.854922 | 0.917368 |
1 | bert | {'ngram_range': 1, 'max_features': 20000, 'maxlen': 300} |  |  | 0.910354 | 0.790476 | 0.860104 | 0.911599 |
CPU times: user 5.43 s, sys: 831 ms, total: 6.26 s Wall time: 29.2 s
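Beyond the scalar metrics, ktrain can also print a per-class report and confusion matrix for the validation split; a minimal sketch using learner.validate (assuming it accepts val_data and class_names as in the ktrain docs):

# Sketch: per-class precision/recall plus a confusion matrix on the validation set.
cm = learner.validate(val_data=(X_valid, y_valid), class_names=preproc.get_classes())
print(cm)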
%%time
predictor = ktrain.get_predictor(learner.model, preproc)
X_test = df_test[maincol].to_numpy()
test_preds = predictor.predict(X_test,return_proba=False)
df_test[target] = test_preds
df_sub = df_test[['id','label']].copy()
# ktrain returns string class names; map them back to 0/1 for the submission file
df_sub['label'] = df_sub['label'].replace({'not_label': 0, 'label': 1})
df_sub.to_csv('sub_ktrain_tweet_clean_bert_epochs5_maxlen300.csv', index=False)
CPU times: user 5.04 s, sys: 2.11 s, total: 7.15 s Wall time: 51.8 s
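Fine-tuning took over an hour, so it is worth persisting the predictor and reloading it for later inference instead of retraining; a sketch using ktrain's predictor save/load (the directory name is illustrative):

# Sketch: save the fine-tuned predictor (model + preprocessing) and reload it later.
predictor.save('bert_tweet_predictor')
reloaded = ktrain.load_predictor('bert_tweet_predictor')
print(reloaded.predict(['I hate the new update']))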
# upload the submission file to the hackathon leaderboard and log the returned test F1 score
neptune.log_metric('test_f1', 0.877973006703751 )
best_so_far = """
bert lr=2e-5 epochs=5 ngram_range=1 maxlen=300
f1 = 0.908687336005899
n_gram=2 gave worse result
tweet_clean_emoji gave worse result
bert lr=2e-5 epochs=5 ngram_range=1 maxlen=400
f1 = 0.908265806079951
bert lr=2e-5 epochs=5 ngram_range=1 maxlen=300 maincol=tweet_clean
f1=0.877973006703751
"""
neptune.log_text('best_so_far',
best_so_far)