Ref:
BERT stands for Bidirectional Encoder Representations from Transformers
%%capture
# %%capture suppresses all output of this cell (including the print below)
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install -q ktrain
    !pip install -q git+https://github.com/amaiya/eli5@tfkeras_0_10_1
    !pip install -q neptune-client neptune-contrib
    !pip install -q scikit-plot
    ## print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in Google Colab, we need to restart the runtime.
import tensorflow as tf
import ktrain
from pprint import pprint
versions_dl = [(x.__name__,x.__version__) for x in [tf, ktrain]]
pprint(versions_dl)
[('tensorflow', '2.3.0'), ('ktrain', '0.21.2')]
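Fine-tuning BERT below is impractically slow on CPU, so it is worth confirming that TensorFlow can see a GPU before training. A quick check (standard TF 2.x call, not part of the original run):

# Sketch: confirm a GPU is visible to TensorFlow before fine-tuning BERT.
print(tf.config.list_physical_devices('GPU'))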
import neptune
from neptunecontrib.api import log_table
# use your real key and DELETE the cell
# neptune.init('bhishanpdl/twitter-sentiment-analysis','your_api_key')
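A safer pattern than pasting the key into a cell is to let neptune-client pick it up from the environment. A minimal sketch, assuming the token has been exported as NEPTUNE_API_TOKEN before the notebook starts:

# Sketch: initialize Neptune without hard-coding the API token in the notebook.
# neptune-client falls back to the NEPTUNE_API_TOKEN environment variable
# when api_token is not passed explicitly.
neptune.init('bhishanpdl/twitter-sentiment-analysis')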
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
import time
import sys
import re
from tqdm import tqdm
tqdm.pandas()
versions_ds = [(x.__name__,x.__version__) for x in [np,pd]]
pprint(versions_ds)
[('numpy', '1.18.5'), ('pandas', '1.0.5')]
from sklearn import metrics
df_eval = pd.DataFrame({
'Text Model': [],
'Params': [],
'Model': [],
'Description': [],
'Accuracy': [],
'Precision': [],
'Recall': [],
'F1': [],
})
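The rows of df_eval are appended after each evaluation further down; the same bookkeeping could be wrapped in a small helper like this (hypothetical convenience function, not used in the original cells):

# Hypothetical helper: compute the four metrics and append one row to df_eval.
def add_eval_row(df_eval, y_true, y_pred, text_model, params='', model='', desc=''):
    row = [text_model, str(params), model, desc,
           metrics.accuracy_score(y_true, y_pred),
           metrics.precision_score(y_true, y_pred),
           metrics.recall_score(y_true, y_pred),
           metrics.f1_score(y_true, y_pred, average='weighted')]
    df_eval.loc[len(df_eval)] = row
    return df_eval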
target = 'label'
maincol = 'tweet'
p = 'https://github.com/bhishanpdl/Datasets/blob/master/AV_Hackathons/sentiment_analysis/processed/'
df_combined = pd.read_csv(p + 'df_combined_clean.csv?raw=true')
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"train : {df_train.shape}")
print(f"test : {df_test.shape}")
display(df_train.head(2).append(df_train.tail(2)))
train : (7920, 24)
test  : (1953, 24)
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | ['fingerprint', 'pregnancy', 'test', 'android', 'aps', 'beautiful', 'cute', 'health', 'igers', 'iphoneonly', 'iphonesia', 'iphone'] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | ['fingerprint', 'pregnancy', 'test', 'android', 'aps', 'beautiful', 'cute', 'health', 'iger', 'iphone', 'iphones', 'iphone'] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | ['finaly', 'transparant', 'silicon', 'case', 'thanks', 'uncle', 'yay', 'sony', 'xperia', 'sonyexperias'] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | ['finaly', 'trans', 'paran', 'silicon', 'case', 'thanks', 'uncle', 'yay', 'sony', 'x', 'peri', 'sony', 'ex', 'peri'] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
7918 | 7918 | 7919 | 0.0 | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ | ['finaly', 'got', 'smart', 'pocket', 'wifi', 'stay', 'conected', 'anytimeanywhere', 'ipad', 'samsung', 'gadget'] | finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget | ['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#'] | #smart #pocket #wifi #ipad #samsung #s3 #gadget # | 133 | 16 | 1 | 16 | 1 | 5 | 1 | 0 | 3 | 0 | 0 | 7.375000 | 0.037594 | 1.0 | ['finaly', 'got', 'smart', 'pocket', 'wi', 'fi', 'stay', 'conected', 'anytime', 'anywhere', 'ipad', 'samsung', 'gadget'] | finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget |
7919 | 7919 | 7920 | 0.0 | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ | ['aple', 'barcelona', 'aple', 'store', 'bcn', 'barcelona', 'travel', 'iphone', 'selfie', 'fly', 'fun', 'cabincrew'] | aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew | ['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…'] | #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… | 129 | 13 | 1 | 13 | 5 | 12 | 3 | 0 | 2 | 0 | 0 | 9.000000 | 0.093023 | 1.0 | ['aple', 'barcelona', 'aple', 'store', 'n', 'barcelona', 'travel', 'iphone', 'self', 'ie', 'fly', 'fun', 'cabin', 'crew'] | aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew |
display(df_test.head(2).append(df_test.tail(2)))
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7920 | 0 | 7921 | NaN | I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks | ['hate', 'new', 'iphone', 'upgrade', 'wil', 'let', 'download', 'aps', 'ugh', 'aple', 'suck'] | hate new iphone upgrade wil let download aps ugh aple suck | ['#iphone', '#ugh', '#apple'] | #iphone #ugh #apple | 77 | 14 | 1 | 14 | 1 | 2 | 0 | 0 | 2 | 0 | 0 | 4.571429 | 0.025974 | 1.000000 | ['hate', 'new', 'iphone', 'upgrade', 'wil', 'let', 'download', 'aps', 'ugh', 'aple', 'suck'] | hate new iphone upgrade wil let download aps ugh aple suck |
7921 | 1 | 7922 | NaN | currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/ | ['curently', 'shiting', 'fucking', 'pant', 'aple', 'imac', 'cashmoney', 'radest', 'swagswag'] | curently shiting fucking pant aple imac cashmoney radest swagswag | ['#apple', '#iMac', '#cashmoney', '#raddest', '#swagswagswag'] | #apple #iMac #cashmoney #raddest #swagswagswag | 115 | 11 | 1 | 11 | 0 | 8 | 0 | 0 | 3 | 0 | 0 | 9.545455 | 0.069565 | 1.000000 | ['curently', 'shiting', 'fucking', 'pant', 'aple', 'imac', 'cash', 'money', 'rad', 'de', 'st', 'swag', 'wag', 'wag'] | curently shiting fucking pant aple imac cash money rad de st swag wag wag |
9871 | 1951 | 9872 | NaN | @codeofinterest as i said #Adobe big time we may well as include #apple to | ['codeofinterest', 'said', 'adobe', 'big', 'time', 'may', 'wel', 'include', 'aple'] | codeofinterest said adobe big time may wel include aple | ['#Adobe', '#apple'] | #Adobe #apple | 74 | 14 | 1 | 13 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 4.357143 | 0.013514 | 0.928571 | ['code', 'interest', 'said', 'adobe', 'big', 'time', 'may', 'wel', 'include', 'aple'] | code interest said adobe big time may wel include aple |
9872 | 1952 | 9873 | NaN | Finally I got it .. thanx my father .. #Samsung #galaxy #s3 #gift #father #phone #new http://instagr.am/p/NoxkiPE | ['finaly', 'got', 'thanx', 'father', 'samsung', 'galaxy', 'gift', 'father', 'phone', 'new'] | finaly got thanx father samsung galaxy gift father phone new | ['#Samsung', '#galaxy', '#s3', '#gift', '#father', '#phone', '#new'] | #Samsung #galaxy #s3 #gift #father #phone #new | 113 | 17 | 1 | 16 | 3 | 6 | 0 | 0 | 6 | 0 | 0 | 5.705882 | 0.053097 | 0.941176 | ['finaly', 'got', 'x', 'father', 'samsung', 'galaxy', 'gift', 'father', 'phone', 'new'] | finaly got x father samsung galaxy gift father phone new |
df_train['tweet_clean'].apply(len).min()
4
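The target is a binary 0/1 label, and sentiment datasets like this are often imbalanced, which matters when interpreting the precision/recall numbers later. A quick check (a sketch using the columns defined above):

# Sketch: inspect the class balance of the target before training.
print(df_train[target].value_counts(normalize=True))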
neptune.create_experiment(
name='ktrain', # keep the experiment name short
description='',
tags = ['ktrain', 'bert'],
upload_source_files=None
)
neptune.log_text('versions_dl', str(versions_dl))
neptune.log_text('versions_ds', str(versions_ds))
neptune.log_text('text processing', 'various maxlen and epochs')
maincol = 'tweet_clean' # best so far: 'tweet'
neptune.log_text('column_used', maincol)
https://ui.neptune.ai/bhishanpdl/twitter-sentiment-analysis/e/TWITSENT-15
%%time
time_start = time.time()
# 'bert' uses the uncased_L-12_H-768_A-12 checkpoint
# 'distilbert' preprocess_mode gave an error in this setup
# Parameters
MODEL_NAME = 'bert'
PARAMS = dict()
PARAMS['ngram_range'] = 1
PARAMS['max_features'] = 20000
PARAMS['maxlen'] = 300
neptune.log_text('model_name', MODEL_NAME)
for k,v in PARAMS.items():
neptune.log_metric(k,v)
(X_train, y_train), (X_valid, y_valid), preproc = \
ktrain.text.texts_from_df(df_train,
text_column = maincol,
label_columns = [target],
random_state = SEED,
ngram_range = PARAMS['ngram_range'] ,
max_features = PARAMS['max_features'],
val_df = None, # if None, 10% of train is held out for validation
maxlen = PARAMS['maxlen'], # it was 500
preprocess_mode = MODEL_NAME)
model = ktrain.text.text_classifier(name=MODEL_NAME,
train_data=(X_train, y_train),
metrics=['accuracy'],
preproc=preproc)
learner = ktrain.get_learner(model=model,
train_data=(X_train, y_train),
val_data=(X_valid, y_valid),
batch_size=6)
PARAMS_ONECYCLE = {
'lr' : 2e-5, # original value is 2e-5
'epochs': 5 # best so far was 5
}
for k,v in PARAMS_ONECYCLE.items():
neptune.log_metric(k,v)
learner.fit_onecycle(**PARAMS_ONECYCLE )
time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
neptune.log_metric('time_taken_min', m)
preprocessing train... language: en
Is Multi-Label? False
preprocessing test... language: en
Is Multi-Label? False
maxlen is 300
done.

begin training using onecycle policy with max lr of 2e-05...
Epoch 1/5
1188/1188 [==============================] - 797s 671ms/step - loss: 0.3380 - accuracy: 0.8450 - val_loss: 0.2213 - val_accuracy: 0.9053
Epoch 2/5
1188/1188 [==============================] - 790s 665ms/step - loss: 0.2325 - accuracy: 0.9003 - val_loss: 0.2186 - val_accuracy: 0.9091
Epoch 3/5
1188/1188 [==============================] - 787s 662ms/step - loss: 0.1784 - accuracy: 0.9324 - val_loss: 0.2130 - val_accuracy: 0.9179
Epoch 4/5
1188/1188 [==============================] - 785s 661ms/step - loss: 0.0885 - accuracy: 0.9689 - val_loss: 0.2753 - val_accuracy: 0.9116
Epoch 5/5
1188/1188 [==============================] - 784s 660ms/step - loss: 0.0288 - accuracy: 0.9907 - val_loss: 0.4086 - val_accuracy: 0.9104

CPU times: user 58min 50s, sys: 12min 28s, total: 1h 11min 19s
Wall time: 1h 6min 7s
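The validation loss is lowest after epoch 3 (0.2130) and then climbs while training accuracy approaches 99%, i.e. the last two epochs overfit. One option is to keep per-epoch checkpoints and reload the best one; a sketch assuming ktrain's checkpoint_folder argument to fit_onecycle (the weight filename pattern is an assumption):

# Sketch: write per-epoch checkpoints so the best epoch can be restored afterwards.
learner.fit_onecycle(PARAMS_ONECYCLE['lr'], PARAMS_ONECYCLE['epochs'],
                     checkpoint_folder='checkpoints')
# assumed filename pattern; pick the epoch with the lowest val_loss (epoch 3 here)
learner.model.load_weights('checkpoints/weights-03.hdf5')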
%%time
from neptunecontrib.api import log_table
time_start = time.time()
valid_probs2d = learner.predict(val_data=(X_valid,y_valid))
valid_preds = (valid_probs2d[:,1]>0.5).astype(int).tolist()
lst_y_valid = (y_valid[:,1].tolist())
text_model_name = "bert"
params = str(PARAMS)
model_name = ""
desc = ""
yvd = lst_y_valid
vd_preds = valid_preds
acc = metrics.accuracy_score(yvd,vd_preds)
pre = metrics.precision_score(yvd,vd_preds)
rec = metrics.recall_score(yvd,vd_preds)
f1 = metrics.f1_score(yvd,vd_preds,average='weighted')
row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
# log the validation results
log_table('df_eval', df_eval)
dic_results = {'acc':acc,
'precision':pre,
'recall':rec,
'f1': f1}
for k,v in dic_results.items():
print('valid_'+k, v)
neptune.log_metric('valid_'+k, v)
time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
neptune.log_metric('time_taken_validation_min', m)
display(df_eval)
valid_acc 0.9103535353535354
valid_precision 0.7904761904761904
valid_recall 0.8601036269430051
valid_f1 0.9115991300375775
 | Text Model | Params | Model | Description | Accuracy | Precision | Recall | F1 |
---|---|---|---|---|---|---|---|---|
0 | bert | {'lr': 2e-05, 'epochs': 5} |  |  | 0.916667 | 0.812808 | 0.854922 | 0.917368 |
1 | bert | {'ngram_range': 1, 'max_features': 20000, 'maxlen': 300} |  |  | 0.910354 | 0.790476 | 0.860104 | 0.911599 |
CPU times: user 5.43 s, sys: 831 ms, total: 6.26 s Wall time: 29.2 s
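Beyond the scalar metrics, ktrain can also print a per-class report and confusion matrix for the validation split; a minimal sketch using learner.validate (assuming it accepts val_data and class_names as in the ktrain docs):

# Sketch: per-class precision/recall plus a confusion matrix on the validation set.
cm = learner.validate(val_data=(X_valid, y_valid), class_names=preproc.get_classes())
print(cm)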
%%time
predictor = ktrain.get_predictor(learner.model, preproc)
X_test = df_test[maincol].to_numpy()
test_preds = predictor.predict(X_test,return_proba=False)
df_test[target] = test_preds
df_sub = df_test[['id','label']].copy()
# ktrain returns string class names; map them back to 0/1 for the submission file
df_sub['label'] = df_sub['label'].replace({'not_label': 0, 'label': 1})
df_sub.to_csv('sub_ktrain_tweet_clean_bert_epochs5_maxlen300.csv', index=False)
CPU times: user 5.04 s, sys: 2.11 s, total: 7.15 s Wall time: 51.8 s
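Fine-tuning took over an hour, so it is worth persisting the predictor and reloading it for later inference instead of retraining; a sketch using ktrain's predictor save/load (the directory name is illustrative):

# Sketch: save the fine-tuned predictor (model + preprocessing) and reload it later.
predictor.save('bert_tweet_predictor')
reloaded = ktrain.load_predictor('bert_tweet_predictor')
print(reloaded.predict(['I hate the new update']))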
# upload the submission file to the hackathon leaderboard and log the returned test F1 score
neptune.log_metric('test_f1', 0.877973006703751 )
best_so_far = """
bert lr=2e-5 epochs=5 ngram_range=1 maxlen=300
f1 = 0.908687336005899
n_gram=2 gave worse result
tweet_clean_emoji gave worse result
bert lr=2e-5 epochs=5 ngram_range=1 maxlen=400
f1 = 0.908265806079951
bert lr=2e-5 epochs=5 ngram_range=1 maxlen=300 maincol=tweet_clean
f1=0.877973006703751
"""
neptune.log_text('best_so_far',
best_so_far)