Ref: https://www.kaggle.com/madz2000/sentiment-analysis-cleaning-eda-bert-88-acc
BERT WORKING
BERT relies on a Transformer (the attention mechanism that learns contextual relationships between the words in a text). A basic Transformer consists of an encoder to read the text input and a decoder to produce a prediction for the task. Since BERT's goal is to generate a language representation model, it only needs the encoder part. The input to the encoder is a sequence of tokens, which are first converted into vectors and then processed in the neural network. Before processing can start, however, BERT needs the input to be massaged and decorated with some extra metadata:
Token embeddings: A [CLS] token is added to the input word tokens at the beginning of the first sentence and a [SEP] token is inserted at the end of each sentence.
Segment embeddings: A marker indicating Sentence A or Sentence B is added to each token. This allows the encoder to distinguish between sentences.
Positional embeddings: A positional embedding is added to each token to indicate its position in the sentence.
Ref: https://towardsml.com/2019/09/17/bert-explained-a-complete-guide-with-theory-and-tutorial/
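To make the token and segment embeddings concrete, here is a small illustrative sketch using a Hugging Face tokenizer (the bert-base-uncased checkpoint and the two example sentences are assumptions; the notebook below uses distilbert-base-uncased):
from transformers import BertTokenizer

# Illustrative sketch (bert-base-uncased is an assumed checkpoint, not the one used below).
tok = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tok("I love this phone.", "The battery died in a day.")

print(tok.convert_ids_to_tokens(enc['input_ids']))
# e.g. ['[CLS]', 'i', 'love', 'this', 'phone', '.', '[SEP]',
#       'the', 'battery', 'died', 'in', 'a', 'day', '.', '[SEP]']
print(enc['token_type_ids'])  # 0 for sentence A tokens, 1 for sentence B
# Positional embeddings are added inside the model itself, not by the tokenizer.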
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install tokenizers
    !pip install transformers
    !pip install scikit-plot
    ## print
    print('Environment: Google Colaboratory.')
    # NOTE: If we update modules in gcolab, we need to restart runtime.
import numpy as np
import pandas as pd
# settings
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
# ml
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# warnings
import warnings
warnings.filterwarnings('ignore')
# nlp
import re
# extra
import sys
from pprint import pprint
from tqdm import tqdm
tqdm.pandas()
# deep learning
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import keras
from keras.models import Model
from keras.layers import LSTM,Dense,Bidirectional,Input
import torch
import transformers
import tokenizers
from tokenizers import BertWordPieceTokenizer
# versions
pprint([(x.__name__,x.__version__) for x in
[np,pd,sklearn,tf,keras,torch,transformers,tokenizers]])
[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('sklearn', '0.22.2.post1'), ('tensorflow', '2.3.0'), ('keras', '2.4.3'), ('torch', '1.6.0+cu101'), ('transformers', '3.1.0'), ('tokenizers', '0.8.1.rc2')]
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/janatahack/sentiment_analysis/raw/train.csv?raw=true')
df = df.iloc[:1000] # to prevent OOM
print(f"train : {df.shape}")
display(df.head(2).append(df.tail(2)))
target = 'label'
maincol = 'tweet'
train : (1000, 3)
|     | id   | label | tweet |
|-----|------|-------|-------|
| 0   | 1    | 0     | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone |
| 1   | 2    | 0     | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ |
| 998 | 999  | 1     | Idk if I should download Dead Nation and inFAMOUS and never play either one, or just not download anything #Sony #PSN #Playstation |
| 999 | 1000 | 1     | I've gone thru four iPhone chargers in 3 days #boss #badass |
maincol = 'tweet'
target = 'label'
mc = maincol + '_clean'
import re
def process_text(text):
    out = re.sub(r'<[^>]+>', '', text)         # remove html tags
    out = re.sub('[^a-zA-Z]', ' ', out)        # keep only alphabetic characters
    out = re.sub(r"\s+[a-zA-Z]\s+", ' ', out)  # remove single letters
    out = re.sub(r'\s+', ' ', out)             # collapse multiple spaces
    return out
s = pd.Series(df[maincol][1])
s.progress_apply(process_text)
100%|██████████| 1/1 [00:00<00:00, 494.15it/s]
0    Finally transparant silicon case Thanks to my uncle yay Sony Xperia sonyexperias http instagram com YGEt JC JM
dtype: object
df[mc] = df[maincol].progress_apply(process_text)
100%|██████████| 1000/1000 [00:00<00:00, 33723.32it/s]
from sklearn.model_selection import train_test_split
ser_Xtrain, ser_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df[mc], df[target],
    shuffle=True,
    random_state=SEED,
    stratify=df[target])
Xtrain = ser_Xtrain.to_numpy().ravel()
Xtest = ser_Xtest.to_numpy().ravel()
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
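A quick sanity check of the split (an illustrative sketch; with the default test_size=0.25 the 1000 rows become 750 train and 250 test):
# Sanity check (illustrative): split sizes and label balance preserved by stratify.
print(Xtrain.shape, Xtest.shape)  # (750,) (250,) with the default test_size=0.25
print(pd.Series(ytrain).value_counts(normalize=True))
print(pd.Series(ytest).value_counts(normalize=True))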
import transformers
from tokenizers import BertWordPieceTokenizer
class_tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                                   do_lower_case=True)
# Save the loaded tokenizer locally
class_tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=True)
tokenizer
Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)
# pprint([i for i in dir(tokenizer) if i[0]!='_'],max_seq_len=80)
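As a quick check that the reloaded WordPiece tokenizer behaves as expected (an illustrative sketch; the sample string is made up):
# Illustrative check: the fast tokenizer adds [CLS]/[SEP] and returns integer ids.
enc = tokenizer.encode("finally a transparent silicon case")
print(enc.tokens)  # e.g. ['[CLS]', 'finally', 'a', 'transparent', 'silicon', 'case', '[SEP]']
print(enc.ids)     # the corresponding vocabulary ids fed to the model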
def do_encode(texts, tokenizer, chunk_size=256, maxlen=400):
    """Tokenize texts in chunks and return an array of token ids."""
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding()  # pad each batch to its longest sequence
    all_ids = []
    for i in range(0, len(texts), chunk_size):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    return np.array(all_ids)
Xtrain = do_encode(Xtrain, tokenizer, maxlen=400)
Xtest = do_encode(Xtest, tokenizer, maxlen=400)
from keras.layers import Input,Dense
from tensorflow.keras.optimizers import Adam
from keras.models import Model
def build_model(transformer, max_len=400):
    # token-id input of fixed length
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    # last hidden states of the transformer: (batch, max_len, hidden_size)
    sequence_output = transformer(input_word_ids)[0]
    # use the hidden state of the first ([CLS]) token as the sentence representation
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=2e-5), loss='binary_crossentropy', metrics=['accuracy'])
    return model
bert_model = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
model = build_model(bert_model, max_len=400)
model.summary()
Model: "functional_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_word_ids (InputLayer) [(None, 400)] 0 _________________________________________________________________ tf_distil_bert_model (TFDist ((None, 400, 768),) 66362880 _________________________________________________________________ tf_op_layer_strided_slice_1 [(None, 768)] 0 _________________________________________________________________ dense_1 (Dense) (None, 1) 769 ================================================================= Total params: 66,363,649 Trainable params: 66,363,649 Non-trainable params: 0 _________________________________________________________________
y_pred = model.predict(Xtest)
y_pred = np.round(y_pred).astype(int)
from sklearn import metrics
from sklearn.model_selection import cross_val_predict, StratifiedKFold
df_eval = pd.DataFrame({
    'Text Model': [],
    'Params': [],
    'Model': [],
    'Description': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
})
text_model_name = "distilbert"
params = "distilbert-base-uncased"
model_name = "keras"
desc = "dense layer=1, Adam(lr=2e-5)"
Xvd = Xtest
yvd = ytest
vd_preds = y_pred
acc = metrics.accuracy_score(yvd, vd_preds)
pre = metrics.precision_score(yvd, vd_preds)
rec = metrics.recall_score(yvd, vd_preds)
f1 = metrics.f1_score(yvd, vd_preds, average='weighted')  # weighted F1, unlike the binary precision/recall above
row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
df_eval
|   | Text Model | Params | Model | Description | Accuracy | Precision | Recall | F1 |
|---|------------|--------|-------|-------------|----------|-----------|--------|----|
| 0 | distilbert | distilbert-base-uncased | keras | dense layer=1, Adam(lr=2e-5) | 0.684 | 0.318182 | 0.222222 | 0.663583 |