Ref: https://www.kaggle.com/madz2000/sentiment-analysis-cleaning-eda-bert-88-acc
BERT WORKING
BERT relies on a Transformer (the attention mechanism that learns contextual relationships between the words in a text). A basic Transformer consists of an encoder to read the text input and a decoder to produce a prediction for the task. Since BERT's goal is to generate a language representation model, it only needs the encoder part. The input to the encoder is a sequence of tokens, which are first converted into vectors and then processed in the neural network. Before processing can start, however, BERT needs the input to be massaged and decorated with some extra metadata:
Token embeddings: A [CLS] token is added to the input word tokens at the beginning of the first sentence and a [SEP] token is inserted at the end of each sentence.
Segment embeddings: A marker indicating Sentence A or Sentence B is added to each token. This allows the encoder to distinguish between sentences.
Positional embeddings: A positional embedding is added to each token to indicate its position in the sentence.
Ref: https://towardsml.com/2019/09/17/bert-explained-a-complete-guide-with-theory-and-tutorial/
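To make the token and segment embeddings concrete, here is a small illustrative sketch using a Hugging Face tokenizer (the bert-base-uncased checkpoint and the two example sentences are assumptions; the notebook below uses distilbert-base-uncased):
from transformers import BertTokenizer

# Illustrative sketch (bert-base-uncased is an assumed checkpoint, not the one used below).
tok = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tok("I love this phone.", "The battery died in a day.")

print(tok.convert_ids_to_tokens(enc['input_ids']))
# e.g. ['[CLS]', 'i', 'love', 'this', 'phone', '.', '[SEP]',
#       'the', 'battery', 'died', 'in', 'a', 'day', '.', '[SEP]']
print(enc['token_type_ids'])  # 0 for sentence A tokens, 1 for sentence B
# Positional embeddings are added inside the model itself, not by the tokenizer.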
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install tokenizers
    !pip install transformers
    !pip install scikit-plot
    ## print
    print('Environment: Google Colaboratory.')
    # NOTE: If we update modules in gcolab, we need to restart runtime.
import numpy as np
import pandas as pd
# settings
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
# ml
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# warnings
import warnings
warnings.filterwarnings('ignore')
# nlp
import re
# extra
import sys
from pprint import pprint
from tqdm import tqdm
tqdm.pandas()
# deep learning
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import keras
from keras.models import Model
from keras.layers import LSTM,Dense,Bidirectional,Input
import torch
import transformers
import tokenizers
from tokenizers import BertWordPieceTokenizer
# versions
pprint([(x.__name__,x.__version__) for x in
[np,pd,sklearn,tf,keras,torch,transformers,tokenizers]])
[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('sklearn', '0.22.2.post1'), ('tensorflow', '2.3.0'), ('keras', '2.4.3'), ('torch', '1.6.0+cu101'), ('transformers', '3.1.0'), ('tokenizers', '0.8.1.rc2')]
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/janatahack/sentiment_analysis/raw/train.csv?raw=true')
df = df.iloc[:1000] # to prevent OOM
print(f"train : {df.shape}")
display(df.head(2).append(df.tail(2)))
target = 'label'
maincol = 'tweet'
train : (1000, 3)
|     | id   | label | tweet |
|-----|------|-------|-------|
| 0   | 1    | 0     | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone |
| 1   | 2    | 0     | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ |
| 998 | 999  | 1     | Idk if I should download Dead Nation and inFAMOUS and never play either one, or just not download anything #Sony #PSN #Playstation |
| 999 | 1000 | 1     | I've gone thru four iPhone chargers in 3 days #boss #badass |
maincol = 'tweet'
target = 'label'
mc = maincol + '_clean'
import re
def process_text(text):
    out = re.sub(r'<[^>]+>', '', text)         # remove html tags
    out = re.sub('[^a-zA-Z]', ' ', out)        # keep only alphabetic characters
    out = re.sub(r"\s+[a-zA-Z]\s+", ' ', out)  # remove single letters
    out = re.sub(r'\s+', ' ', out)             # collapse multiple spaces
    return out
s = pd.Series(df[maincol][1])
s.progress_apply(process_text)
100%|██████████| 1/1 [00:00<00:00, 494.15it/s]
0    Finally transparant silicon case Thanks to my uncle yay Sony Xperia sonyexperias http instagram com YGEt JC JM
dtype: object
df[mc] = df[maincol].progress_apply(process_text)
100%|██████████| 1000/1000 [00:00<00:00, 33723.32it/s]
from sklearn.model_selection import train_test_split
ser_Xtrain, ser_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df[mc], df[target],
    shuffle=True,
    random_state=SEED,
    stratify=df[target])
Xtrain = ser_Xtrain.to_numpy().ravel()
Xtest = ser_Xtest.to_numpy().ravel()
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
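A quick sanity check of the split (an illustrative sketch; with the default test_size=0.25 the 1000 rows become 750 train and 250 test):
# Sanity check (illustrative): split sizes and label balance preserved by stratify.
print(Xtrain.shape, Xtest.shape)  # (750,) (250,) with the default test_size=0.25
print(pd.Series(ytrain).value_counts(normalize=True))
print(pd.Series(ytest).value_counts(normalize=True))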
import transformers
from tokenizers import BertWordPieceTokenizer
class_tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                                   do_lower_case=True)
# Save the loaded tokenizer locally
class_tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=True)
tokenizer
Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)
# pprint([i for i in dir(tokenizer) if i[0]!='_'],max_seq_len=80)
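As a quick check that the reloaded WordPiece tokenizer behaves as expected (an illustrative sketch; the sample string is made up):
# Illustrative check: the fast tokenizer adds [CLS]/[SEP] and returns integer ids.
enc = tokenizer.encode("finally a transparent silicon case")
print(enc.tokens)  # e.g. ['[CLS]', 'finally', 'a', 'transparent', 'silicon', 'case', '[SEP]']
print(enc.ids)     # the corresponding vocabulary ids fed to the model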
def do_encode(texts, tokenizer, chunk_size=256, maxlen=400):
    """Tokenize texts in chunks and return an array of token ids."""
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding()  # pad each batch to its longest sequence
    all_ids = []
    for i in range(0, len(texts), chunk_size):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    return np.array(all_ids)
Xtrain = do_encode(Xtrain, tokenizer, maxlen=400)
Xtest = do_encode(Xtest, tokenizer, maxlen=400)
from keras.layers import Input,Dense
from tensorflow.keras.optimizers import Adam
from keras.models import Model
def build_model(transformer, max_len=400):
    # token-id input of fixed length
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    # last hidden states of the transformer: (batch, max_len, hidden_size)
    sequence_output = transformer(input_word_ids)[0]
    # use the hidden state of the first ([CLS]) token as the sentence representation
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=2e-5), loss='binary_crossentropy', metrics=['accuracy'])
    return model
bert_model = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
model = build_model(bert_model, max_len=400)
model.summary()
Model: "functional_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_word_ids (InputLayer) [(None, 400)] 0 _________________________________________________________________ tf_distil_bert_model (TFDist ((None, 400, 768),) 66362880 _________________________________________________________________ tf_op_layer_strided_slice_1 [(None, 768)] 0 _________________________________________________________________ dense_1 (Dense) (None, 1) 769 ================================================================= Total params: 66,363,649 Trainable params: 66,363,649 Non-trainable params: 0 _________________________________________________________________
y_pred = model.predict(Xtest)
y_pred = np.round(y_pred).astype(int)
from sklearn import metrics
from sklearn.model_selection import cross_val_predict, StratifiedKFold
df_eval = pd.DataFrame({
    'Text Model': [],
    'Params': [],
    'Model': [],
    'Description': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
})
text_model_name = "distilbert"
params = "distilbert-base-uncased"
model_name = "keras"
desc = "dense layer=1, Adam(lr=2e-5)"
Xvd = Xtest
yvd = ytest
vd_preds = y_pred
acc = metrics.accuracy_score(yvd, vd_preds)
pre = metrics.precision_score(yvd, vd_preds)
rec = metrics.recall_score(yvd, vd_preds)
f1 = metrics.f1_score(yvd, vd_preds, average='weighted')  # weighted F1, unlike the binary precision/recall above
row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
df_eval
|   | Text Model | Params | Model | Description | Accuracy | Precision | Recall | F1 |
|---|------------|--------|-------|-------------|----------|-----------|--------|----|
| 0 | distilbert | distilbert-base-uncased | keras | dense layer=1, Adam(lr=2e-5) | 0.684 | 0.318182 | 0.222222 | 0.663583 |