%%capture
# %%capture suppresses this cell's output (the pip install logs) in the notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install bert-for-tf2
    !pip install sentencepiece

    ## print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
import tensorflow_hub as hub
import keras
from tensorflow.keras import layers
import bert
from pprint import pprint
pprint([(x.__name__,x.__version__) for x in [tf, hub, keras,bert]])
[('tensorflow', '2.3.0'), ('tensorflow_hub', '0.9.0'), ('keras', '2.4.3'), ('bert', '0.14.6')]
import numpy as np
import pandas as pd
from pprint import pprint
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
import sys
import re
from tqdm import tqdm
tqdm.pandas()
pprint([(x.__name__,x.__version__) for x in [np,pd]])
[('numpy', '1.18.5'), ('pandas', '1.0.5')]
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/janatahack/sentiment_analysis/raw/train.csv?raw=true')
df = df.iloc[:1000] # to prevent OOM
print(f"train : {df.shape}")
display(df.head(2).append(df.tail(2)))
target = 'label'
maincol = 'tweet'
train : (1000, 3)
|     | id   | label | tweet |
|-----|------|-------|-------|
| 0   | 1    | 0     | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone |
| 1   | 2    | 0     | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ |
| 998 | 999  | 1     | Idk if I should download Dead Nation and inFAMOUS and never play either one, or just not download anything #Sony #PSN #Playstation |
| 999 | 1000 | 1     | I've gone thru four iPhone chargers in 3 days #boss #badass |
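Before any modeling, the balance of the binary target is worth a quick look (a hedged sanity check, not part of the original output; the exact proportions depend on the 1000-row slice):

# rough class-balance check for the binary target
print(df[target].value_counts(normalize=True))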
maincol = 'tweet'
target = 'label'
mc = maincol + '_clean'
tmc = 'tokenized_' + mc
'tokenized_tweet_clean'
import re
def process_text(text):
    out = re.sub(r'<[^>]+>', '', text)         # remove html tags
    out = re.sub('[^a-zA-Z]', ' ', out)        # keep only alpha
    out = re.sub(r"\s+[a-zA-Z]\s+", ' ', out)  # remove single letters
    out = re.sub(r'\s+', ' ', out)             # remove multiple spaces
    return out
s = pd.Series(df[maincol][1])
s.progress_apply(process_text)
100%|██████████| 1/1 [00:00<00:00, 600.99it/s]
0    Finally transparant silicon case Thanks to my uncle yay Sony Xperia sonyexperias http instagram com YGEt JC JM
dtype: object
df[mc] = df[maincol].progress_apply(process_text)
100%|██████████| 1000/1000 [00:00<00:00, 26317.70it/s]
df.head(2)
|   | id | label | tweet | tweet_clean |
|---|----|-------|-------|-------------|
| 0 | 1  | 0     | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | fingerprint Pregnancy Test https goo gl MfQV android apps beautiful cute health igers iphoneonly iphonesia iphone |
| 1 | 2  | 0     | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | Finally transparant silicon case Thanks to my uncle yay Sony Xperia sonyexperias http instagram com YGEt JC JM |
X = df[mc].to_list()
labels = df[target].to_list()
%%time
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)
CPU times: user 9.12 s, sys: 1.71 s, total: 10.8 s
Wall time: 17.4 s
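The tokenizer is built from the vocabulary file bundled with the hub module. A hedged sanity check (the asset path is environment-specific, and 30522 is the documented vocabulary size of bert_en_uncased):

print(vocabulary_file)       # local path of the vocab.txt asset downloaded with the hub module
print(len(tokenizer.vocab))  # expected to be 30522 for bert_en_uncased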
tokenizer.tokenize("nlp is fun.")
['nl', '##p', 'is', 'fun', '.']
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("nlp is fun."))
[17953, 2361, 2003, 4569, 1012]
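The id mapping can also be reversed; the underlying BERT tokenizer exposes convert_ids_to_tokens (a minimal sketch, not part of the original run):

tokenizer.convert_ids_to_tokens([17953, 2361, 2003, 4569, 1012])
# expected: ['nl', '##p', 'is', 'fun', '.']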
def encode_sentence(text):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
df[tmc] = df[mc].progress_apply(encode_sentence)
100%|██████████| 1000/1000 [00:00<00:00, 3229.76it/s]
# shuffle the data
df = df.sample(frac=1,random_state=SEED)
# character length of the cleaned text (a rough proxy for token count)
df['length'] = df[mc].progress_apply(len)
100%|██████████| 1000/1000 [00:00<00:00, 123459.92it/s]
df.head(2)
|     | id  | label | tweet | tweet_clean | tokenized_tweet_clean | length |
|-----|-----|-------|-------|-------------|-----------------------|--------|
| 249 | 250 | 0     | look what mommy got 4 me :-p #igers #ipad #apple #mom #gifts #instahub #instadaily #blessed #fo http://instagr.am/p/Uor2Uoh6C0/ | look what mommy got me igers ipad apple mom gifts instahub instadaily blessed fo http instagr am Uor Uoh | [2298, 2054, 20565, 2288, 2033, 1045, 15776, 25249, 6207, 3566, 9604, 16021, 2696, 6979, 2497, 16021, 17713, 12502, 2100, 10190, 1042, 2080, 8299, 16021, 15900, 2099, 2572, 1057, 2953, 1057, 11631] | 105 |
| 353 | 354 | 0     | Top Photo App Available now >> https://itunes.apple.com/us/app/love360/id809353957?mt=8 … #photography #usa #woman poem #valentine #iphone #kärlek 872 | Top Photo App Available now https itunes apple com us app love id mt photography usa woman poem valentine iphone rlek | [2327, 6302, 10439, 2800, 2085, 16770, 11943, 6207, 4012, 2149, 10439, 2293, 8909, 11047, 5855, 3915, 2450, 5961, 10113, 18059, 1054, 2571, 2243] | 118 |
# sort data by length
df = df.sort_values('length')
df.head(2)
|     | id  | label | tweet | tweet_clean | tokenized_tweet_clean | length |
|-----|-----|-------|-------|-------------|-----------------------|--------|
| 282 | 283 | 1     | 1 hour usage and phones down 13% ... Thanks #iPhone #Apple | hour usage and phones down Thanks iPhone Apple | [3178, 8192, 1998, 11640, 2091, 4283, 18059, 6207] | 47 |
| 575 | 576 | 0     | An awesome phablet ... note 5 duos. #Note7 note 5 ... #Samsung ... | An awesome phablet note duos Note note Samsung | [2019, 12476, 6887, 3085, 2102, 3602, 6829, 2015, 3602, 3602, 19102] | 47 |
# use positional indexing (.iloc) so the length-sorted order is preserved
sorted_data_labels = [(df[tmc].iloc[i], df[target].iloc[i]) for i in range(len(df))]
sorted_data_labels[0][0][:5]
[4344, 16550, 10032, 3231, 16770]
# tf2 compatible dataset
processed_dataset = tf.data.Dataset.from_generator(
    lambda: sorted_data_labels, output_types=(tf.int32, tf.int32))
# pad each batch to its own longest sequence (more memory-efficient than padding the whole dataset)
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
# see the first batch
# next(iter(batched_dataset))
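To confirm that the padding is per batch, one batch can be pulled and its shape inspected (a sketch; the second dimension is the length of the longest sequence in that particular batch):

sample_texts, sample_labels = next(iter(batched_dataset))
print(sample_texts.shape, sample_labels.shape)  # e.g. (32, longest_seq_in_batch) and (32,)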
TOTAL_BATCHES = int(np.ceil(len(sorted_data_labels) / BATCH_SIZE))
TEST_BATCHES = TOTAL_BATCHES // 10
# shuffle() returns a new dataset, so assign the result; fix the shuffle order so that
# take()/skip() below give a consistent, non-overlapping train/test split
batched_dataset = batched_dataset.shuffle(TOTAL_BATCHES, seed=SEED,
                                          reshuffle_each_iteration=False)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)
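Because the dataset is built from a Python generator, tf.data does not know its cardinality up front; a quick (hedged) way to verify the split sizes is to iterate once:

print(sum(1 for _ in test_data), 'test batches')
print(sum(1 for _ in train_data), 'train batches')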
class TEXT_MODEL(tf.keras.Model):
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l)
        l_1 = self.pool(l_1)
        l_2 = self.cnn_layer2(l)
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3)

        concatenated = tf.concat([l_1, l_2, l_3], axis=-1)  # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        return model_output
# Hyperparameters
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 300
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2
DROPOUT_RATE = 0.1
NB_EPOCHS = 10
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])
%%time
# Note: use a GPU runtime for faster training
text_model.fit(train_data, epochs=NB_EPOCHS)
Epoch 1/10
29/29 [==============================] - 3s 87ms/step - loss: 0.4916 - accuracy: 0.7400
Epoch 2/10
29/29 [==============================] - 2s 84ms/step - loss: 0.2866 - accuracy: 0.9082
Epoch 3/10
29/29 [==============================] - 2s 86ms/step - loss: 0.1180 - accuracy: 0.9723
Epoch 4/10
29/29 [==============================] - 2s 86ms/step - loss: 0.0163 - accuracy: 0.9989
Epoch 5/10
29/29 [==============================] - 3s 86ms/step - loss: 0.0032 - accuracy: 1.0000
Epoch 6/10
29/29 [==============================] - 2s 86ms/step - loss: 0.0011 - accuracy: 1.0000
Epoch 7/10
29/29 [==============================] - 2s 85ms/step - loss: 6.4176e-04 - accuracy: 1.0000
Epoch 8/10
29/29 [==============================] - 2s 85ms/step - loss: 4.2553e-04 - accuracy: 1.0000
Epoch 9/10
29/29 [==============================] - 3s 86ms/step - loss: 3.1922e-04 - accuracy: 1.0000
Epoch 10/10
29/29 [==============================] - 2s 86ms/step - loss: 2.3658e-04 - accuracy: 1.0000
CPU times: user 45.3 s, sys: 989 ms, total: 46.3 s
Wall time: 26.7 s
<tensorflow.python.keras.callbacks.History at 0x7fa5f6dba4a8>
results = text_model.evaluate(test_data)
print(results)
3/3 [==============================] - 0s 7ms/step - loss: 0.4582 - accuracy: 0.8854
[0.458150714635849, 0.8854166865348816]
y_test = []
for element in test_data.as_numpy_iterator():
    y_test += element[1].tolist()
y_test[:5]
[0, 0, 0, 0, 1]
y_pred = text_model.predict(test_data)
y_pred = (y_pred > 0.5).astype('int').ravel().tolist()
y_pred[:5]
[0, 0, 0, 0, 1]
from sklearn import metrics
df_eval = pd.DataFrame({
    'Text Model': [],
    'Params': [],
    'Model': [],
    'Description': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
})
text_model_name = "bert"
params = "bert_en_uncased_L-12_H-768_A-12"
model_name = "tf"
desc = "3 layers"
yvd = y_test
vd_preds = y_pred
acc = metrics.accuracy_score(yvd, vd_preds)
pre = metrics.precision_score(yvd, vd_preds)
rec = metrics.recall_score(yvd, vd_preds)
f1 = metrics.f1_score(yvd, vd_preds, average='weighted')  # weighted F1, unlike the binary precision/recall above

row = [text_model_name, params, model_name, desc]
row = row + [acc, pre, rec, f1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
df_eval
|   | Text Model | Params | Model | Description | Accuracy | Precision | Recall | F1 |
|---|------------|--------|-------|-------------|----------|-----------|--------|----|
| 0 | bert | bert_en_uncased_L-12_H-768_A-12 | tf | 3 layers | 0.885417 | 0.807692 | 0.777778 | 0.884748 |
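For a closer look at the errors behind these aggregate numbers, a confusion matrix and per-class report can be printed from the same predictions (a sketch reusing y_test and y_pred from above):

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))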