%%capture
# %%capture suppresses all output of this cell in the notebook

import os
import sys
# True when the `google.colab` module is already loaded in this interpreter;
# the notebook uses that as the signal that it is running on Google Colab.
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    # `!` is IPython shell-escape syntax, so this line only works when the
    # file is executed as a notebook cell, not as a plain Python script.
    !pip install transformers

    ## print
    # Confirms which environment was detected.
    print('Environment: Google Colaboratory.')

# NOTE: If modules are upgraded in Google Colab, the runtime must be restarted for the new versions to take effect.


import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import torch
import transformers as tfr

import warnings
# Silence every warning category for the rest of the session to keep the
# notebook output readable.
warnings.filterwarnings('ignore')

# Single seed reused for the train/validation split, the classifier and the
# cross-validation splitter further down.
SEED = 100

# Record the versions of the core libraries this run was executed with.
print([(lib.__name__, lib.__version__) for lib in (np, pd, sklearn, torch, tfr)])

[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('sklearn', '0.22.2.post1'), ('torch', '1.6.0+cu101'), ('transformers', '3.1.0')]


# Download the raw training split of the sentiment-analysis dataset straight
# from GitHub (`?raw=true` makes GitHub serve the CSV instead of an HTML page).
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/janatahack/sentiment_analysis/raw/train.csv?raw=true')

# Keep only the first 1000 rows: the whole subset is pushed through the
# transformer in a single forward pass later on.
df = df.iloc[:1000] # to prevent OOM

print(f"train : {df.shape}")

# Preview the first and last two rows. `DataFrame.append` was deprecated in
# pandas 1.4 and removed in 2.0; `pd.concat` yields the same frame on every
# pandas version.
display(pd.concat([df.head(2), df.tail(2)]))

# Column holding the class label and column holding the raw tweet text.
target = 'label'
maincol = 'tweet'

train : (1000, 3)


# Name of the pretrained checkpoint; the tokenizer and the encoder must be
# loaded from the same checkpoint so that token ids match the embeddings.
str_wt = 'distilbert-base-uncased'

model = tfr.DistilBertModel.from_pretrained(str_wt)
tokenizer = tfr.DistilBertTokenizer.from_pretrained(str_wt)


# Encode every tweet into a list of token ids, letting the tokenizer add the
# model's special tokens to each sequence.
ser_tokenized = df[maincol].apply(
    lambda text: tokenizer.encode(text, add_special_tokens=True)
)

# Length of the longest encoded tweet; every other sequence is padded to it.
longest = ser_tokenized.map(len).max()

# Right-pad each id list with 0 so the ragged lists stack into one
# rectangular (n_tweets, longest) array.
padded = np.array([ids + [0] * (longest - len(ids)) for ids in ser_tokenized])
print(f"padded: {padded.shape}")

padded: (1000, 129)


# Attention mask: 1 where `padded` holds a non-zero token id, 0 on the
# positions that were filled with the padding value 0.
attention_mask = (padded != 0).astype(int)
print(f"attenstion mask: {attention_mask.shape}")

attenstion mask: (1000, 129)


%%time
# run the torch model
# Take small of size otherwise we may get OOM error in Colab.
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)


# Element 0 of the model output is indexed as (row, token position, feature);
# taking token position 0 for every row gives one feature vector per tweet.
first_token_states = last_hidden_states[0][:, 0, :]
Xtrain_orig = first_token_states.numpy()

# Labels as a flat 1-D array aligned row-for-row with the features.
ytrain_orig = df[target].to_numpy().ravel()

# Stratified, seeded split into training and validation parts using
# scikit-learn's default split proportions.
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    Xtrain_orig,
    ytrain_orig,
    stratify=ytrain_orig,
    random_state=SEED,
)


from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, cross_val_predict


# Logistic-regression classifier on top of the frozen transformer features.
# NOTE: this rebinds `model`, which previously referred to the DistilBERT
# encoder; the encoder is no longer reachable under that name afterwards.
model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    random_state=SEED,
)
# Left as a bare expression so the notebook echoes the fitted estimator.
model.fit(Xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=100, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


# Stratified 3-fold splitter, kept defined for cross-validation experiments.
# It is intentionally not used for the hold-out score below.
skf = StratifiedKFold(n_splits=3,random_state=SEED,shuffle=True)

# Score the classifier that was fitted on the training split against the
# held-out validation split. The previous call,
# `cross_val_predict(model, Xvalid, yvalid, cv=skf)`, refits clones of the
# estimator on folds of the validation data only, so the model trained on
# `Xtrain` was never the one being evaluated.
vd_preds = model.predict(Xvalid)

# Weighted F1: per-class F1 averaged with class support as the weights.
f1 = metrics.f1_score(yvalid,vd_preds,average='weighted')
print(f"F1 weighted = {f1:.4f}")

F1 weighted = 0.9120

	id	label	tweet
0	1	0	#fingerprint #Pregnancy Test https://goo.gl/h1...
1	2	0	Finally a transparant silicon case ^^ Thanks t...
998	999	1	Idk if I should download Dead Nation and inFAM...
999	1000	1	I've gone thru four iPhone chargers in 3 days ...

Load the Libraries

Load the dataset

Load Pre-trained BERT

Train test split

ML Modelling