%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
## install modules
!pip install transformers
## print
print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as tfr
import warnings
# Silence every warning for the rest of the session. Be aware that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings(action='ignore')

# Single seed reused by the randomized steps of this notebook.
SEED = 100

# Record the library versions this notebook was run with.
print([(lib.__name__, lib.__version__) for lib in (np, pd, sklearn, torch, tfr)])
[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('sklearn', '0.22.2.post1'), ('torch', '1.6.0+cu101'), ('transformers', '3.1.0')]
# Load the training CSV straight from GitHub (`?raw=true` serves the raw file).
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/janatahack/sentiment_analysis/raw/train.csv?raw=true')
df = df.iloc[:1000] # keep only the first 1000 rows to prevent OOM
print(f"train : {df.shape}")

# Preview the first and last two rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat builds the same frame on every
# pandas version.
display(pd.concat([df.head(2), df.tail(2)]))

# Column holding the class label, and column holding the raw text.
target = 'label'
maincol = 'tweet'
train : (1000, 3)
| | id | label | tweet |
---|---|---|---|
0 | 1 | 0 | #fingerprint #Pregnancy Test https://goo.gl/h1... |
1 | 2 | 0 | Finally a transparant silicon case ^^ Thanks t... |
998 | 999 | 1 | Idk if I should download Dead Nation and inFAM... |
999 | 1000 | 1 | I've gone thru four iPhone chargers in 3 days ... |
# Name of the pretrained checkpoint; the tokenizer and the encoder are loaded
# from the same name so that vocabulary and weights match.
str_wt = 'distilbert-base-uncased'

# Load the tokenizer first, then the encoder, from that checkpoint.
tokenizer, model = (
    loader.from_pretrained(str_wt)
    for loader in (tfr.DistilBertTokenizer, tfr.DistilBertModel)
)
# Tokenize every text into a list of vocabulary ids, including the
# tokenizer's special tokens. truncation=True caps each sequence at the
# tokenizer's maximum model input length, so an unusually long text cannot
# produce more positions than the encoder accepts.
ser_tokenized = df[maincol].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True)
)

# Right-pad every id list to the length of the longest one so that they stack
# into a rectangular array. The padding value is the tokenizer's own pad id
# (0 for the BERT-style vocabulary loaded above) rather than a literal.
longest = ser_tokenized.apply(len).max()
padded = np.array([
    ids + [tokenizer.pad_token_id] * (longest - len(ids))
    for ids in ser_tokenized.to_numpy()
])
print(f"padded: {padded.shape}")
padded: (1000, 129)
# Attention mask: 1 where `padded` holds a real token, 0 where it holds
# padding, so the encoder ignores the padded positions. Padding is recognised
# by the tokenizer's pad id (0 for the BERT-style vocabulary loaded above)
# instead of a hard-coded literal.
attention_mask = np.where(padded != tokenizer.pad_token_id, 1, 0)
print(f"attenstion mask: {attention_mask.shape}")
attenstion mask: (1000, 129)
%%time
# run the torch model
# Take small of size otherwise we may get OOM error in Colab.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)
with torch.no_grad():
last_hidden_states = model(input_ids, attention_mask=attention_mask)
Xtrain_orig = last_hidden_states[0][:,0,:].numpy()
# Labels as a flat 1-D array, row-aligned with the feature matrix.
ytrain_orig = np.ravel(df[target].to_numpy())

# Hold-out split with scikit-learn's default sizes; stratifying on the labels
# keeps the class ratio the same in both parts, and the seed makes it
# reproducible.
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    Xtrain_orig,
    ytrain_orig,
    stratify=ytrain_orig,
    random_state=SEED,
)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, cross_val_predict
# Linear classifier trained on the sentence features.
# NOTE: this rebinds `model`, which until this point referred to the
# DistilBERT encoder; re-running the encoder cell afterwards requires
# reloading the encoder first.
model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    random_state=SEED,
)
model.fit(Xtrain, ytrain)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=-1, penalty='l2', random_state=100, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
# Stratified 3-fold splitter. It is not used for the hold-out score below and
# is only kept so that code referencing `skf` keeps working.
skf = StratifiedKFold(n_splits=3,random_state=SEED,shuffle=True)

# Score the classifier that was fitted on the training split against the
# untouched validation split. cross_val_predict(model, Xvalid, yvalid) would
# instead refit fresh clones on folds of the validation data alone, so the
# model trained on Xtrain would never be evaluated.
vd_preds = model.predict(Xvalid)
f1 = metrics.f1_score(yvalid, vd_preds, average='weighted')
print(f"F1 weighted = {f1:.4f}")
F1 weighted = 0.9120