Ref: https://github.com/ThilinaRajapakse/simpletransformers/
%%capture
# %%capture suppresses all cell output (pip logs, prints) in the notebook
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install simpletransformers
    !pip install wandb
    ## print
    print('Environment: Google Colaboratory.')
# NOTE: If we update modules in Colab, we need to restart the runtime.
import os
# Set your W&B API key before init (left blank here intentionally):
# os.environ["WANDB_API_KEY"] = ""
import wandb
wandb.init(project="sentiment-analysis")
W&B Run: https://app.wandb.ai/bhishanpdl/sentiment-analysis/runs/il4poh36
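If WANDB_API_KEY is not set, an interactive login is an alternative to hard-coding the key in the notebook; a minimal Colab-friendly sketch:

!wandb login   # prompts for the API key instead of embedding it in code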
import numpy as np
import pandas as pd
from pprint import pprint
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
import sys
import re
from tqdm import tqdm
tqdm.pandas()
versions_ds = [(x.__name__,x.__version__) for x in [np,pd]]
pprint(versions_ds)
[('numpy', '1.18.5'), ('pandas', '1.0.5')]
p = 'https://github.com/bhishanpdl/Datasets/blob/master/AV_Hackathons/sentiment_analysis/raw/'
df_train = pd.read_csv(p + 'train.csv?raw=true')
df_test = pd.read_csv(p + 'test.csv?raw=true')
print(f"train : {df_train.shape}")
print(f"test : {df_test.shape}")
display(df_train.head(2).append(df_train.tail(2)))
target = 'label'
maincol = 'tweet'
train : (7920, 3)
test  : (1953, 2)
|      | id   | label | tweet |
|------|------|-------|-------|
| 0    | 1    | 0     | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone |
| 1    | 2    | 0     | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ |
| 7918 | 7919 | 0     | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ |
| 7919 | 7920 | 0     | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ |
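Before modeling, it helps to know how imbalanced the target is; a quick check on the raw frame:

# Share of each class in the target (label 1 appears to mark negative tweets,
# e.g. the Apple-support complaint shown in the cleaned frame below).
print(df_train[target].value_counts(normalize=True))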
# drop the id column, rename tweet -> text, and put the text column first
del df_train['id']
df_train.columns = ['label', 'text']
df_train = df_train[['text', 'label']]
df_train.head()
|   | text | label |
|---|------|-------|
| 0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 0 |
| 1 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | 0 |
| 2 | We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu | 0 |
| 3 | I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/ | 0 |
| 4 | What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support! | 1 |
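The `re` import and `tqdm.pandas()` registered earlier go unused in this section; a minimal cleaning pass they would support is sketched here (`clean_tweet` is a hypothetical helper, not part of the original pipeline):

def clean_tweet(text):
    """Hypothetical cleanup: strip URLs and collapse repeated whitespace."""
    text = re.sub(r'https?://\S+', ' ', text)  # drop links
    return re.sub(r'\s+', ' ', text).strip()   # normalize spaces

# progress_apply is provided by the tqdm.pandas() call above
# df_train['text'] = df_train['text'].progress_apply(clean_tweet)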
df_test.head(2)
id | tweet | |
---|---|---|
0 | 7921 | I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks |
1 | 7922 | currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/ |
# hold out half of train as an optional evaluation set
df_eval = df_train.sample(frac=0.5, random_state=SEED)
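df_eval is sampled here but only consumed if evaluation during training is switched on; a sketch of how it would plug in, assuming the train_args and model defined in the next cells:

# Hypothetical variant: evaluate on the held-out sample while training.
# train_args["evaluate_during_training"] = True
# model.train_model(df_train, eval_df=df_eval)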
Ref: see the simpletransformers repo linked at the top.
Available models (model_type shorthand and its pretrained checkpoint):
"bert": "bert-base-cased"
"roberta": "roberta-base"
"distilbert": "distilbert-base-cased"
"distilroberta": runs under model_type "roberta"
"electra-base": runs under model_type "electra"
"electra-small": runs under model_type "electra"
"xlnet": "xlnet-base-cased"
# NOTE: XLNet uses too much GPU memory; halve its batch size and compensate
# with gradient accumulation (applied after train_args is defined below).
from simpletransformers.classification import ClassificationModel
!rm -rf outputs
model_type = 'xlnet'
model_name = 'xlnet-base-cased'
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    "output_dir": f"outputs/{model_type}",
    "best_model_dir": f"outputs/{model_type}/best_model",
    "train_batch_size": 128,  # halved to 64 for xlnet below
    "max_seq_length": 128,    # 256 gives OOM
    "num_train_epochs": 3,
    # evaluation
    "evaluate_during_training": False,
    "evaluate_during_training_steps": 1000,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "eval_batch_size": 64,
    "gradient_accumulation_steps": 1,  # doubled for xlnet below
}
train_args["wandb_project"] = "sentiment-analysis"
train_args["wandb_kwargs"] = {"name": model_name}
if model_type == "xlnet":
train_args["train_batch_size"] = 64
train_args["gradient_accumulation_steps"] = 2
%%time
model = ClassificationModel(model_type, model_name, args=train_args)
model.train_model(df_train, eval_df=None)
test_preds, _ = model.predict(df_test['tweet'].tolist())
df_test[target] = test_preds
df_sub = df_test[['id','label']]
df_sub.to_csv(f'sub_simpletransformers_{model_type}.csv', index=False)
/usr/local/lib/python3.6/dist-packages/transformers/configuration_xlnet.py:211: FutureWarning:
  This config doesn't use attention memories, a core feature of XLNet. Consider setting `mem_len`
  to a non-zero value, for example `xlnet = XLNetLMHeadModel.from_pretrained('xlnet-base-cased', mem_len=1024)`,
  for accurate training performance as well as an order of magnitude faster inference. Starting
  from version 3.5.0, the default parameter will be 1024, following the implementation in
  https://arxiv.org/abs/1906.08237

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing
XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of
  a model trained on another task or with another architecture (e.g. initializing a
  BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint
  of a model that you expect to be exactly identical (initializing a BertForSequenceClassification
  model from a BertForSequenceClassification model).

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at
xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight',
'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions
and inference.

/usr/local/lib/python3.6/dist-packages/simpletransformers/classification/classification_model.py:304: UserWarning:
  Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels.

/usr/local/lib/python3.6/dist-packages/transformers/modeling_xlnet.py:304: UserWarning:
  Mixed memory format inputs detected while calling the operator. The operator will output
  contiguous tensor even if some of the inputs are in channels_last format.
  (Triggered internally at /pytorch/aten/src/ATen/native/TensorIterator.cpp:918.)
  attn_score = (ac + bd + ef) * self.scale

/usr/local/lib/python3.6/dist-packages/torch/optim/lr_scheduler.py:231: UserWarning:
  To get the last learning rate computed by the scheduler, please use `get_last_lr()`.
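The "Dataframe headers not specified" warning above is harmless here, since the text and label already sit in columns 0 and 1, but it disappears if the frame uses the column names simpletransformers looks for. A one-line fix, applied before train_model:

# simpletransformers picks up columns named exactly 'text' and 'labels'.
df_train = df_train.rename(columns={'label': 'labels'})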
CPU times: user 4min 43s, sys: 2min 46s, total: 7min 30s
Wall time: 7min 36s
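Before uploading the submission, a quick sanity check on the predicted label distribution is cheap insurance; predict also returns the raw logits, so class probabilities can be recovered with a softmax (this step is a sketch, not part of the original submission code):

# Distribution of predicted labels in the submission file.
print(df_sub['label'].value_counts())

# Optional: turn the raw model outputs into class probabilities.
preds, raw_outputs = model.predict(df_test['tweet'].tolist())
probs = np.exp(raw_outputs) / np.exp(raw_outputs).sum(axis=1, keepdims=True)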