%%capture
# %%capture is an IPython cell magic: it suppresses everything this cell would
# print in the notebook (pip logs and the message below). Keep it on the first line.

import os
import sys
# True when the `google.colab` module is already loaded in this interpreter,
# which is used here as the signal that the notebook runs on Google Colaboratory.
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # Install the extra packages this notebook needs
    # (a leading `!` runs the line as a shell command).
    !pip install simpletransformers
    !pip install watermark

    # Report the detected environment.
    # NOTE(review): with %%capture active this message is not displayed.
    print('Environment: Google Colaboratory.')

# NOTE: If packages are installed or upgraded in Colab, the runtime must be
# restarted before the new versions are picked up.


import numpy as np
import pandas as pd
from pprint import pprint
import gc

# Show long text cells and many columns when DataFrames are displayed.
# The fully qualified option names are used on purpose: short names such as
# 'max_columns' are resolved by pattern matching and raise OptionError once
# more than one option matches (e.g. `styler.render.max_columns` in newer pandas).
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 200)

# Random seed used for the train/test split so the split is reproducible.
SEED = 100

import sys  # NOTE: duplicate of the earlier `import sys`; harmless
import re
from tqdm import tqdm
# Hook tqdm into pandas (per the tqdm docs this adds the `progress_apply`
# family of methods to pandas objects).
tqdm.pandas()

# Record the versions of the core data libraries used by this notebook.
versions_ds = [(lib.__name__, lib.__version__) for lib in (np, pd)]
pprint(versions_ds)

[('numpy', '1.18.5'), ('pandas', '1.1.3')]


# Base URL of the GitHub folder that holds the Consumer Complaint Database files.
p = 'https://github.com/bhishanpdl/Datasets/blob/master/Projects/Consumer_Complaint_Database/'
# `?raw=true` asks GitHub for the file itself instead of its HTML page;
# pandas decompresses the zip archive while reading.
df = pd.read_csv(p + 'complaints_2019_clean.csv.zip?raw=true',compression='zip')

print(f"df shape : {df.shape}")
# Preview the first two and last two rows. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
display(pd.concat([df.head(2), df.tail(2)]))

df shape : (124907, 10)


# Keep only the free-text complaint and its product category.
# .copy() makes this an independent frame, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
df = df[['complaint','product']].copy()
# Replace the product names by their integer category codes.
df['product'] = df['product'].astype('category').cat.codes

# Rename for simpletransformers: text first, label second.
# NOTE(review): the training/eval logs still warn "Dataframe headers not
# specified. Falling back to using column 0 as text and column 1 as labels",
# so the library presumably expects the header 'labels' rather than 'label'.
# Until that is confirmed and the later cells are renamed together, the
# column ORDER here is what the model actually relies on.
df.columns = ['text','label']

df.head(2)


# Number of distinct class ids in the label column; later passed to the
# classifier as `num_labels`.
num_labels = df['label'].nunique()

# Notebook sanity check: show the class count next to the sorted class ids.
num_labels, sorted(df['label'].unique())

(9, [0, 1, 2, 3, 4, 5, 6, 7, 8])


from sklearn.model_selection import train_test_split

# Column with the class ids; the split is stratified on it.
target = 'label'

# Hold out 20% of the rows as the test set, stratified on the target column
# and seeded with SEED so the split is repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df[target],
)

# Report the shapes of the full, training and test frames
# (one call; the emitted text is the same as three consecutive prints).
print(
    f"df             : {df.shape}",
    f"\ndf_train : {df_train.shape}",
    f"\ndf_test       : {df_test.shape}",
    sep="\n",
)

df_train.head(2)

df             : (124907, 2)

df_train : (99925, 2)

df_test       : (24982, 2)


from simpletransformers.classification import ClassificationModel

wandb: WARNING W&B installed but not logged in.  Run `wandb login` or set the WANDB_API_KEY env variable.


# Shell command: delete the `outputs` directory (the training output location
# configured below) so a previous run's files are not reused.
!rm -rf outputs


# Architecture and pretrained checkpoint to fine-tune.
model_type, model_name = 'xlnet', 'xlnet-base-cased'

# Options handed to the simpletransformers model via `args=`.
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    use_cached_eval_features=True,

    # paths
    output_dir=f"outputs/{model_type}",
    best_model_dir=f"outputs/{model_type}/best_model",

    # size
    train_batch_size=128,
    max_seq_length=128,  # use small value to avoid OOM
    num_train_epochs=1,

    # rates (not set here; uncomment to override)
    # weight_decay=0,
    # learning_rate=4e-5,
    # adam_epsilon=1e-8,

    # evaluation
    evaluate_during_training=False,
    evaluate_during_training_steps=1000,
    save_model_every_epoch=False,
    save_eval_checkpoints=False,
    eval_batch_size=64,
    gradient_accumulation_steps=1,
)

# XLNet override: half the per-step batch with two accumulation steps,
# i.e. 64 * 2 = 128 samples per optimizer update, same as configured above.
if model_type == "xlnet":
    train_args.update(train_batch_size=64, gradient_accumulation_steps=2)


%%time

model = ClassificationModel(model_type, model_name, args=train_args,num_labels=num_labels)
model.train_model(df_train, eval_df=None)

test_preds, _, = model.predict(df_test['text'].to_numpy())

/usr/local/lib/python3.6/dist-packages/transformers/configuration_xlnet.py:212: FutureWarning: This config doesn't use attention memories, a core feature of XLNet. Consider setting `mem_len` to a non-zero value, for example `xlnet = XLNetLMHeadModel.from_pretrained('xlnet-base-cased'', mem_len=1024)`, for accurate training performance as well as an order of magnitude faster inference. Starting from version 3.5.0, the default parameter will be 1024, following the implementation in https://arxiv.org/abs/1906.08237
  FutureWarning,

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

/usr/local/lib/python3.6/dist-packages/simpletransformers/classification/classification_model.py:353: UserWarning: Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels.
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."

/usr/local/lib/python3.6/dist-packages/transformers/modeling_xlnet.py:298: UserWarning: Mixed memory format inputs detected while calling the operator. The operator will output contiguous tensor even if some of the inputs are in channels_last format. (Triggered internally at  /pytorch/aten/src/ATen/native/TensorIterator.cpp:918.)
  attn_score = (ac + bd + ef) * self.scale
/usr/local/lib/python3.6/dist-packages/torch/optim/lr_scheduler.py:123: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`.  Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
  "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)


# Ground-truth class ids of the test split as a flat NumPy array.
ytest = df_test['label'].to_numpy().flatten()


from sklearn import metrics


# Accuracy of the predicted class ids against the ground truth.
metrics.accuracy_score(ytest, test_preds)

0.8521335361460252


from sklearn.metrics import f1_score, accuracy_score


def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of `preds` measured against `labels`."""
    micro_f1 = f1_score(y_true=labels, y_pred=preds, average='micro')
    return micro_f1

# Evaluate on the held-out split. The extra metrics are passed as keyword
# arguments (name=callable taking labels and predictions); they show up in
# `result` under those names next to the metrics the library reports itself
# (eval_loss, mcc in the run recorded below).
result, model_outputs, wrong_predictions = model.eval_model(df_test,
                                                            f1=f1_multiclass,
                                                            acc=accuracy_score)

/usr/local/lib/python3.6/dist-packages/simpletransformers/classification/classification_model.py:851: UserWarning: Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels.
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


result

{'acc': 0.8521335361460252,
 'eval_loss': 0.44023678308862557,
 'f1': 0.8521335361460252,
 'mcc': 0.7950124243416499}

	product	complaint	complaint_lst_clean	complaint_clean	total_length	num_words	num_sent	num_unique_words	avg_word_len	avg_unique
0	Debt collection	transworld systems inc. \nis trying to collect a debt that is not mine, not owed and is inaccurate.	['transworld', 'system', 'inc', 'trying', 'collect', 'debt', 'mine', 'owed', 'inaccurate']	transworld system inc trying collect debt mine owed inaccurate	98	18	2	15	4.444444	0.833333
1	Debt collection	Over the past 2 weeks, I have been receiving excessive amounts of telephone calls from the company listed in this complaint. The calls occur between XXXX XXXX and XXXX XXXX to my cell and at my jo...	['past', 'week', 'receiving', 'excessive', 'amount', 'telephone', 'call', 'company', 'listed', 'complaint', 'call', 'occur', 'cell', 'job', 'company', 'right', 'harass', 'work', 'want', 'stop', 'e...	past week receiving excessive amount telephone call company listed complaint call occur cell job company right harass work want stop extremely distracting told time day call collection agency work	395	78	1	54	4.076923	0.692308
124905	Mortgage	Every 6 months or so, every since loan was sold to XXXX, and serviced by first XXXX ( from XXXX XXXX to XX/XX/XXXX ), and then transferred to Fay Servicing in XX/XX/XXXX until present, we are thre...	['every', 'month', 'every', 'since', 'loan', 'sold', 'serviced', 'first', 'transferred', 'fay', 'servicing', 'present', 'threatened', 'foreclosure', 'proceeding', 'told', 'missed', 'several', 'mon...	every month every since loan sold serviced first transferred fay servicing present threatened foreclosure proceeding told missed several month payment providing proof bank statement socalled misse...	4300	797	1	330	4.395232	0.414053
124906	Debt collection	I had a unwritten contract with XXXX XXXX XXXX XXXX ( XXXX ) in XXXX for a property I rented in with several other tenants and my name was on the utilities. When I moved away from this rental I fo...	['unwritten', 'contract', 'property', 'rented', 'several', 'tenant', 'name', 'utility', 'moved', 'away', 'rental', 'found', 'budget', 'billing', 'program', 'tenant', 'owed', 'money', 'paying', 'du...	unwritten contract property rented several tenant name utility moved away rental found budget billing program tenant owed money paying due billing u le usage tenant left bill consolidated debt pay...	5827	1131	1	365	4.152078	0.322723

	text	label
87741	I am a victim of identity theft and I have submitted a notarized affidavit which serves as a legal document to my identity theft. I am also a victim of the XXXX breach and opted out of the settlem...	2
54623	I have sent a request to XXXX to get proof on how the account was verified as a valid account and never received the information on how it was verified.I have called XXXX XXXX Which is NCA and lef...	2

Introduction

Colab

Load the libraries

Data Processing for Simpletransformers

Train test split

Modelling: Simpletransformer

Model Evaluation