"""
Model that forms our submission to the WNUT-2020 Task 2.
Results were achieved running on an AWS EC2 p2 spot instance.
Experiments were run utilising the ExperimentManager script.
"""
import transformers
import torch
import pandas as pd
from torch.utils import data
from sklearn.model_selection import train_test_split
from transformers import (AdamW, BertModel, RobertaModel)
from transformers import BertTokenizer, RobertaTokenizer
from transformers import get_linear_schedule_with_warmup
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import nltk
import numpy as np
from torch import nn
from collections import defaultdict
import string
import re
import csv
import argparse
import gc
import sys
import os
from nltk import word_tokenize
# Stemming
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
ps = nltk.PorterStemmer()
# Stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Set GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# ==============================================================================================================================================
# Functions
def string_to_bool(string):
"""
Converts a string representation of a boolean to a boolean
:param string: A string "True" or "False"
:return: boolean value
"""
if string.lower() == 'false':
return False
elif string.lower() == 'true':
return True
    else:
        raise ValueError("Expected 'True' or 'False', got: " + string)
def pre_process(tweet):
"""
Method for pre_processing tweet text:
* converting "HTTPURL" to "http" and splitting camelcased hashtags.
* parsing for "covid-19" and "coronavirus" and standardizing to "coronavirus"
Regex term for splitting derived from https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
with additional implementation for computation in list comprehension.
"""
tweet = re.sub("HTTPURL", "http", tweet)
tweet = " ".join([" ".join([word for word in re.sub('([A-Z][a-z]+)', r' \1',
re.sub('([A-Z]+)', r' \1', word)).split()]) if word.startswith(
'#') else word for word in tweet.split()])
tweet = re.sub('# ', '#', tweet)
    # standardise the many covid/corona spellings to 'coronavirus', for which there is an embedding
tweet = re.sub('(?i)CORONA$', 'coronavirus', tweet)
tweet = re.sub('(?i)COVID-19', 'coronavirus', tweet)
tweet = re.sub('(?i)COVIDー19', 'coronavirus', tweet)
tweet = re.sub('(?i)COVID', 'coronavirus', tweet)
tweet = re.sub('(?i)COVID19', 'coronavirus', tweet)
tweet = re.sub('(?i)coronavirus', 'coronavirus', tweet)
tweet = re.sub('(?i)coronavirus19', 'coronavirus', tweet)
tweet = re.sub('(?i)coronavirus_19', 'coronavirus', tweet)
tweet = re.sub('(?i)COVID_19', 'coronavirus', tweet)
tweet = re.sub('(?i)coronavirusー19', 'coronavirus', tweet)
return tweet
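# Illustrative example of the transformations above (hypothetical input):
#   pre_process("#StayHome and fight COVID-19 HTTPURL")
#   -> "#Stay Home and fight coronavirus http"
# "HTTPURL" is mapped to "http", camel-cased hashtags are split, and the various
# covid/corona spellings are standardised to "coronavirus".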
def remove_emojis(data):
"""
Function to remove unicode emojis.
:param data: The text data from which to remove emojis.
:return: The data with emojis removed.
"""
emoj = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002500-\U00002BEF" # chinese char
u"\U00002702-\U000027B0"
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
u"\U0001f926-\U0001f937"
u"\U00010000-\U0010ffff"
u"\u2640-\u2642"
u"\u2600-\u2B55"
u"\u200d"
u"\u23cf"
u"\u23e9"
u"\u231a"
u"\ufe0f" # dingbats
u"\u3030"
"]+", re.UNICODE)
return re.sub(emoj, '', data)
def handcrafted_feature(text):
"""
    Function to return the percentage of characters in the text that are numeric.
    Emojis are parsed and removed first to prevent unicode emoji characters
    from influencing the count.
    :param text: The tweet text.
    :return: A (1, 1) tensor of the percentage of characters that are numeric.
"""
text = remove_emojis(text)
text = [char for char in text if char not in string.punctuation]
count = len([char for char in text if char.isdigit()])
    denominator = len(text) - text.count(" ")
    avg = (count / denominator) * 100 if denominator > 0 else 0.0  # guard against empty or whitespace-only text
x = torch.tensor([avg], dtype=torch.float)
y = x.view(1,1)
return y
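# Illustrative example (hypothetical input): for "Cases rose by 200 today" there are
# 3 digits out of 19 non-space characters, so handcrafted_feature returns a (1, 1)
# tensor containing roughly 15.8.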
# ==============================================================================================================================================
# Set up arguments
parser = argparse.ArgumentParser(description='Script that utilises task spooler to run multiple hyperparameter experiments')
parser.add_argument('model_class', type=str, help='model class: BERT or RoBERTa')
parser.add_argument('model', type=str, help='model: e.g. bert-base-uncased')
parser.add_argument('random_seed', type=str)
parser.add_argument('max_len', type=str)
parser.add_argument('epochs', type=str)
parser.add_argument('learning_rate', type=str)
parser.add_argument('batch_size', type=str)
parser.add_argument('dropout_prob', type=str)
parser.add_argument('test_size', type=str)
parser.add_argument('preprocessed', type=str)
parser.add_argument('ensemble', type=str)
parser.add_argument('num_labels', type=str)
parser.add_argument('max_feats', type=str)
parser.add_argument('save_model', type=str)
parser.add_argument('split_train', type=str)
args = parser.parse_args()
config = {"model_class": args.model_class,
"model" : args.model,
"random_seed": int(args.random_seed),
"max_len": int(args.max_len),
"epochs": int(args.epochs),
"learning_rate": float(args.learning_rate),
"batch_size": int(args.batch_size),
"dropout_prob": float(args.dropout_prob),
"test_size": float(args.test_size),
"preprocessed": string_to_bool(args.preprocessed),
"ensemble": string_to_bool(args.ensemble),
"num_labels": int(args.num_labels),
"max_feats": int(args.max_feats),
"save_model": string_to_bool(args.save_model),
"split_train": string_to_bool(args.split_train)}
train_data = None
df_val = None
MODEL = None
RANDOM_SEED = None
MAX_LEN = None
BATCH_SIZE = None
EPOCHS = None
LEARNING_RATE = None
TEST_SIZE = None
# ==============================================================================================================================================
# read in data
try:
train_data = pd.read_csv('./datasets/train.tsv', sep=r'\t', header=0)
assert train_data.shape == (7000, 3) , "train.tsv should contain 7000 rows, only found " + str(train_data.shape[0])
except IOError as e:
sys.stderr.write('Error opening file ' + './datasets/train.tsv' + ' ' + e.strerror + '\n')
raise SystemExit
try:
df_val = pd.read_csv('./datasets/valid.tsv', sep=r'\t', header=0)
assert df_val.shape == (1000, 3) , "valid.tsv should contain 1000 rows, only found " + str(df_val.shape[0])
except IOError as e:
sys.stderr.write('Error opening file ' + './datasets/valid.tsv' + ' ' + e.strerror + '\n')
raise SystemExit
# Apply preprocessing to train text and validation text
if config['preprocessed']:
train_data.Text = train_data.Text.apply(lambda x: pre_process(x))
df_val.Text = df_val.Text.apply(lambda x: pre_process(x))
# Currently the labels in our dataframes are "UNINFORMATIVE" or "INFORMATIVE"
# It is therefore necessary to convert these to binary values for classification
train_data.Label = train_data.Label.apply(lambda x: 0 if x == 'UNINFORMATIVE' else 1)
df_val.Label = df_val.Label.apply(lambda x: 0 if x=='UNINFORMATIVE' else 1)
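# Both dataframes are expected to have the columns Id, Text and Label; after the
# mapping above, Label is 0 for UNINFORMATIVE and 1 for INFORMATIVE.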
# ==============================================================================================================================================
# Configure Model
TRANSFORMERS = {
"BERT" : (BertModel, BertTokenizer),
"RoBERTa": (RobertaModel, RobertaTokenizer)
}
transformer_class, tokenizer_class = TRANSFORMERS.get(config["model_class"])
# Define model
transformer_model = transformer_class.from_pretrained(config["model"])
# Define tokenizer
tokenizer = tokenizer_class.from_pretrained(config["model"])
# Define sequence length, batch size and random state
MAX_LEN = config['max_len']
BATCH_SIZE = config['batch_size']
RANDOM_SEED = config['random_seed']
TEST_SIZE = config['test_size']
ENSEMBLE = config['ensemble']
MAX_FEATS = config['max_feats']
if RANDOM_SEED is not None:
    # Propagate the random seed
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
#transformer_model.config.model_type
# ==============================================================================================================================================
# Datasets and Dataloader
# Note -- this is the same implementation used in BERT_baseline
# Pytorch dataset classes are extensions of the base abstract dataset class
# it is necessary to override the __len__ method (returning the size of the dataset), and
# __getitem__ (which defines how to get the next item in the dataset).
#
# For more info see (https://pytorch.org/tutorials/beginner/data_loading_tutorial.html)
#
# This dataset returns a dictionary of the text, input_ids, attention_mask and label for a tweet.
# It is important to return pytorch tensors, which is the format that the BERT model expects.
#
# Class derived from (https://www.curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/)
class tweet_dataset(data.Dataset):
def __init__(self, id, text, label, tokenizer, max_len):
self.id = id
self.text = text
self.label = label
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.text)
def __getitem__(self, item):
text = str(self.text[item])
        encoding = self.tokenizer.encode_plus(
text,
max_length=self.max_len,
truncation=True,
add_special_tokens=True,
pad_to_max_length=True,
return_attention_mask=True,
return_token_type_ids=False,
            return_tensors='pt' # Return pytorch tensors
)
return {
'text' : text,
'input_ids' : encoding['input_ids'].flatten(), # flatten to get right shape
'attention_mask' : encoding['attention_mask'].flatten(), # flatten to get right shape
'label' : torch.tensor(self.label[item], dtype=torch.long),
'id' : torch.tensor(self.id[item], dtype=torch.long)
}
# Note -- this is the same implementation used in BERT_baseline
# Wrap these datasets into pytorch dataloaders
# Credit for this method goes to (https://www.curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/)
def create_data_loader(df, tokenizer, max_len, batch_size):
#Create instance of dataset
ds = tweet_dataset(
id = df.Id.to_numpy(),
text = df.Text.to_numpy(),
label = df.Label.to_numpy(),
tokenizer = tokenizer,
max_len = max_len
)
return data.DataLoader(
ds,
batch_size=batch_size,
num_workers=4
)
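# Each batch yielded by these loaders is a dict in which 'input_ids' and
# 'attention_mask' have shape [batch_size, max_len], 'label' and 'id' have shape
# [batch_size], and 'text' is a list of the raw tweet strings.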
if config['split_train']: # split training to create a test set
df_train, df_test = train_test_split(train_data, test_size=TEST_SIZE, random_state=RANDOM_SEED)
test_dataloader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
# Create loaders for training, test and val data
train_dataloader = (create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE) if config['split_train']
else create_data_loader(train_data, tokenizer, MAX_LEN, BATCH_SIZE))
val_dataloader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
# ==============================================================================================================================================
# TF-IDF MODEL
# Define a method that will be passed as an analyzer to the TfidfVectorizer
def clean_text(text):
"""
    Method to clean the tweet text: lower-casing, removing stopwords and punctuation, and tokenizing.
    This will be passed to the TfidfVectorizer as an analyzer and applied to each tweet during fitting.
:param text: The text of each tweet
:return: The text tokenized, with punctuation and stopwords removed.
"""
text = text.lower()
text = remove_emojis(text)
text = "".join([char for char in text if not char.isdigit()])
text = "".join([char for char in text if char not in string.punctuation])
tokens = word_tokenize(text)
text = [ps.stem(word) for word in tokens if word not in stopwords]
return text
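# Illustrative example (hypothetical input): clean_text("Running 25 new COVID tests!")
# lower-cases, strips digits and punctuation, tokenizes, removes stopwords and stems,
# giving roughly ['run', 'new', 'covid', 'test'].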
if ENSEMBLE:
# Instantiate TfidfVectorizer object and pass clean_text method which it will automatically apply
tfidf_vect = TfidfVectorizer(analyzer=clean_text, max_features=MAX_FEATS)
# Learning vocabulary on training data
if config['split_train']:
tfidf_vect.fit(df_train.Text)
else:
tfidf_vect.fit(train_data.Text)
# ==============================================================================================================================================
# Model Classes
class RobertaClassificationHead(nn.Module):
def __init__(self, hidden_size, num_labels, num_tfidf_feats, dropout_prob):
super(RobertaClassificationHead, self).__init__()
self.input_size = hidden_size + num_tfidf_feats + 1 #plus 1 to account for avg
self.num_labels = num_labels
self.dropout = nn.Dropout(p=dropout_prob)
self.activation = nn.Tanh()
#self.dense = nn.Linear(hidden_size , hidden_size) # Note - atm hidden_size
        self.dense = nn.Linear(self.input_size, self.input_size)
self.out_proj = nn.Linear(self.input_size , self.num_labels)
# Derived from (https://huggingface.co/transformers/v1.1.0/_modules/pytorch_transformers/modeling_roberta.html)
def forward(self, features, ensemble, tfidf_feats, avg_feats):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
if ensemble:
# Concatenate tfidf_feature vector to RoBERTa <s> token embedding
x = torch.cat((x, tfidf_feats), dim=1)
x = torch.cat((x, avg_feats), dim=1) # adding the extra avg_num features
x = self.dropout(x) # Roberta has extra dropout layer, and experimentation showed this gets better results.
x = self.dense(x)
x = self.activation(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class BertClassificationHead(nn.Module):
def __init__(self, hidden_size, num_labels, num_tfidf_feats, dropout_prob):
super(BertClassificationHead, self).__init__()
self.input_size = hidden_size + num_tfidf_feats + 1 #plus 1 to account for avg
self.num_labels = num_labels
self.activation = nn.Tanh()
self.dense = nn.Linear(self.input_size, self.input_size)
self.dropout = nn.Dropout(p=dropout_prob)
self.out_proj = nn.Linear(self.input_size, self.num_labels)
    # Derived from (https://huggingface.co/transformers/_modules/transformers/modeling_bert.html#BertModel)
def forward(self, features, ensemble, tfidf_feats, avg_feats):
x = features[:, 0] # take [CLS]
if ensemble:
            # Concatenate tf-idf feature vector to the BERT [CLS] token embedding
x = torch.cat((x, tfidf_feats), dim=1)
x = torch.cat((x, avg_feats), dim=1) # adding the extra avg_num features
x = self.dense(x)
x = self.activation(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class Classifier(nn.Module):
def __init__(self, model, num_labels, classification_head):
super().__init__()
self.model = model
self.num_labels = num_labels
self.classification_head = classification_head
def forward(self, input_ids=None, attention_mask=None, labels=None, ensemble=None, tfidf_feats=None, avg_feats=None):
outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
# if self.model.config.model_type == 'bert':
# # 'outputs' is a tuple (sequence_output, pooled_output).
# # The pooled_output is the last hidden-state of the [CLS] token further process by a Linear layer and tanh function
# model_output = outputs[1]
# else:
# # 'outputs' is raw hidden states from RoBERTa
# # Therefore take the sequence output of dimensions (batch_size, max_len, hidden_size)
# model_output = outputs[0]
        # Avoid using the pooled output from BERT (since tf-idf features may be concatenated);
        # for both BERT and RoBERTa use the sequence output,
        # and extract the [CLS] token (or equivalent) in the classification head
model_output = outputs[0]
logits = self.classification_head(model_output, ensemble, tfidf_feats, avg_feats)
outputs = (logits,)
# Following derived from (https://huggingface.co/transformers/_modules/transformers/modeling_bert.html#BertForSequenceClassification)
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs
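# The Classifier forward pass returns a tuple (loss, logits): loss is a scalar
# cross-entropy tensor and logits has shape [batch_size, num_labels], so callers
# below unpack it as `loss, logits = model(...)` or index outputs[1] for the logits.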
# ==============================================================================================================================================
# Create instance of model and classificationHead
hidden_size = transformer_model.config.hidden_size
num_labels = config["num_labels"]
num_tfidf_feats = len(tfidf_vect.get_feature_names()) if config["ensemble"] else 0
dropout_prob = config["dropout_prob"]
# Two different classification heads, which differ by an extra dropout layer
classification_head = (BertClassificationHead(hidden_size, num_labels, num_tfidf_feats, dropout_prob) if config["model_class"] == "BERT"
else RobertaClassificationHead(hidden_size, num_labels, num_tfidf_feats, dropout_prob))
model = Classifier(transformer_model, num_labels, classification_head)
# Move to the GPU
model = model.to(device)
LEARNING_RATE = config["learning_rate"]
EPOCHS = config["epochs"]
WEIGHT_DECAY = 0 # defaults to 0 in AdamW
NUM_WARMUP_STEPS = 0
TOTAL_STEPS = len(train_dataloader) * EPOCHS
if config["model_class"] == "RoBERTa":
WEIGHT_DECAY = 0.1
NUM_WARMUP_STEPS = int(0.06 * TOTAL_STEPS) # 0.06 ratio
# Use a pytorch optimizer to perform updates on parameters.
# This is the Adam algorithm with weight decay fix. This is the same optimizer
# implemented in (https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=8o-VEBobKwHk)
# and the huggingface implementation of transformer models on the GLUE dataset,
# see (https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L109)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False, weight_decay=WEIGHT_DECAY) # The BERT TF implementation uses False.
# Create a scheduler which will update the learning rate.
# This uses the default num_warmup_steps from:
# (https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L392)
schedular = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=NUM_WARMUP_STEPS,
num_training_steps=TOTAL_STEPS
)
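# With a linear schedule the learning rate ramps up from 0 to LEARNING_RATE over
# NUM_WARMUP_STEPS optimizer steps, then decays linearly to 0 over the remaining
# TOTAL_STEPS (for RoBERTa the warmup is 6% of the total steps, as set above).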
# Function to calculate the accuracy of a batch of predictions vs true labels
# Function is derived from (https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=8o-VEBobKwHk)
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten() # index of the max value in each row
labels_flat = labels.flatten()
assert len(pred_flat) == len(labels_flat)
return np.sum(pred_flat == labels_flat) / len(labels_flat)
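# Illustrative example (hypothetical logits):
#   flat_accuracy(np.array([[0.1, 0.9], [0.8, 0.2]]), np.array([1, 1])) gives 0.5,
# since only the first row's argmax matches its label.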
# ==============================================================================================================================================
# Training Loop
# The below code forms the fine-tuning training loop of our model on the new data.
# This code was influenced by (https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=8o-VEBobKwHk).
# With amendments to reflect that our dataset returns a dictionary, and the introduction of a phase loop to reduce visual complexity.
# As in (https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=8o-VEBobKwHk)
# we will store the losses associated with training and validation.
history = defaultdict(list)
# define a dict with our dataloaders
dataloader = {
'train' : train_dataloader,
'val' : val_dataloader
}
# Phase concept derived from (https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html#model-training-and-validation-code)
for epoch in range(0, EPOCHS):
print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
for phase in ['train', 'val']:
if phase == 'train':
print('Running training step...\n')
else:
print('Running validation...\n')
total_loss = 0
total_acc = 0
if phase == 'train':
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
for batch in dataloader[phase]:
#Unpack batch
input_ids = batch.get('input_ids').to(device)
attention_mask = batch.get('attention_mask').to(device)
labels = batch.get('label').to(device)
            # Declare a None placeholder - tf-idf features are only concatenated when ensemble == True
tfidf_feats = None
if ENSEMBLE:
tfidf_feats = tfidf_vect.transform(batch.get('text')) #transform text in batch
#transform to tensor - derived from https://github.com/donglinchen/text_classification/blob/master/model_pytorch.ipynb
tfidf_feats = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_feats)).float().to(device)
            # compute the handcrafted numeric-percentage feature for each text item and reshape to a [batch_size, 1] tensor
avg_feats = torch.stack([handcrafted_feature(text) for text in batch.get('text')])[:, -1, :].to(device)
# Pytorch accumulates the gradients on backward pass so need to set them to zero.
model.zero_grad()
with torch.set_grad_enabled(phase == 'train'):
# Perform a forward pass on this training batch.
# As explained in (https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification),
# because we pass the true labels, the classification loss is returned along with the classification
                # scores before normalization ("logits")
loss, logits = model(input_ids=input_ids,
attention_mask=attention_mask,
labels=labels,
ensemble=ENSEMBLE,
tfidf_feats=tfidf_feats,
avg_feats=avg_feats) # Default is None, but need to declare a var for when ensemble True
# Accumulate the training loss to calculate average loss for this epoch.
# loss.item() extracts the value from the loss tensor as a python float.
total_loss += loss.item()
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = labels.to('cpu').numpy()
# Calculate the accuracy for this batch
total_acc += flat_accuracy(logits, label_ids)
if phase == 'train':
# Perform a backward pass to calculate the gradients.
loss.backward()
# Use gradient clipping to prevent exploding gradients.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters
optimizer.step()
# Update the learning rate.
schedular.step()
avg_acc = total_acc / len(dataloader[phase])
avg_loss = total_loss / len(dataloader[phase])
if phase == 'train':
print("Training step complete. \n")
else:
print('Validation step complete. \n')
print(" Average loss: {0:.2f}".format(avg_loss))
print(" Average accuracy {0:.2f}".format(avg_acc))
print("")
# Record all statistics from this epoch and phase.
history[phase +'_loss'].append(avg_loss)
history[phase +'_acc'].append(avg_acc)
print("\nTraining complete!")
print('\nPerforming Evaluation on Test Set.')
# Put model in evaluation mode
model.eval()
# Tracking variables
predictions , true_labels = [], []
test_dataloader = test_dataloader if config['split_train'] else val_dataloader
for batch in test_dataloader:
#Unpack
input_ids = batch.get('input_ids').to(device)
attention_mask = batch.get('attention_mask').to(device)
labels = batch.get('label').to(device)
tfidf_feats = None
if ENSEMBLE:
tfidf_feats = tfidf_vect.transform(batch.get('text')) #transform text in batch
#transform to tensor - derived from https://github.com/donglinchen/text_classification/blob/master/model_pytorch.ipynb
tfidf_feats = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_feats)).float().to(device)
    # compute the handcrafted numeric-percentage feature for each text item and reshape to a [batch_size, 1] tensor
avg_feats = torch.stack([handcrafted_feature(text) for text in batch.get('text')])[:, -1, :].to(device)
with torch.no_grad():
outputs = model(input_ids,
attention_mask=attention_mask,
labels=labels,
ensemble=ENSEMBLE,
tfidf_feats=tfidf_feats,
avg_feats=avg_feats)
logits = outputs[1]
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = labels.to('cpu').numpy()
# Store predictions and true labels
predictions.append(logits)
true_labels.append(label_ids)
print(' DONE!')
# ==============================================================================================================================================
# Write to CSV
from sklearn.metrics import classification_report
# The predictions for each batch are a 2-column ndarray (one column for "0"
# and one column for "1"), with each row reflecting the weight
# of the prediction for each label. Identify the highest value and convert
# into a list of 0s and 1s.
y_preds = []
for i in range(len(true_labels)):
y_preds.append(np.argmax(predictions[i], axis=1).flatten())
# y_preds stores an array for each batch of predictions;
# to produce a classification report, extract each item in these sublists into a
# flattened array.
flat_true_labels = [item for sublist in true_labels for item in sublist]
flat_predictions = [item for sublist in y_preds for item in sublist]
print(classification_report(flat_true_labels, flat_predictions))
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
def compute_metrics(labels, predictions):
# precision and recall only for INFORMATIVE class
precision = precision_score(labels, predictions, pos_label=1, average='binary')
recall = recall_score(labels, predictions, pos_label=1, average='binary')
# weighted f1-score
f1 = f1_score(labels, predictions, average='weighted')
tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0,1]).ravel()
return {
"precision" : precision,
"recall" : recall,
"f1_score" : f1,
"tp" : tp,
"fp" : fp,
"fn" : fn,
"tn" : tn
}
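# Illustrative example (hypothetical labels/predictions):
#   compute_metrics([1, 0, 1, 1], [1, 0, 0, 1]) gives precision 1.0 and recall ~0.67
# for the INFORMATIVE class, weighted F1 ~0.77, and (tp, fp, fn, tn) = (2, 0, 1, 1).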
#Write out to csv file:
try:
with open('./transformer_model_tests.csv', 'a', newline='') as csv_file:
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
metrics = compute_metrics(flat_true_labels, flat_predictions)
csv_writer.writerow([config['model_class'], config['model'], config['split_train'],
config['preprocessed'], config['ensemble'],config['save_model'],
config['random_seed'],config['max_len'], config['batch_size'],
config['learning_rate'], config['epochs'], config['dropout_prob'],config['max_feats'],
history['train_acc'], history['val_acc'], history['train_loss'],history['val_loss'],
metrics['precision'], metrics['recall'], metrics['f1_score'],
metrics['tp'], metrics['fp'], metrics['fn'], metrics['tn']])
except IOError as e:
    sys.stderr.write('Error opening file transformer_model_tests.csv ' + e.strerror + '\n')
raise SystemExit
# ==============================================================================================================================================
# Save Model
if config['save_model']:
output_dir = './'+config['model']+'_save/'
# Create output directory if needed
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print("Saving model to %s" % output_dir)
    # PyTorch recommends saving models using save/load state_dict
# see (https://pytorch.org/tutorials/beginner/saving_loading_models.html#:~:text=Save%2FLoad%20state_dict%20(Recommended)&text=Saving%20the%20model's%20state_dict%20with,pt%20or%20.)
torch.save(model.state_dict(), output_dir + '_model_state.pt')
torch.save(model, output_dir + 'model.pt')
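# To reload later (illustrative sketch, assuming the same config): rebuild the transformer,
# classification head and Classifier as above, then restore the weights with
#   model.load_state_dict(torch.load(output_dir + '_model_state.pt'))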
# ==============================================================================================================================================
# Memory optimization
del model, optimizer, schedular
gc.collect()
torch.cuda.empty_cache()