I am trying to fine tune GPT2, with Huggingface's trainer class.
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, Trainer, TrainingArguments
class torchDataset(Dataset):
def __init__(self, encodings):
self.encodings = encodings
self.len = len(encodings)
def __getitem__(self, index):
item = {torch.tensor(val[index]) for key, val in self.encodings.items()}
return item
def __len__(self):
return self.len
def print(self):
print(self.encodings)
# HYPER PARAMETERS
EPOCHS = 5
BATCH_SIZE = 2
WARMUP_STEPS = 5000
LEARNING_RATE = 1e-3
DECAY = 0
# Model ids and loading dataset
model_id = 'gpt2' # small model
# model_id = 'gpt2-medium' # medium model
# model_id = 'gpt2-large' # large model
dataset = load_dataset('wikitext', 'wikitext-2-v1') # first dataset
# dataset = load_dataset('m-newhauser/senator-tweets') # second dataset
# dataset = load_dataset('IsaacRodgz/Fake-news-latam-omdena') # third dataset
print('Loaded dataset')
# Dividing dataset into predefined splits
train_dataset = dataset['train']['text']
validation_dataset = dataset['validation']['text']
test_dataset = dataset['test']['text']
print('Divided dataset')
# loading tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(model_id,
# bos_token='<|startoftext|>', eos_token='<|endoftext|>',
pad_token='<|pad|>'
)
print('tokenizer max length:', tokenizer.model_max_length)
train_encoding = tokenizer(train_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
eval_encoding = tokenizer(validation_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
test_encoding = tokenizer(test_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
print('Converted to torch dataset')
torch_dataset_train = torchDataset(train_encoding)
torch_dataset_eval = torchDataset(eval_encoding)
torch_dataset_test = torchDataset(test_encoding)
# Setup training hyperparameters
training_args = TrainingArguments(
output_dir='/model_dump/',
num_train_epochs=EPOCHS,
warmup_steps=WARMUP_STEPS,
learning_rate=LEARNING_RATE,
weight_decay=DECAY
)
model = GPT2LMHeadModel.from_pretrained(model_id)
model.resize_token_embeddings(len(tokenizer))
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_encoding,
eval_dataset=eval_encoding
)
trainer.train()
# model.save_pretrained('/model_dump/')
But with this code I get this error
The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,past_key_values,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,encoder_hidden_states,encoder_attention_mask,labels,use_cache,output_attentions,output_hidden_states,return_dict,labels,label,label_ids.
When I use the variables torch_dataset_train and torch_dataset_eval in Trainer's arguments, the error I get is:
TypeError: vars() argument must have __dict__ attribute
This typeError is the same I get if as dataset I use the WikiText2 from torchtext. How can I fix this issue?