How to train GPT2 with the Hugging Face Trainer


I am trying to fine-tune GPT2 with Hugging Face's Trainer class.

from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, Trainer, TrainingArguments


class torchDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        self.len = len(encodings)

    def __getitem__(self, index):
        item = {torch.tensor(val[index]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return self.len

    def print(self):
        print(self.encodings)


# HYPER PARAMETERS
EPOCHS = 5
BATCH_SIZE = 2
WARMUP_STEPS = 5000
LEARNING_RATE = 1e-3
DECAY = 0


# Model ids and loading dataset
model_id = 'gpt2'  # small model
# model_id = 'gpt2-medium'  # medium model
# model_id = 'gpt2-large'  # large model

dataset = load_dataset('wikitext', 'wikitext-2-v1')  # first dataset
# dataset = load_dataset('m-newhauser/senator-tweets')  # second dataset
# dataset = load_dataset('IsaacRodgz/Fake-news-latam-omdena')  # third dataset

print('Loaded dataset')

# Dividing dataset into predefined splits
train_dataset = dataset['train']['text']
validation_dataset = dataset['validation']['text']
test_dataset = dataset['test']['text']

print('Divided dataset')

# loading tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(model_id,
                                              # bos_token='<|startoftext|>', eos_token='<|endoftext|>',
                                              pad_token='<|pad|>'
                                              )

print('tokenizer max length:', tokenizer.model_max_length)

train_encoding = tokenizer(train_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
eval_encoding = tokenizer(validation_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')
test_encoding = tokenizer(test_dataset, padding=True, truncation=True, max_length=1024, return_tensors='pt')

print('Converted to torch dataset')

torch_dataset_train = torchDataset(train_encoding)
torch_dataset_eval = torchDataset(eval_encoding)
torch_dataset_test = torchDataset(test_encoding)

# Setup training hyperparameters
training_args = TrainingArguments(
    output_dir='/model_dump/',
    num_train_epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=DECAY
)

model = GPT2LMHeadModel.from_pretrained(model_id)
model.resize_token_embeddings(len(tokenizer))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoding,
    eval_dataset=eval_encoding

)

trainer.train()
# model.save_pretrained('/model_dump/')

But with this code I get this error:

The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,past_key_values,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,encoder_hidden_states,encoder_attention_mask,labels,use_cache,output_attentions,output_hidden_states,return_dict,labels,label,label_ids.
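If I understand the message correctly, the Trainer's default data collator expects each dataset item to be a dict keyed by names from that list, e.g. input_ids, attention_mask and labels. A minimal sketch of one such item, assuming the train_encoding produced above and reusing input_ids as labels (which I believe is the usual setup for causal LM training):

sample = {
    'input_ids': train_encoding['input_ids'][0],
    'attention_mask': train_encoding['attention_mask'][0],
    'labels': train_encoding['input_ids'][0],  # causal LM: labels mirror input_ids
}
print({key: tensor.shape for key, tensor in sample.items()})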

When I use the variables torch_dataset_train and torch_dataset_eval as the Trainer's train_dataset and eval_dataset arguments instead, the error I get is:

TypeError: vars() argument must have __dict__ attribute

This TypeError is the same one I get if I use WikiText2 from torchtext as the dataset. How can I fix this issue?
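For what it's worth, I suspect part of the problem is that __getitem__ in my torchDataset builds a set rather than a dict (there is no key: before torch.tensor(...)), so the collator ends up calling vars() on something without a __dict__. A sketch of what I think the corrected method might look like (not verified; adding a labels key is my assumption, for computing the LM loss):

    def __getitem__(self, index):
        # return a dict so the default collator can read the expected keys
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        # the causal LM loss needs labels; reuse the input ids
        item['labels'] = item['input_ids'].clone()
        return item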
