I am trying to fine-tune GPT-2 on the IMDB sentiment dataset for a classification task.
The dataset is a CSV with a text column holding the review and a binary label column. My code is as follows:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import numpy as np
import torch
df = pd.read_csv('data/IMDB.csv')
x = df['text'].tolist()
y = df['label'].tolist()
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
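# GPT-2 has no pad token by default; reuse the existing <|endoftext|> token so the vocabulary size stays the same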
tokenizer.add_special_tokens({'pad_token':'<|endoftext|>'})
tokenizer.padding_side="left"
tokenized_text = [tokenizer.encode(text, truncation=True, padding='max_length', max_length=128) for text in x]
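# tokenized_text is a list of equal-length (128-token) ID lists, so it can be stacked into tensors below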
# Step 3: Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(tokenized_text, y, test_size=0.2, random_state=42)
# Convert tokens to PyTorch tensors
x_train_tensors = torch.tensor(x_train)
y_train_tensors = torch.tensor(y_train)
x_test_tensors = torch.tensor(x_test)
y_test_tensors = torch.tensor(y_test)
# Create DataLoader for training and testing sets
train_dataset = TensorDataset(x_train_tensors, y_train_tensors)
test_dataset = TensorDataset(x_test_tensors, y_test_tensors)
batch_size =8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2) # Assuming binary classification
# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluation on the test set
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, labels in test_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            total += labels.size(0)
            correct += (predictions == labels).sum().item()

    accuracy = correct / total
    print(f"Epoch {epoch + 1}, Test Accuracy: {accuracy:.4f}")
I got the following error at outputs = model(inputs, labels=labels):
AssertionError: Cannot handle batch sizes > 1 if no padding token is defined.
I already defined the padding token on the tokenizer, so I have no idea why this assertion error appears.

The issue was solved by configuring the model's pad token ID.
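Setting the pad token on the tokenizer alone is not enough: GPT2ForSequenceClassification reads model.config.pad_token_id to locate the last non-padding token in each sequence, and that field is still None in the pretrained config, which is what triggers the assertion. A minimal sketch of the fix, applied right after the model is created:

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# Tell the model which token ID marks padding (the reused <|endoftext|> token)
model.config.pad_token_id = tokenizer.pad_token_id

Equivalently, model.config.pad_token_id = model.config.eos_token_id works here, since the tokenizer's pad token was set to the end-of-text token.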