I am using Hugging Face RoBERTa to classify a multi-class dataset, but I get the error “Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 15]))”. I am not sure what to do next; could anyone offer some suggestions? My code is below, and the full error message is at the bottom:
from datasets import load_dataset
from transformers import RobertaTokenizerFast, Trainer, DataCollatorWithPadding
# Checkpoint name shared by the tokenizer and the classification model.
checkpoint = "roberta-base"

# Read the CSV at `data_path` as a single split, then split it twice:
# first 80/20, then the 20% portion again 80/20.
dataset = load_dataset("csv", data_files=data_path, split="train")
train_testvalid = dataset.train_test_split(test_size=0.2)
test_valid = train_testvalid["test"].train_test_split(test_size=0.2)

tokenizer = RobertaTokenizerFast.from_pretrained(checkpoint)
def tokenization(example):
    """Tokenize the ``text`` field of ``example`` with the module-level tokenizer.

    The script calls this through ``Dataset.map(..., batched=True)``, so
    ``example`` is a batch of rows. Sequences are truncated to at most 256
    tokens and padded (``padding=True``).

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    # NOTE(review): a DataCollatorWithPadding is also handed to the Trainer in
    # this script, so padding here is presumably redundant -- confirm.
    return tokenizer(example["text"], truncation=True, max_length=256, padding=True)
# NOTE(review): both splits come from `test_valid`, i.e. from the 20% portion of
# the first split; `train_testvalid["train"]` (the 80% portion) is not used in
# the visible script -- confirm this is intended.
train_data = test_valid["train"].map(tokenization, batched=True)
test_data = test_valid["test"].map(tokenization, batched=True)

# Collator handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Drop the CSV index column and the raw text; the remaining columns are what
# the Trainer receives.
# NOTE(review): the traceback shows binary_cross_entropy_with_logits being
# used, and the reply below reports a float-typed label column as the cause --
# presumably the label column needs to be cast to an integer type; confirm.
columns_to_drop = ["Unnamed: 0", "text"]
train_data = train_data.remove_columns(columns_to_drop)
test_data = test_data.remove_columns(columns_to_drop)

for split in (train_data, test_data):
    split.set_format("torch")

# Notebook-style inspection of the remaining columns; no effect in a script.
train_data.column_names
from transformers import TrainingArguments
from transformers import RobertaForSequenceClassification

# All training arguments are left at their defaults; "test-trainer" is the
# output directory.
training_args = TrainingArguments(output_dir="test-trainer")

# Sequence-classification model built from the shared checkpoint, configured
# for 15 labels.
model = RobertaForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=15,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions``; the predicted class is the argmax over the last
            axis of ``predictions``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # NOTE(review): with average='micro' on single-label multi-class data,
    # precision, recall and F1 are expected to all equal accuracy; consider
    # 'macro' or 'weighted' if per-class behaviour matters -- confirm against
    # the scikit-learn documentation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='micro'
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
from transformers import Trainer
# Wire the model, arguments, datasets, collator, tokenizer and metric function
# into a Trainer, then start training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
ValueError Traceback (most recent call last)
<ipython-input-9-3435b262f1ae> in <module>
----> 1 trainer.train()
8 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in binary_cross_entropy_with_logits(input, target, weight, size_average, reduce, reduction, pos_weight)
3146
3147 if not (target.size() == input.size()):
-> 3148 raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
3149
3150 return torch.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction_enum)
ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 15]))
I got the same error. For me, the issue was that my "label" column was of float type. Converting it to int solved the problem: