I've been attempting to "transform" a multiclass classification system that uses ViTForImageClassification into a multilabel one, but I've been running into some problems.
(The original multiclass system which I'm attempting to convert can be found here.)
The folder structure for the dataset is as follows:
/dataset
./class1
./class2
./class3
./class1-class2
./class2-class3
The code I have so far to prepare the dataset is as follows:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import ClassLabel, Dataset, Image

file_names = []
labels = []
all_labels = []

for file in sorted(Path('/content/dataset').glob('*/*.*')):
    folder = str(file).split('/')[-2].split('.')[0]
    label = folder.split('-')
    # Collect every individual class name seen across the (possibly combined) folders.
    for l in label:
        if (l + '.class') not in all_labels:
            all_labels.append(l + '.class')
    labels.append([x + '.class' for x in label])
    file_names.append(str(file))

print(len(file_names), len(labels))

df = pd.DataFrame.from_dict({"image": file_names, "label": labels})

mlb = MultiLabelBinarizer()
mlb_result = mlb.fit_transform([df.loc[i, 'label'] for i in range(len(df))])
df_final = pd.concat([df['image'], pd.DataFrame(mlb_result, columns=list(mlb.classes_))], axis=1)

dataset = Dataset.from_pandas(df_final).cast_column("image", Image())
labels_list = list(set(all_labels))

label2id, id2label = dict(), dict()
for i, label in enumerate(labels_list):
    label2id[label] = i
    id2label[i] = label

ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)
print(ClassLabels)

dataset = dataset.train_test_split(test_size=0.8, shuffle=True)
train_data = dataset['train']
test_data = dataset['test']
from transformers import ViTImageProcessor
from torchvision.transforms import (Compose, Normalize, RandomAdjustSharpness,
                                    RandomRotation, Resize, ToTensor)

model_str = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(model_str)

image_mean, image_std = processor.image_mean, processor.image_std
size = processor.size["height"]

normalize = Normalize(mean=image_mean, std=image_std)

_train_transforms = Compose(
    [
        Resize((size, size)),
        RandomRotation(10),
        RandomAdjustSharpness(2),
        ToTensor(),
        normalize
    ]
)

_val_transforms = Compose(
    [
        Resize((size, size)),
        ToTensor(),
        normalize
    ]
)

def train_transforms(examples):
    examples['pixel_values'] = [_train_transforms(image.convert("RGB")) for image in examples['image']]
    return examples

def val_transforms(examples):
    examples['pixel_values'] = [_val_transforms(image.convert("RGB")) for image in examples['image']]
    return examples

train_data.set_transform(train_transforms)
test_data.set_transform(val_transforms)
The code that I have to prepare the model is:
from transformers import ViTForImageClassification

model = ViTForImageClassification.from_pretrained(model_str, num_labels=len(labels_list), problem_type="multi_label_classification")
model.config.id2label = id2label
model.config.label2id = label2id
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    label_ids = eval_pred.label_ids

    predicted_labels = predictions.argmax(axis=1)
    acc_score = accuracy.compute(predictions=predicted_labels, references=label_ids)['accuracy']

    return {
        "accuracy": acc_score
    }
from transformers import Trainer, TrainingArguments

metric_name = "accuracy"
model_name = "multilabel-classifier"
num_train_epochs = 30

args = TrainingArguments(
    output_dir=model_name,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=num_train_epochs,
    weight_decay=0.02,
    warmup_steps=50,
    remove_unused_columns=False,
    save_strategy='epoch',
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="mlflow"
)
import torch

# Attempting to shape it: pixel_values of shape (batch_size, num_channels, height, width)
# and labels of shape (batch_size, num_labels).
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    temp = []
    for example in examples:
        temp2 = []
        for label in example:
            if label != 'image' and label != 'pixel_values':
                temp2.append(example[label])
        temp.append(temp2)
    print(temp)
    labels = torch.tensor(temp)
    print(labels)
    return {"pixel_values": pixel_values, "labels": labels}
trainer = Trainer(
model,
args,
train_dataset=train_data,
eval_dataset=test_data,
data_collator=collate_fn,
compute_metrics=compute_metrics,
tokenizer=processor,
)
The code I have to train the model is:
trainer.evaluate()
trainer.train()
The current error I'm running into is:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-13-bceb9305a605> in <cell line: 5>()
3 # to assess how well the model is performing on unseen data.
4
----> 5 trainer.evaluate()
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in binary_cross_entropy_with_logits(input, target, weight, size_average, reduce, reduction, pos_weight)
3193 raise ValueError(f"Target size ({target.size()}) must be the same as input size ({input.size()})")
3194
-> 3195 return torch.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction_enum)
3196
3197
RuntimeError: result type Float can't be cast to the desired output type Long
I believe I'm preparing the dataset wrong, as I can infer from here, but I'm not sure how to continue or how to fix what I currently have.
The setup above is already very close to the answer; it just needs a little bit of tweaking.
First of all, we add an id mapping step before creating the train and test split. This lets the system report the class's name instead of just its ID.
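A minimal sketch of that step (it mirrors the mapping already built from all_labels in the question; the dict comprehensions are just a tidier equivalent, and placing it right before train_test_split is the only assumption here):

# Build the label mappings before splitting the dataset, so class names
# (not just indices) are available for the model config later on.
labels_list = list(set(all_labels))
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = {i: label for i, label in enumerate(labels_list)}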
The creation of the model has been changed as well.
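A sketch of what that can look like, passing the mappings straight into from_pretrained (the transformers config kwargs accept id2label and label2id directly), with problem_type="multi_label_classification" so the model's forward pass computes the loss with BCEWithLogitsLoss:

model = ViTForImageClassification.from_pretrained(
    model_str,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)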
Then I changed the collate_fn function (which solves the specific bug described above) by changing the return statement.
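In full, a sketch of the reworked function; the only real change from the question's version is the final cast to float, since BCEWithLogitsLoss expects float targets, which is exactly what the RuntimeError above complains about:

def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    temp = []
    for example in examples:
        temp2 = []
        for key in example:
            if key != 'image' and key != 'pixel_values':
                temp2.append(example[key])
        temp.append(temp2)
    labels = torch.tensor(temp)
    # Cast the 0/1 label vectors to float so they match what BCEWithLogitsLoss expects.
    return {"pixel_values": pixel_values, "labels": labels.float()}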
The last function I changed was compute_metrics. This is important, as the metrics you want to compute for a multiclass model and for a multilabel model differ.
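One possible multi-label variant is sketched below; the 0.5 threshold and the choice of subset accuracy plus micro-averaged F1 are assumptions, not something the original setup prescribes. Argmax only makes sense when exactly one class can be correct, so with several active labels per image each logit has to be thresholded independently instead.

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids.astype(int)

    # Sigmoid each logit and threshold it independently of the other classes.
    probs = 1 / (1 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)

    return {
        "accuracy": accuracy_score(labels, preds),  # exact-match (subset) accuracy
        "f1_micro": f1_score(labels, preds, average="micro"),
    }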