I've been using the following code but it only logs the train loss and test accuracy/loss to Weights & Biaeses.
It is not a neccesity to get the logs train accuracy logs in Weights & Biases; but I need to see it after each epoch at least in the command ouput.
Other metrics like F1, precision, recall would also be good.
I checked these but could not really adapt them into my code: https://discuss.huggingface.co/t/metrics-for-training-set-in-trainer/2461/4 https://discuss.huggingface.co/t/logging-training-accuracy-using-trainer-class/5524
My code is:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate
import numpy as np
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
class Config:
MODEL_CKPT ="dbmdz/bert-base-turkish-uncased"
TOKENIZER = "dbmdz/bert-base-turkish-uncased"
MODEL_OUT_DIR = "bert-sentiment-analysis-deneme"
SRC_COLUMN = "Text"
TGT_COLUMN = "Label"
SEED = 0
MAX_LEN = 512
DEVICE = "cuda"
ID2LABEL = {0: "discrediting", 1: "promoting"}
LABEL2ID = {"discrediting": 0, "promoting": 1}
EVAL_METRIC = "accuracy"
NUM_EPOCHS = 6
LR = 2E-5
BATCH_SIZE = 8
WEIGHT_DECAY = 0.01
EVAL_STRATEGY = "epoch"
SAVE_STRATEGY = "epoch"
LOGGING_STRATEGY = "epoch"
PUSH_TO_HUB = True
config = Config()
class TextClassificationDataset:
def __init__(self, train_data, test_data):
self.src_column = config.SRC_COLUMN
self.tgt_column = config.TGT_COLUMN
self.seed = config.SEED
self.max_len = config.MAX_LEN
self.tokenizer = AutoTokenizer.from_pretrained(config.TOKENIZER)
# Load your own train and test data from Excel files
self.train_df = pd.read_excel(train_data)
self.test_df = pd.read_excel(test_data)
# Shuffle the data if needed
self.train_df = self.train_df.sample(frac=1, random_state=self.seed).reset_index(drop=True)
self.test_df = self.test_df.sample(frac=1, random_state=self.seed).reset_index(drop=True)
def tokenize_function(self, example):
model_inp = self.tokenizer(example[self.src_column], truncation=True, padding=True, max_length=self.max_len)
labels = torch.tensor(example[self.tgt_column], dtype=torch.int)
model_inp["labels"] = labels
return model_inp
def preprocess_function(self, data):
model_inp = data.map(self.tokenize_function, batched=True, remove_columns=data.column_names)
return model_inp
def gen_classification_dataset(self):
train_data = Dataset.from_pandas(self.train_df)
test_data = Dataset.from_pandas(self.test_df)
train_tokenized_data = self.preprocess_function(train_data)
test_tokenized_data = self.preprocess_function(test_data)
return train_tokenized_data, test_tokenized_data
# Replace 'your_train.xlsx' and 'your_test.xlsx' with the actual file paths to your train and test Excel files
train_data = '/content/drive/MyDrive/Colab Notebooks/combined_train_nod.xlsx
test_data = '/content/drive/MyDrive/Colab Notebooks/combined_test_nod.xlsx'
# Create an instance of the TextClassificationDataset class
textclassificationdataset = TextClassificationDataset(train_data, test_data)
# Generate tokenized training and testing datasets
train_tokenized_data, test_tokenized_data = textclassificationdataset.gen_classification_dataset()
print("Train Dataset Sample:")
print(textclassificationdataset.train_df.head())
print("\nTrain Dataset Statistics:")
print(textclassificationdataset.train_df.describe())
print("\Test Dataset Statistics:")
print(textclassificationdataset.test_df.describe())
class TextClassificationModelTrainer:
def __init__(self, train_data, test_data):
self.train_data = train_data
self.test_data = test_data
self.model_ckpt = config.MODEL_CKPT
self.id2label = config.ID2LABEL
self.label2id = config.LABEL2ID
self.num_labels = len(self.id2label)
self.device = config.DEVICE
self.eval_metric = config.EVAL_METRIC
self.model_out_dir = config.MODEL_OUT_DIR
self.num_epochs = config.NUM_EPOCHS
self.lr = config.LR
self.batch_size = config.BATCH_SIZE
self.weight_decay = config.WEIGHT_DECAY
self.eval_strategy = config.EVAL_STRATEGY
self.save_strategy = config.SAVE_STRATEGY
self.logging_strategy = config.LOGGING_STRATEGY
self.push_to_hub = config.PUSH_TO_HUB
self.model = AutoModelForSequenceClassification.from_pretrained(
self.model_ckpt,
id2label=self.id2label,
label2id=self.label2id,
num_labels=self.num_labels
).to(self.device)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)
self.eval_metric_computer = evaluate.load(self.eval_metric)
self.data_collator = DataCollatorWithPadding(self.tokenizer)
def compute_metrics(self, eval_pred):
predictions, labels = eval_pred
predictions = np.argmax(predictions, axis=1)
return self.eval_metric_computer.compute(predictions=predictions, references=labels)
def set_training_args(self):
return TrainingArguments(
output_dir = self.model_out_dir,
num_train_epochs=self.num_epochs,
learning_rate = self.lr,
per_device_train_batch_size = self.batch_size,
per_device_eval_batch_size = self.batch_size,
weight_decay = self.weight_decay,
evaluation_strategy = self.eval_strategy,
save_strategy = self.save_strategy,
logging_strategy = self.logging_strategy,
push_to_hub = self.push_to_hub
)
def model_trainer(self):
return Trainer(
model = self.model,
args = self.set_training_args(),
data_collator = self.data_collator,
train_dataset = self.train_data,
eval_dataset = self.test_data,
compute_metrics = self.compute_metrics
)
def train_and_save_and_push_to_hub(self):
trainer = self.model_trainer()
trainer.train()
trainer.push_to_hub()
if __name__ == "__main__":
text_classification_dataset = TextClassificationDataset(train_data, test_data)
train_data, test_data = text_classification_dataset.gen_classification_dataset()
text_classification_trainer = TextClassificationModelTrainer(train_data, test_data)
text_classification_trainer.train_and_save_and_push_to_hub()
I would really appreciate it if you can help me with this.