I can't get trainer accuracy

34 Views Asked by At

I've been using the following code, but it only logs the train loss and the test accuracy/loss to Weights & Biases.

It is not a necessity to get the train accuracy logs in Weights & Biases, but I need to see the train accuracy after each epoch, at least in the command output.

Other metrics like F1, precision, recall would also be good.

I checked these but could not really adapt them into my code: https://discuss.huggingface.co/t/metrics-for-training-set-in-trainer/2461/4 https://discuss.huggingface.co/t/logging-training-accuracy-using-trainer-class/5524

My code is:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate
import numpy as np
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split


import os
# Set PyTorch's CUDA allocator option ``max_split_size_mb`` to 64 through its
# environment variable (presumably to limit GPU memory fragmentation -- confirm).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

class Config:
    """Single home for every setting the fine-tuning script reads."""

    # --- checkpoints and output location ---
    MODEL_CKPT = "dbmdz/bert-base-turkish-uncased"
    TOKENIZER = "dbmdz/bert-base-turkish-uncased"
    MODEL_OUT_DIR = "bert-sentiment-analysis-deneme"

    # --- input data layout ---
    SRC_COLUMN = "Text"
    TGT_COLUMN = "Label"
    MAX_LEN = 512

    # --- label vocabulary (LABEL2ID is the inverse of ID2LABEL) ---
    ID2LABEL = {0: "discrediting", 1: "promoting"}
    LABEL2ID = {label: index for index, label in ID2LABEL.items()}

    # --- runtime ---
    SEED = 0
    DEVICE = "cuda"

    # --- optimisation hyper-parameters ---
    NUM_EPOCHS = 6
    LR = 2e-5
    BATCH_SIZE = 8
    WEIGHT_DECAY = 0.01

    # --- evaluation, checkpointing and logging cadence ---
    EVAL_METRIC = "accuracy"
    EVAL_STRATEGY = "epoch"
    SAVE_STRATEGY = "epoch"
    LOGGING_STRATEGY = "epoch"

    # --- publishing ---
    PUSH_TO_HUB = True

# Module-level settings object; the dataset and trainer classes read their
# parameters from it.
config = Config()

class TextClassificationDataset:
    """Load the train/test Excel sheets and turn them into tokenized datasets.

    The text/label column names, shuffle seed, maximum sequence length and
    tokenizer checkpoint are all read from the module-level ``config`` object.
    """

    def __init__(self, train_data, test_data):
        """Read both Excel files into shuffled data frames.

        Args:
            train_data: Location of the training sheet, passed to
                ``pandas.read_excel``.
            test_data: Location of the test sheet, passed to
                ``pandas.read_excel``.
        """
        self.src_column = config.SRC_COLUMN
        self.tgt_column = config.TGT_COLUMN
        self.seed = config.SEED
        self.max_len = config.MAX_LEN
        self.tokenizer = AutoTokenizer.from_pretrained(config.TOKENIZER)

        # Load your own train and test data from Excel files (rows are shuffled).
        self.train_df = self._load_shuffled(train_data)
        self.test_df = self._load_shuffled(test_data)

    def _load_shuffled(self, path):
        """Read one Excel sheet and return its rows in a seeded random order."""
        frame = pd.read_excel(path)
        return frame.sample(frac=1, random_state=self.seed).reset_index(drop=True)

    def tokenize_function(self, example):
        """Tokenize the text column and attach integer labels.

        Args:
            example: Mapping with the source and target columns. With
                ``batched=True`` each value is a list; a single row (scalar
                label) is accepted as well.

        Returns:
            The tokenizer output with an added ``"labels"`` entry.
        """
        # No padding here: padding inside a batched ``map`` would stretch every
        # row to the longest text of its map batch. The batch-level collator
        # (DataCollatorWithPadding in the trainer) pads each training batch
        # to its own longest sequence instead.
        model_inp = self.tokenizer(
            example[self.src_column],
            truncation=True,
            max_length=self.max_len,
        )
        raw_labels = example[self.tgt_column]
        # Plain Python ints are enough; no tensor round trip is needed.
        if isinstance(raw_labels, (list, tuple)):
            model_inp["labels"] = [int(label) for label in raw_labels]
        else:
            model_inp["labels"] = int(raw_labels)
        return model_inp

    def preprocess_function(self, data):
        """Tokenize a whole dataset, dropping all of its original columns."""
        return data.map(
            self.tokenize_function,
            batched=True,
            remove_columns=data.column_names,
        )

    def gen_classification_dataset(self):
        """Return ``(train, test)`` tokenized datasets built from the data frames."""
        train_data = Dataset.from_pandas(self.train_df)
        test_data = Dataset.from_pandas(self.test_df)
        train_tokenized_data = self.preprocess_function(train_data)
        test_tokenized_data = self.preprocess_function(test_data)
        return train_tokenized_data, test_tokenized_data


# Replace the two paths below with the actual locations of your own train and
# test Excel files.
train_data = '/content/drive/MyDrive/Colab Notebooks/combined_train_nod.xlsx'
test_data = '/content/drive/MyDrive/Colab Notebooks/combined_test_nod.xlsx'

# Create an instance of the TextClassificationDataset class
textclassificationdataset = TextClassificationDataset(train_data, test_data)

# Generate tokenized training and testing datasets
train_tokenized_data, test_tokenized_data = textclassificationdataset.gen_classification_dataset()

# Quick sanity check of the raw (untokenized) data frames.
print("Train Dataset Sample:")
print(textclassificationdataset.train_df.head())
print("\nTrain Dataset Statistics:")
print(textclassificationdataset.train_df.describe())

print("\nTest Dataset Statistics:")
print(textclassificationdataset.test_df.describe())



class TextClassificationModelTrainer:
    """Fine-tune a sequence-classification checkpoint with the Hugging Face ``Trainer``.

    Hyper-parameters, label maps, evaluation metric and output settings are
    read from the module-level ``config`` object.
    """

    def __init__(self, train_data, test_data):
        """Load the model, tokenizer, metric and collator.

        Args:
            train_data: Tokenized dataset used for training.
            test_data: Tokenized dataset used for evaluation.
        """
        self.train_data = train_data
        self.test_data = test_data
        self.model_ckpt = config.MODEL_CKPT
        self.id2label = config.ID2LABEL
        self.label2id = config.LABEL2ID
        self.num_labels = len(self.id2label)
        # Fall back to the CPU when CUDA is requested but not available, so
        # the script does not crash on a machine without a GPU.
        requested_device = config.DEVICE
        if str(requested_device).startswith("cuda") and not torch.cuda.is_available():
            requested_device = "cpu"
        self.device = requested_device
        self.eval_metric = config.EVAL_METRIC
        self.model_out_dir = config.MODEL_OUT_DIR
        self.num_epochs = config.NUM_EPOCHS
        self.lr = config.LR
        self.batch_size = config.BATCH_SIZE
        self.weight_decay = config.WEIGHT_DECAY
        self.eval_strategy = config.EVAL_STRATEGY
        self.save_strategy = config.SAVE_STRATEGY
        self.logging_strategy = config.LOGGING_STRATEGY
        self.push_to_hub = config.PUSH_TO_HUB
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_ckpt,
            id2label=self.id2label,
            label2id=self.label2id,
            num_labels=self.num_labels,
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)
        self.eval_metric_computer = evaluate.load(self.eval_metric)
        self.data_collator = DataCollatorWithPadding(self.tokenizer)

    def _macro_precision_recall_f1(self, predictions, labels):
        """Return macro-averaged precision, recall and F1 over all label ids.

        A class with no predicted (or no true) samples contributes 0.0 for the
        affected score instead of raising a division error.
        """
        precisions, recalls, f1_scores = [], [], []
        for class_id in range(self.num_labels):
            predicted = predictions == class_id
            actual = labels == class_id
            true_positives = int(np.sum(predicted & actual))
            predicted_count = int(np.sum(predicted))
            actual_count = int(np.sum(actual))
            precision = true_positives / predicted_count if predicted_count else 0.0
            recall = true_positives / actual_count if actual_count else 0.0
            denominator = precision + recall
            f1 = 2 * precision * recall / denominator if denominator else 0.0
            precisions.append(precision)
            recalls.append(recall)
            f1_scores.append(f1)
        if not precisions:
            return {}
        return {
            "precision": float(np.mean(precisions)),
            "recall": float(np.mean(recalls)),
            "f1": float(np.mean(f1_scores)),
        }

    def compute_metrics(self, eval_pred):
        """Turn ``(logits, labels)`` into a metrics dictionary.

        Args:
            eval_pred: Pair of model outputs (class scores along axis 1) and
                reference label ids.

        Returns:
            The result of the configured metric, extended with macro-averaged
            ``precision``, ``recall`` and ``f1``. Keys the configured metric
            already produced are never overwritten.
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        labels = np.asarray(labels)
        computed = self.eval_metric_computer.compute(
            predictions=predictions, references=labels
        )
        metrics = dict(computed or {})
        for name, value in self._macro_precision_recall_f1(predictions, labels).items():
            metrics.setdefault(name, value)
        return metrics

    def set_training_args(self):
        """Build the ``TrainingArguments`` from the stored hyper-parameters."""
        # NOTE(review): newer transformers releases appear to rename
        # ``evaluation_strategy`` to ``eval_strategy`` -- confirm against the
        # installed version before changing it.
        return TrainingArguments(
            output_dir=self.model_out_dir,
            num_train_epochs=self.num_epochs,
            learning_rate=self.lr,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            weight_decay=self.weight_decay,
            evaluation_strategy=self.eval_strategy,
            save_strategy=self.save_strategy,
            logging_strategy=self.logging_strategy,
            push_to_hub=self.push_to_hub,
        )

    def model_trainer(self):
        """Create the ``Trainer`` wired to the model, data, collator and metrics."""
        return Trainer(
            model=self.model,
            args=self.set_training_args(),
            data_collator=self.data_collator,
            train_dataset=self.train_data,
            eval_dataset=self.test_data,
            compute_metrics=self.compute_metrics,
        )

    def train_and_save_and_push_to_hub(self):
        """Train the model, then publish it or save it locally.

        The model is pushed to the Hub only when the ``PUSH_TO_HUB`` setting is
        enabled; otherwise the final model is saved through
        ``Trainer.save_model``.
        """
        trainer = self.model_trainer()
        trainer.train()
        if self.push_to_hub:
            trainer.push_to_hub()
        else:
            trainer.save_model()

if __name__ == "__main__":
    # Reuse the datasets already tokenized at module level rather than reading
    # the Excel files and tokenizing everything a second time. This also keeps
    # ``train_data`` / ``test_data`` bound to the file paths.
    text_classification_trainer = TextClassificationModelTrainer(
        train_tokenized_data, test_tokenized_data
    )
    text_classification_trainer.train_and_save_and_push_to_hub()

I would really appreciate it if you can help me with this.

0

There are 0 best solutions below