TFT5ForConditionalGeneration generate returns empty output_scores


I'm fine-tuning the TFT5ForConditionalGeneration model ("t5-small"). Before calling model.fit() and saving the model, I set output_scores=True on the model config as well as the relevant parameters in the generation config. After fine-tuning, I load the model and can confirm that both config.output_scores = True and generation_config.output_scores = True on the loaded model.

However, when I generate text, the scores come back empty. How can I fix this? I read the documentation at https://huggingface.co/docs/transformers/internal/generation_utils#transformers.generation.TFSampleEncoderDecoderOutput and https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/text_generation#transformers.TFGenerationMixin, but wasn't able to figure out how to get the scores for the generated text.

I'm using transformers version 4.36.0.

import os
import pandas as pd
import json
import tensorflow as tf
from datasets import Dataset
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from transformers import TFAutoModelForSeq2SeqLM, GenerationConfig, AutoConfig, AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # padding/truncation are call-time options, passed per call below


def tokenize_dataset(dataset):
    tokenized_dataset = dataset.map(
        tokenize, batched=True, num_proc=4, batch_size=16, remove_columns=['x', 'y'],
        fn_kwargs={
            "input_col": 'x', "output_col": 'y', "add_special_tokens": True, "padding": "max_length",
            "truncation": True, "max_input_length": 128, "tokenizer": tokenizer
        }
    )
    return tokenized_dataset


def tokenize(examples, input_col, output_col, add_special_tokens, padding, truncation, max_input_length, tokenizer):
    encodings = tokenizer(
        examples[input_col], add_special_tokens=add_special_tokens, padding=padding,
        truncation=truncation, max_length=max_input_length
    )

    # Tokenize the targets; text_target is the current replacement for the
    # deprecated tokenizer.as_target_tokenizer() context manager
    labels = tokenizer(
        text_target=examples[output_col], add_special_tokens=add_special_tokens, padding=padding,
        truncation=truncation, max_length=max_input_length
    )
    encodings["labels"] = labels["input_ids"]

    return encodings


def save_model(model, model_files_path, model_name):
    model_json = model.to_json()
    if not os.path.exists(model_files_path):
        os.makedirs(model_files_path)
    with open(os.path.join(model_files_path, model_name + ".json"), 'w') as json_file:
        json_file.write(model_json)

transformer_model_path = "<your path>"
model_files_path = "<your path>"
model_name = "model"
model_fn = model_name + ".h5"

checkpoint = ModelCheckpoint(os.path.join(model_files_path, model_fn), monitor="val_loss", verbose=1, save_best_only=True, save_weights_only=True, mode="min")
early_stop = EarlyStopping(monitor='val_loss', patience=4, min_delta=0.005, verbose=1)
callbacks = [early_stop, checkpoint]

train_df = pd.DataFrame({'x': ['Mover, Part-time Afternoon Shift', 'Front Desk Clerk', 'Automotive Technician - $1,000 Sign-On Bonus', 'Administrative Manager', 'RN - Emergency Department', 'Port Operations Manager', 'Human Services Technician', 'Server administration', 'Wit & Wisdom-Food Runner', 'Business Development Executive', 'assistant', 'Technical Recruiter', 'Survey Research Technician III', 'Freelance- Translators/Linguists, English to Swedish (Swedish Native)', 'HR Technology Advisor'] * 4, 'y': ['Mover', 'Front Desk Clerk', 'Automotive Technician', 'Administrative Manager', 'RN - Emergency Department', 'Port Operations Manager', 'Human Services Technician', 'Server administration', 'Food Runner', 'Business Development Executive', 'assistant', 'Technical Recruiter', 'Survey Research Technician III', 'Translators/Linguists, English to Swedish', 'HR Technology Advisor'] * 4})
val_df = pd.DataFrame({'x': ['Specialist, Client Success Management', 'Sales Engineer', 'REMOTE Data Engineer - Must Live in Midwest or East Coast!', 'API Developer', 'Accountant'] * 5, 'y': ['Specialist, Client Success Management', 'Sales Engineer', 'Data Engineer', 'API Developer', 'Accountant'] * 5})


train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

tokenized_train_dataset = tokenize_dataset(train_dataset)
tokenized_val_dataset = tokenize_dataset(val_dataset)

config = AutoConfig.from_pretrained("t5-small")
config.output_scores = True
transformer_model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small", config=config)

generation_config = GenerationConfig(**transformer_model.generation_config.__dict__)
generation_config.max_new_tokens = 10
generation_config.return_dict_in_generate = True
generation_config.output_scores = True

transformer_model.save_pretrained(transformer_model_path)
generation_config.save_pretrained(transformer_model_path)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=transformer_model, return_tensors="tf")
train_dataset = tokenized_train_dataset.to_tf_dataset(batch_size=32, columns=["input_ids", "attention_mask", "labels"], shuffle=True, collate_fn=data_collator)
validation_dataset = tokenized_val_dataset.to_tf_dataset(batch_size=32, columns=["input_ids", "attention_mask", "labels"], shuffle=False, collate_fn=data_collator)

optimizer = tf.optimizers.Adam(learning_rate=2e-5)
transformer_model.compile(optimizer=optimizer, run_eagerly=True)
transformer_model.fit(train_dataset, validation_data=validation_dataset, epochs=2, callbacks=callbacks)

save_model(transformer_model, model_files_path, model_name)

# load model
transformer_model = TFAutoModelForSeq2SeqLM.from_pretrained(transformer_model_path)
transformer_model.load_weights(os.path.join(model_files_path, model_fn))
generation_config = GenerationConfig.from_pretrained(transformer_model_path, "generation_config.json")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

text_lst = ["Mover, Part-time Afternoon Shift", "Front Desk Clerk"]
text_tokenized = tokenizer(text_lst, add_special_tokens=True, padding="max_length", truncation=True, max_length=128, return_tensors="tf")

# text generation
# input_ids = [[283, 1890, 6, 2733, 18, 715, 621, 29, 32, 106, 4804, 89, 17, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
generated_text = transformer_model.generate(text_tokenized.input_ids, generation_config=generation_config, return_dict_in_generate=True, output_scores=True)
print(generated_text.scores)
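
For reference, the linked docs describe scores as a tuple with one (batch_size, vocab_size) logits tensor per generated token. When the tuple is populated, I'd expect to be able to pull per-token log-probabilities out of it with something like the sketch below (my own code, assuming greedy search so that sequences[:, step + 1] lines up with scores[step], position 0 being the decoder start token):

# generated_text.scores: tuple of length <= max_new_tokens,
# each element a (batch_size, vocab_size) logits tensor
for step, step_logits in enumerate(generated_text.scores):
    log_probs = tf.nn.log_softmax(step_logits, axis=-1)
    # token chosen at this step (offset by 1 for the decoder start token)
    token_ids = generated_text.sequences[:, step + 1]
    chosen_log_probs = tf.gather(log_probs, token_ids, batch_dims=1)
    print(f"step {step}: tokens {token_ids.numpy()}, log-probs {chosen_log_probs.numpy()}")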

Edit, Jan 9th 2024

I added more information to the code above, and I now understand why generated_text.scores appeared empty. When you run the code above as-is, generated_text.scores is populated. However, when I wrap transformer_model.generate() in a tf.function so I can use jit_compile=True, the scores do come back empty.

So my question now is: how can I get the scores when using jit_compile=True?

@tf.function(jit_compile=True)
def generate(transformer_model, input_ids, generation_config):
    # https://github.com/huggingface/notebooks/blob/main/examples/translation-tf.ipynb
    generated_text = transformer_model.generate(
        input_ids, generation_config=generation_config, return_dict_in_generate=True, output_scores=True
    )

    return generated_text
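
For completeness, this is how I compare the two paths. The fixed-length padding follows the Hugging Face guidance for XLA generation so the traced shapes stay static; max_length=128 is just my choice:

# Pad to a fixed length so the compiled trace sees static shapes
text_tokenized = tokenizer(
    text_lst, add_special_tokens=True, padding="max_length",
    truncation=True, max_length=128, return_tensors="tf"
)

eager_out = transformer_model.generate(
    text_tokenized.input_ids, generation_config=generation_config,
    return_dict_in_generate=True, output_scores=True
)
xla_out = generate(transformer_model, text_tokenized.input_ids, generation_config)

print(len(eager_out.scores))  # one entry per generated token
print(xla_out.scores)         # comes back empty under jit_compile=True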