I'm fine-tuning the TFT5ForConditionalGeneration model ("t5-small"). Before calling model.fit() and saving the model, I set output_scores=True on the model config as well as the relevant parameters in the generation config. After fine-tuning I load the model, and I can see that in the loaded model both config.output_scores=True and generation_config.output_scores=True.
However, when I generate text, the scores come back empty.
How can I fix this, please? I read the documentation at https://huggingface.co/docs/transformers/internal/generation_utils#transformers.generation.TFSampleEncoderDecoderOutput and https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/text_generation#transformers.TFGenerationMixin, but wasn't able to figure out how to get the scores for the generated text.
I'm using transformers version 4.36.0.
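Based on my reading of those docs, a minimal standalone example (plain "t5-small", no fine-tuning, my own variable names) does expose the scores, so I'd expect the same kind of output from my fine-tuned model:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

ref_tokenizer = AutoTokenizer.from_pretrained("t5-small")
ref_model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
ref_inputs = ref_tokenizer(["Front Desk Clerk"], return_tensors="tf")
ref_out = ref_model.generate(
    ref_inputs.input_ids, max_new_tokens=10, return_dict_in_generate=True, output_scores=True
)
# scores is a tuple with one (batch_size, vocab_size) logits tensor per generated step
print(len(ref_out.scores), ref_out.scores[0].shape)
My full fine-tuning code is below.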
import os
import pandas as pd
import json
import tensorflow as tf
from datasets import Dataset
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from transformers import TFAutoModelForSeq2SeqLM, GenerationConfig, AutoConfig, AutoTokenizer, DataCollatorForSeq2Seq
tokenizer = AutoTokenizer.from_pretrained("t5-small", add_special_tokens=True, pad_to_max_length=True, truncation=True)
def tokenize_dataset(dataset):
    tokenized_dataset = dataset.map(
        tokenize, batched=True, num_proc=4, batch_size=16, remove_columns=['x', 'y'],
        fn_kwargs={
            "input_col": 'x', "output_col": 'y', "add_special_tokens": True, "pad_to_max_length": True,
            "truncation": True, "max_input_length": 128, "tokenizer": tokenizer
        }
    )
    return tokenized_dataset
def tokenize(examples, input_col, output_col, add_special_tokens, pad_to_max_length, truncation, max_input_length, tokenizer):
    encodings = tokenizer(
        examples[input_col], add_special_tokens=add_special_tokens, pad_to_max_length=pad_to_max_length,
        truncation=truncation, max_length=max_input_length
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples[output_col], add_special_tokens=add_special_tokens, pad_to_max_length=pad_to_max_length,
            truncation=truncation, max_length=max_input_length
        )
    encodings["labels"] = labels["input_ids"]
    return encodings
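# Quick sanity check (my own addition, not required for training): make sure tokenize()
# produces input_ids, attention_mask and labels before mapping it over the full dataset.
sample_batch = tokenize(
    {'x': ['Front Desk Clerk'], 'y': ['Front Desk Clerk']}, 'x', 'y', True, True, True, 128, tokenizer
)
print(list(sample_batch.keys()))  # ['input_ids', 'attention_mask', 'labels']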
def save_model(model, model_files_path, model_name):
    model_json = model.to_json()
    if not os.path.exists(model_files_path):
        os.makedirs(model_files_path)
    with open(os.path.join(model_files_path, model_name + ".json"), 'w') as json_file:
        json_file.write(model_json)
transformer_model_path = "<your path>"
model_files_path = "<your path>"
model_name = "model"
model_fn = model_name + ".h5"
checkpoint = ModelCheckpoint(os.path.join(model_files_path, model_fn), monitor="val_loss", verbose=1, save_best_only=True, save_weights_only=True, mode="min")
early_stop = EarlyStopping(monitor='val_loss', patience=4, min_delta=0.005, verbose=1)
callbacks = [early_stop, checkpoint]
train_df = pd.DataFrame({'x': ['Mover, Part-time Afternoon Shift', 'Front Desk Clerk', 'Automotive Technician - $1,000 Sign-On Bonus', 'Administrative Manager', 'RN - Emergency Department', 'Port Operations Manager', 'Human Services Technician', 'Server administration', 'Wit & Wisdom-Food Runner', 'Business Development Executive', 'assistant', 'Technical Recruiter', 'Survey Research Technician III', 'Freelance- Translators/Linguists, English to Swedish (Swedish Native)', 'HR Technology Advisor'] * 4, 'y': ['Mover', 'Front Desk Clerk', 'Automotive Technician', 'Administrative Manager', 'RN - Emergency Department', 'Port Operations Manager', 'Human Services Technician', 'Server administration', 'Food Runner', 'Business Development Executive', 'assistant', 'Technical Recruiter', 'Survey Research Technician III', 'Translators/Linguists, English to Swedish', 'HR Technology Advisor'] * 4})
val_df = pd.DataFrame({'x': ['Specialist, Client Success Management', 'Sales Engineer', 'REMOTE Data Engineer - Must Live in Midwest or East Coast!', 'API Developer', 'Accountant'] * 5, 'y': ['Specialist, Client Success Management', 'Sales Engineer', 'Data Engineer', 'API Developer', 'Accountant'] * 5})
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
tokenized_train_dataset = tokenize_dataset(train_dataset)
tokenized_val_dataset = tokenize_dataset(val_dataset)
config = AutoConfig.from_pretrained("t5-small")
config.output_scores = True
transformer_model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small", config=config)
generation_config = GenerationConfig(**transformer_model.generation_config.__dict__)
generation_config.max_new_tokens = 10
generation_config.return_dict_in_generate = True
generation_config.output_scores = True
transformer_model.save_pretrained(transformer_model_path)
generation_config.save_pretrained(transformer_model_path)
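# Sanity check (my own addition): reload the generation config from disk to confirm that
# output_scores and return_dict_in_generate were actually persisted in generation_config.json.
reloaded_generation_config = GenerationConfig.from_pretrained(transformer_model_path)
print(reloaded_generation_config.output_scores, reloaded_generation_config.return_dict_in_generate)  # True True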
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=transformer_model, return_tensors="tf")
train_dataset = tokenized_train_dataset.to_tf_dataset(batch_size=32, columns=["input_ids", "attention_mask", "labels"], shuffle=True, collate_fn=data_collator)
validation_dataset = tokenized_val_dataset.to_tf_dataset(batch_size=32, columns=["input_ids", "attention_mask", "labels"], shuffle=False, collate_fn=data_collator)
optimizer = tf.optimizers.Adam(learning_rate=2e-5)
transformer_model.compile(optimizer=optimizer, run_eagerly=True)
transformer_model.fit(train_dataset, validation_data=validation_dataset, epochs=2, callbacks=callbacks)
save_model(transformer_model, model_files_path, model_name)
# load model
transformer_model = TFAutoModelForSeq2SeqLM.from_pretrained(transformer_model_path)
transformer_model.load_weights(os.path.join(model_files_path, model_fn))
generation_config = GenerationConfig.from_pretrained(transformer_model_path, "generation_config.json")
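# Another check (my own addition): the reloaded model config and generation config still ask for scores.
print(transformer_model.config.output_scores)   # True
print(generation_config.output_scores)          # True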
tokenizer = AutoTokenizer.from_pretrained("t5-small")
text_lst = ["Mover, Part-time Afternoon Shift", "Front Desk Clerk"]
text_tokenized = tokenizer(text_lst, add_special_tokens=True, pad_to_max_length=True, truncation=True, max_length=128, return_tensors="tf")
# text generation
# input_ids = [[283, 1890, 6, 2733, 18, 715, 621, 29, 32, 106, 4804, 89, 17, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
generated_text = transformer_model.generate(text_tokenized.input_ids, generation_config=generation_config, return_dict_in_generate=True, output_scores=True)
print(generated_text.scores)
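When the scores do come back (they do when I run the code above eagerly, see the edit below), this is how I'm turning them into log-probabilities of the chosen tokens. It's my own post-processing sketch and assumes greedy decoding, where sequences starts with T5's decoder_start_token_id followed by one token per entry in scores:
import tensorflow as tf

sequences = generated_text.sequences                # (batch_size, 1 + num_generated_steps)
scores = generated_text.scores                      # tuple of (batch_size, vocab_size) logits, one per step
step_log_probs = []
for step, step_logits in enumerate(scores):
    log_probs = tf.nn.log_softmax(step_logits, axis=-1)
    chosen_tokens = sequences[:, step + 1]          # token generated at this step (skip decoder_start_token_id)
    step_log_probs.append(tf.gather(log_probs, chosen_tokens, axis=-1, batch_dims=1))
token_log_probs = tf.stack(step_log_probs, axis=1)  # (batch_size, num_generated_steps)
print(token_log_probs)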
Edit Jan 9th 2024
I added more information to the code above, and I now understand why generated_text.scores was empty. When you run the code above as is, generated_text.scores is not empty. However, when I wrap transformer_model.generate() in a function so that I can use jit_compile=True, the scores come back empty.
So now my question is: how can I get the scores when using jit_compile=True? (The only workaround I've found so far is sketched after the snippet below.)
@tf.function(jit_compile=True)
def generate(transformer_model, input_ids, generation_config):
    # https://github.com/huggingface/notebooks/blob/main/examples/translation-tf.ipynb
    generated_text = transformer_model.generate(
        input_ids, generation_config=generation_config, return_dict_in_generate=True, output_scores=True
    )
    return generated_text
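The only workaround I've come up with so far (my own idea, probably not the intended solution) is to let the jit-compiled generate() return just the sequences, which it still does in my runs, and then recover the per-step logits with one extra teacher-forced forward pass outside the compiled function. It assumes greedy decoding, so the logits at position i correspond to the token generated at step i:
# Workaround sketch: generate under XLA, then recompute the per-step logits from the
# generated sequences with a single teacher-forced forward pass (eager, outside the tf.function).
generated = generate(transformer_model, text_tokenized.input_ids, generation_config)
sequences = generated.sequences                      # (batch_size, 1 + num_generated_steps)

outputs = transformer_model(
    input_ids=text_tokenized.input_ids,
    attention_mask=text_tokenized.attention_mask,
    decoder_input_ids=sequences[:, :-1],             # everything except the last generated token
    training=False,
)
step_logits = outputs.logits                         # (batch_size, num_generated_steps, vocab_size)
print(step_logits.shape)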