I'm trying to fine-tune whisper-medium for the Korean language.
Here is the tutorial that I followed.
And here is my experiment setup:
python==3.9.16
transformers==4.27.4
tokenizers==0.13.3
torch==2.0.0
torchaudio==2.0.0
torchmetrics==0.11.4
torchvision==0.15.0
Here is the preprocessing code for the data:
from transformers import WhisperFeatureExtractor, WhisperTokenizer
from datasets import load_dataset, Audio


def prepare_dataset(batch, tokenizer, feature_extractor):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch


def main():
    dataset = load_dataset("Bingsu/zeroth-korean")
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

    feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")
    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="ko", task="transcribe")

    mapped_dataset = dataset.map(
        lambda x: prepare_dataset(x, tokenizer, feature_extractor),
        remove_columns=dataset.column_names["train"],
        num_proc=16,
    )
    mapped_dataset.save_to_disk('./data/Bingsu_zeroth-korean', num_proc=4)


if __name__ == '__main__':
    main()
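For reference, this is the kind of quick sanity check I run on the processed data (a small sketch; the paths and column names match the script above):

from datasets import load_from_disk
from transformers import WhisperTokenizer

# quick sanity check on one processed example (run after the preprocessing script above)
dataset = load_from_disk('./data/Bingsu_zeroth-korean')
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="ko", task="transcribe")

sample = dataset["train"][0]
# input_features should be an 80 x 3000 log-Mel matrix
print(len(sample["input_features"]), len(sample["input_features"][0]))
# decoding the labels should give back the Korean transcript, wrapped in the special tokens
print(tokenizer.decode(sample["labels"], skip_special_tokens=False))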
And here is the training code:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_from_disk
from utils import compute_metrics, DataCollatorSpeechSeq2SeqWithPadding
import os
import argparse
import evaluate


def define_argparser():
    """Function to define the command line arguments

    Returns:
        argparse.Namespace: Command line arguments
    """
    p = argparse.ArgumentParser()

    p.add_argument('--data_path', type=str, default='./data/Bingsu_zeroth-korean/')
    p.add_argument('--model_address', type=str, default='openai/whisper-medium')
    p.add_argument('--model_save_path', type=str, default='./models_zoo/openai_whisper-medium/')
    p.add_argument('--gradient_accumulation_steps', type=int, default=1)
    p.add_argument('--batch_size_per_device', type=int, default=24)
    # p.add_argument('--n_epochs', type=int, default=5)
    p.add_argument('--total_step', type=int, default=500)
    p.add_argument('--warmup_ratio', type=float, default=.2)
    p.add_argument('--max_length', type=int, default=225)  # used below for generation_max_length

    config = p.parse_args()
    return config
def main(config):
    """Main function to train the language model

    Args:
        config (argparse.Namespace): Command line arguments
    """
    dataset = load_from_disk(config.data_path)

    processor = WhisperProcessor.from_pretrained(config.model_address, language="ko", task="transcribe")
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

    model = WhisperForConditionalGeneration.from_pretrained(config.model_address)
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []

    metric = evaluate.load("wer")

    print(
        '|train| =', len(dataset['train']),
        '|valid| =', len(dataset['test']),
    )

    # total_batch_size = config.batch_size_per_device * torch.cuda.device_count()
    # n_total_iterations = int(len(dataset['train']) / total_batch_size * config.n_epochs)
    n_total_iterations = config.total_step
    n_warmup_steps = int(n_total_iterations * config.warmup_ratio)
    print(
        '#total_iters =', n_total_iterations,
        '#warmup_iters =', n_warmup_steps,
    )

    training_args = Seq2SeqTrainingArguments(
        output_dir=os.path.join(config.model_save_path, 'checkpoints'),
        # num_train_epochs=config.n_epochs,
        max_steps=n_total_iterations,
        per_device_train_batch_size=config.batch_size_per_device,
        per_device_eval_batch_size=config.batch_size_per_device,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        warmup_steps=n_warmup_steps,
        fp16=True,
        learning_rate=5e-6,
        gradient_checkpointing=True,
        evaluation_strategy='steps',
        save_strategy='steps',
        report_to=["tensorboard"],
        logging_steps=25,
        save_steps=n_total_iterations // 5,
        eval_steps=n_total_iterations // 5,
        predict_with_generate=True,
        generation_max_length=config.max_length,
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        dataloader_num_workers=16,
    )

    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=data_collator,
        compute_metrics=lambda x: compute_metrics(x, metric, processor.tokenizer),
        tokenizer=processor.feature_extractor,
    )

    trainer.train()
    trainer.model.save_pretrained(os.path.join(config.model_save_path, 'model_weights'))
    # tokenizer.save_pretrained(os.path.join(config.model_save_path, 'tokenizer'))


if __name__ == '__main__':
    config = define_argparser()
    main(config)
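utils.py is not shown above; it follows the data collator and WER computation from the tutorial. A minimal sketch of what it contains (the exact code may differ slightly):

import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # pad the log-Mel input features into a batched tensor
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # pad the tokenized label sequences separately
        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 so it is ignored by the loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # drop the BOS token if it was prepended during tokenization
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


def compute_metrics(pred, metric, tokenizer):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # restore padding tokens before decoding the references
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}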
During training, the training loss and the evaluation loss go down, but the WER stays at 100.
And after training, the model always predicts <|startoftranscript|><|endoftext|> for all samples.
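For example, this is roughly how I inspect the predictions after training (a sketch, using the save path from the training script above):

import torch
from datasets import load_from_disk
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# load the fine-tuned weights saved by the training script
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", language="ko", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained('./models_zoo/openai_whisper-medium/model_weights').eval()

# run generation on one validation example
dataset = load_from_disk('./data/Bingsu_zeroth-korean')
input_features = torch.tensor(dataset["test"][0]["input_features"]).unsqueeze(0)
with torch.no_grad():
    pred_ids = model.generate(input_features)

# this prints only '<|startoftranscript|><|endoftext|>' instead of a transcript
print(processor.batch_decode(pred_ids, skip_special_tokens=False))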
How should I fix this?