Following recent blog posts on fine-tuning LLMs such as Llama-7B or Mistral-7B, I created my own tuning script with small adaptations to fine-tune an LLM for a specific downstream task.
I use PEFT, specifically LoRA, to fine-tune 7B-parameter LLMs on a task-specific dataset. I run the fine-tuning experiments on a machine equipped with an NVIDIA A100 GPU (40 GB of memory). Below are my code and the corresponding configuration for LoRA, BitsAndBytes, and the trainer arguments.
Code:
import argparse
import os

import torch
import yaml
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# find_all_linear_names, load_model and preprocess_dataset are my own helpers;
# simplified sketches of them are shown below the script.

def train(model, tokenizer, dataset, lora_config, train_config, output_dir):
    # disable the KV cache and enable gradient checkpointing to reduce memory usage during fine-tuning
    model.config.use_cache = False
    model.gradient_checkpointing_enable()
    # prepare the quantized model for k-bit training with PEFT
    model = prepare_model_for_kbit_training(model)
    # get the names of the linear modules to target with LoRA
    modules = find_all_linear_names(model)
    # create a PEFT config for these modules and wrap the model with PEFT
    peft_config = LoraConfig(
        lora_alpha=lora_config["lora_alpha"],
        lora_dropout=lora_config["lora_dropout"],
        r=lora_config["r"],
        bias=lora_config["bias"],
        target_modules=modules,
        task_type=lora_config["task_type"]
    )
    model = get_peft_model(model, peft_config)
    # print information about the percentage of trainable parameters
    trainable, total = model.get_nb_trainable_parameters()
    print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")
    # Training parameters
    train_args = TrainingArguments(**train_config)
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=train_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
    # Launch training
    print("Fine-tuning...")
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)
    # re-enable the KV cache for inference to speed up predictions for similar inputs
    model.config.use_cache = True
    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)
    # Free memory for merging weights
    del model
    del trainer
    torch.cuda.empty_cache()
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file", default="../data/config/default_config.yaml")
    args = parser.parse_args()
    with open(args.config_file, "r", encoding="utf-8") as yaml_file:
        config = yaml.safe_load(yaml_file)
    model_name = config["model_name"]
    output_dir = config["output_dir"]
    data_file = config["data_file"]
    bnb_config = config["bnb"]
    lora_config = config["lora"]
    train_config = config["train"]
    dataset = load_dataset("json", data_files=data_file, split="train")
    print(f'Number of prompts: {len(dataset)}')
    print(f'Column names are: {dataset.column_names}')
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=bnb_config["load_in_4bit"],
        bnb_4bit_use_double_quant=bnb_config["bnb_4bit_use_double_quant"],
        bnb_4bit_quant_type=bnb_config["bnb_4bit_quant_type"],
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model, tokenizer = load_model(model_name, bnb_config)
    # max_length = get_max_length(model)
    max_length = 8128
    dataset = preprocess_dataset(tokenizer, max_length, dataset)
    train(
        model=model,
        tokenizer=tokenizer,
        lora_config=lora_config,
        train_config=train_config,
        dataset=dataset,
        output_dir=output_dir
    )
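For reference, here are simplified sketches of the helper functions used above (find_all_linear_names, load_model, preprocess_dataset). They follow the usual QLoRA fine-tuning examples and are only meant to show what each helper does, not the exact code I run:
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM, AutoTokenizer

def find_all_linear_names(model):
    # collect the names of all 4-bit linear layers to use as LoRA target modules
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            lora_module_names.add(name.split(".")[-1])
    lora_module_names.discard("lm_head")  # never adapt the output head
    return list(lora_module_names)

def load_model(model_name, bnb_config):
    # load the 4-bit quantized base model and its tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # Mistral has no pad token by default
    return model, tokenizer

def preprocess_dataset(tokenizer, max_length, dataset):
    # build a prompt from the system/instruction/output columns and tokenize it
    print("Preprocessing dataset...")

    def tokenize(sample):
        prompt = f"{sample['system']}\n{sample['instruction']}\n{sample['output']}"
        return tokenizer(prompt, truncation=True, max_length=max_length)

    return dataset.map(tokenize, remove_columns=dataset.column_names)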
Configuration:
data_file: "../data/post_data.json"
output_dir: "../data/results/mistral-instruct/final_checkpoint"
model_name: "mistralai/Mistral-7B-Instruct-v0.2"
bnb:
  load_in_4bit: True
  bnb_4bit_use_double_quant: True
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_compute_dtype: torch.bfloat16
lora:
  lora_alpha: 16
  lora_dropout: 0.1
  r: 8
  bias: "none"
  task_type: "CAUSAL_LM"
train:
  output_dir: "../data/saved_models/mistral-instruct"
  num_train_epochs: 1
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  gradient_checkpointing: True
  optim: "paged_adamw_8bit"
  logging_steps: 1
  logging_strategy: "steps"
  save_strategy: "steps"
  save_steps: 10
  learning_rate: 0.0002
  max_steps: 40
  fp16: True
  max_grad_norm: 1.0
  warmup_ratio: 0.03
  lr_scheduler_type: "constant"
  report_to: "mlflow"
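One remark on the configuration: yaml.safe_load reads bnb_4bit_compute_dtype as the plain string "torch.bfloat16", which is why the script hardcodes torch.bfloat16 when building the BitsAndBytesConfig. If I wanted to take the dtype from the YAML instead, a small helper along these lines would do it (hypothetical, not part of the failing run):
import torch

def resolve_dtype(name):
    # map a YAML string such as "torch.bfloat16" to the corresponding torch dtype
    dtype = getattr(torch, name.split(".")[-1])
    assert isinstance(dtype, torch.dtype), f"not a torch dtype: {name}"
    return dtype

# e.g. bnb_4bit_compute_dtype=resolve_dtype(bnb_config["bnb_4bit_compute_dtype"])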
While fine-tuning my LLM, I run into the following error partway through training (at step 18 of 40):
Number of prompts: 836
Column names are: ['instruction', 'system', 'output']
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00, 1.44s/it]
Preprocessing dataset...
Trainable: 20971520 | total: 7262703616 | Percentage: 0.2888%
Fine-tuning...
0%| | 0/40 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
{'loss': 1.4845, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 1.1674, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 0.9353, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 1.0586, 'learning_rate': 0.0002, 'epoch': 0.02}
{'loss': 1.1123, 'learning_rate': 0.0002, 'epoch': 0.02}
{'loss': 0.9471, 'learning_rate': 0.0002, 'epoch': 0.03}
{'loss': 0.9835, 'learning_rate': 0.0002, 'epoch': 0.03}
{'loss': 0.7003, 'learning_rate': 0.0002, 'epoch': 0.04}
{'loss': 0.8453, 'learning_rate': 0.0002, 'epoch': 0.04}
{'loss': 0.6728, 'learning_rate': 0.0002, 'epoch': 0.05}
25%|██████████████████████████████████ | 10/40 [01:22<04:38, 9.27s/it]Checkpoint destination directory ../data/saved_models/mistral-instruct/checkpoint-10 already exists and is non-empty.Saving will proceed but saved results may be invalid.
{'loss': 0.6997, 'learning_rate': 0.0002, 'epoch': 0.05}
{'loss': 0.7768, 'learning_rate': 0.0002, 'epoch': 0.06}
{'loss': 0.5921, 'learning_rate': 0.0002, 'epoch': 0.06}
{'loss': 0.8339, 'learning_rate': 0.0002, 'epoch': 0.07}
{'loss': 0.6867, 'learning_rate': 0.0002, 'epoch': 0.07}
{'loss': 0.6971, 'learning_rate': 0.0002, 'epoch': 0.08}
{'loss': 0.6413, 'learning_rate': 0.0002, 'epoch': 0.08}
{'loss': 0.6958, 'learning_rate': 0.0002, 'epoch': 0.09}
45%|█████████████████████████████████████████████████████████████▏ | 18/40 [02:25<03:00, 8.19s/it]
...
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.34 GiB. GPU 0 has a total capacty of 39.39 GiB of which 2.54 GiB is free. Including non-PyTorch memory, this process has 36.73 GiB memory in use. Of the allocated memory 29.39 GiB is allocated by PyTorch, and 5.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
I have already tried several things to reduce the memory footprint during fine-tuning, such as:
- decreasing the number of trainable parameters by adapting the LoRA parameters
- decreasing the maximum token length
- decreasing the batch size to 1
- freeing memory before starting fine-tuning (see the sketch below)
- enabling gradient checkpointing
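The memory-freeing step from the list above is essentially the following (a minimal sketch; the memory_summary call is just an optional sanity check):
import gc

import torch

# drop leftover Python references and return cached blocks to the CUDA
# allocator before the training run starts
gc.collect()
torch.cuda.empty_cache()

# optional: check how much memory is allocated vs. reserved before training
print(torch.cuda.memory_summary(device=0, abbreviated=True))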