I’m new to coding and I’ve been learning to work with LLMs for some time now. I’m trying to build a French-language chatbot using the vigostral-chat model (bofenghuang/vigostral-7b-chat), which is a fine-tune of Mistral-7B. I want to fine-tune it on my own dataset, made up of public data I scraped from my faculty’s website (the number of students in the faculty, etc.). I followed a tutorial to fine-tune the model and it uploaded to Hugging Face without errors. However, when I run inference with my model, it responds with a random sequence of letters. I suspect there is an issue with my dataset, where each record is structured as follows: LINE: text…
Does anyone have any idea where the problem might be coming from? Thank you very much.
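Each JSON record holds a single "line" field (the same field I pass as dataset_text_field to SFTTrainer below); a record looks roughly like this, with the scraped content replaced by a placeholder here:
{"line": "LINE: <scraped text about the faculty>"}
My training code: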
from huggingface_hub import interpreter_login
interpreter_login()
import os
import argparse
import torch
import torch.nn as nn
from datasets import load_dataset,Features,Value,load_from_disk
import transformers
from functools import partial
from transformers import (MistralForCausalLM, MistralModel, MistralConfig, AutoConfig, AutoModelForCausalLM,
                          AutoTokenizer, set_seed, TrainingArguments, BitsAndBytesConfig,
                          DataCollatorForLanguageModeling, Trainer)
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM ,PeftModel
from trl import SFTTrainer
from unidecode import unidecode
base_model="bofenghuang/vigostral-7b-chat"
new_model="ALIE_0.5"
data = load_dataset(
    'json',
    data_files={
        'train': ['C:/Users/sacha/Documents/projet ALI/test vs/ALIE0.2/Dataset/lyon1_charlie_dataset_train.json'],
        'test': ['C:/Users/sacha/Documents/projet ALI/test vs/ALIE0.2/Dataset/lyon1_delta_dataset_validation.json'],
    },
)
print(data['train'])
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,  # to be checked
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# tokenizer.add_bos_token, tokenizer.add_eos_token  # no-op left over from a notebook: it only reads the two flags
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)
model = get_peft_model(model, peft_config)
training_arguments = TrainingArguments(
    output_dir="C:/Users/sacha/Documents/projet ALI/test vs/ALIE0.2/end",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    save_steps=5000,
    logging_steps=30,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.3,
    group_by_length=True,
    lr_scheduler_type="constant",
)
trainer = SFTTrainer(
    model=model,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    peft_config=peft_config,
    max_seq_length=None,
    tokenizer=tokenizer,
    dataset_text_field="line",
    args=training_arguments,
    packing=False,
)
trainer.train()
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
model.config.use_cache = True
model.eval()
model.push_to_hub(new_model,use_temp_dir=False)
base_model="bofenghuang/vigostral-7b-chat"
new_model="AscheZ/ALIE_0.5"
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)
model = PeftModel.from_pretrained(base_model_reload, new_model)
model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)
My code for inference:
model_name="bofenghuang/vigostral-7b-chat"
adapter_name="AscheZ/ALIE_0.5"
print(f"Starting to load the model {model_name} into memory")
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)
m = PeftModel.from_pretrained(m, adapter_name)
m = m.merge_and_unload()
tok = AutoTokenizer.from_pretrained(model_name)
tok.bos_token_id = 1
stop_token_ids = [0]
print(f"Successfully loaded the model {model_name} into memory")
prompt = "Combien il y'a d'étudiant à lyon 1? "
inputs = tok(prompt, return_tensors="pt").to('cuda')
outputs = m.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)
print(tok.batch_decode(outputs, skip_special_tokens=True))
(Screenshots: the inference output and the structure of my dataset.)
I think it's a problem with my dataset, but what exactly? Or with my training hyperparameters? I just don't see what's wrong...
You are trying to fine-tune an already fine-tuned model, and that model expects its own data format (as mentioned on its Hugging Face card, it uses the Llama 2 chat format), so you should format your data into that first:
https://huggingface.co/blog/llama2#how-to-prompt-llama-2
Convert your data into this format and then pass it to the trainer. Likewise, at inference time you have to provide the prompt in this exact format, and the model will answer accordingly (you can use apply_chat_template and it will convert the messages automatically using the tokenizer config); see the sketch below.
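A minimal sketch of that idea, assuming each of your records can be split into a question and an answer (the field names "question" and "answer" below are hypothetical, adapt them to your JSON):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bofenghuang/vigostral-7b-chat")

def to_chat_text(example):
    # Hypothetical fields: replace "question"/"answer" with whatever your records actually contain.
    messages = [
        {"role": "user", "content": example["question"]},
        {"role": "assistant", "content": example["answer"]},
    ]
    # tokenize=False returns the formatted string, e.g. "<s>[INST] ... [/INST] ... </s>"
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return example

data = data.map(to_chat_text)
# Then train with dataset_text_field="text" in SFTTrainer.

# At inference (reusing the merged model m you already loaded), build the prompt
# the same way and let the model complete the assistant turn:
input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Combien il y'a d'étudiant à lyon 1? "}],
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")
outputs = m.generate(input_ids, do_sample=True, max_new_tokens=100)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))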