I have loaded a model with the following code:
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from langchain.embeddings import HuggingFaceInstructEmbeddings

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(DEVICE)

embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl", model_kwargs={"device": DEVICE}
)
model_name_or_path = "./models/Llama-2-13B-chat-GPTQ"
model_basename = "model"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    revision="gptq-4bit-128g-actorder_True",  # or "gptq-8bit-128g-actorder_False"
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    inject_fused_attention=False,
    device=DEVICE,
    quantize_config=None,
)
And if I print out this model, it has the following structure:
LlamaGPTQForCausalLM(
  (model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 5120, padding_idx=0)
      (layers): ModuleList(
        (0-39): 40 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (rotary_emb): LlamaRotaryEmbedding()
            (k_proj): QuantLinear()
            (o_proj): QuantLinear()
            (q_proj): QuantLinear()
            (v_proj): QuantLinear()
          )
          (mlp): LlamaMLP(
            (act_fn): SiLUActivation()
            (down_proj): QuantLinear()
            (gate_proj): QuantLinear()
            (up_proj): QuantLinear()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): LlamaRMSNorm()
        )
      )
      (norm): LlamaRMSNorm()
    )
    (lm_head): Linear(in_features=5120, out_features=32000, bias=False)
  )
)
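For reference, this is roughly how I check whether the GPU memory actually gets freed (a minimal sketch using the standard torch.cuda memory utilities, nothing specific to this model):

import torch

def report_gpu_memory(tag):
    # Print the CUDA memory currently allocated and reserved by PyTorch, in GiB.
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"{tag}: allocated={allocated:.2f} GiB, reserved={reserved:.2f} GiB")

report_gpu_memory("after loading")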
Is there a way to unload this model from the GPU to free up GPU memory? I tried model.to(torch.device("cpu")), but it did not work.
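Concretely, would something like the following be the right way to release the memory? (This is just a sketch of the generic PyTorch cleanup pattern of dropping references, garbage-collecting, and emptying the CUDA cache; I don't know whether it is enough for an AutoGPTQ-quantized model.)

import gc
import torch

# Drop the Python reference to the model so it can be garbage-collected.
del model
gc.collect()
# Release the cached, now-unused CUDA blocks held by PyTorch's allocator.
torch.cuda.empty_cache()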
Thanks.