Update app.py
app.py (changed)
@@ -17,8 +17,10 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 zero = torch.Tensor([0]).cuda()
 print(zero.device) # <-- 'cpu' 🤔
 
-model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-peft_model_id = "Imran1/Llama3.1_8b_Qlora_bnk"
+# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+# peft_model_id = "Imran1/Llama3.1_8b_Qlora_bnk"
+model_id = "Qwen/Qwen2.5-14B-Instruct"
+peft_model_id = "Imran1/Qwen2.5-14b-bnk-lora-11"
 #attn_implementation="flash_attention_2",
 model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", torch_dtype= torch.bfloat16)
 model.load_adapter(peft_model_id)
@@ -32,10 +34,10 @@ if tokenizer.pad_token_id is None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
 # Define terminators
-terminators = [
-    tokenizer.eos_token_id,
-    tokenizer.convert_tokens_to_ids("<|eot_id|>")
-]
+# terminators = [
+#     tokenizer.eos_token_id,
+#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
+# ]
 
 generation_params = {
     'max_new_tokens': 2000,
@@ -44,8 +46,8 @@ generation_params = {
     'temperature': 0.7,
     'top_p': 0.9,
     # 'top_k': 50,
-    'pad_token_id': tokenizer.pad_token_id,
-    'eos_token_id': terminators,
+    # 'pad_token_id': tokenizer.pad_token_id,
+    # 'eos_token_id': terminators,
 }
 
 
@@ -89,9 +91,10 @@ def inference(query):
     tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
     outputs = model.generate(tokenized_chat, **generation_params)
     decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
-    assistant_response = decoded_outputs[0].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-
-
+    # assistant_response = decoded_outputs[0].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+    response = outputs[0][tokenized_chat.shape[-1]:]
+    response = tokenizer.decode(response, skip_special_tokens=True)
+    return response
     # outputs = model.generate(tokenized_chat, **generation_params, streamer=streamer)
     # return outputs
 
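
For reference, a minimal standalone sketch of the updated inference path, assuming access to the Qwen/Qwen2.5-14B-Instruct weights and, optionally, the Imran1/Qwen2.5-14b-bnk-lora-11 adapter; the helper name generate_reply and the do_sample flag are illustrative, not part of app.py. generate() returns the prompt tokens followed by the completion, so slicing the output at the prompt length and decoding only the tail replaces the old Llama-specific split on "<|start_header_id|>assistant<|end_header_id|>". With the explicit terminators commented out, generation stops at the model's default EOS token from its generation config.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-14B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).cuda()
# model.load_adapter("Imran1/Qwen2.5-14b-bnk-lora-11")  # optional LoRA adapter; requires `peft`

def generate_reply(query: str) -> str:
    messages = [{"role": "user", "content": query}]
    # Build the chat prompt and move the token ids to the GPU.
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    # Sampling settings mirror generation_params above; do_sample is an assumption.
    output_ids = model.generate(
        input_ids, max_new_tokens=2000, do_sample=True, temperature=0.7, top_p=0.9
    )
    # generate() returns prompt + completion; keep only the newly generated tokens.
    new_tokens = output_ids[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

print(generate_reply("Hello!"))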