FINGU-AI committed
Commit 60f0d65 · verified · 1 Parent(s): 81bf075

Update app.py

Files changed (1)
  1. app.py +14 -11
app.py CHANGED
@@ -17,8 +17,10 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 zero = torch.Tensor([0]).cuda()
 print(zero.device) # <-- 'cpu' 🤔
 
-model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-peft_model_id = "Imran1/Llama3.1_8b_Qlora_bnk"
+# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+# peft_model_id = "Imran1/Llama3.1_8b_Qlora_bnk"
+model_id = "Qwen/Qwen2.5-14B-Instruct"
+peft_model_id = "Imran1/Qwen2.5-14b-bnk-lora-11"
 #attn_implementation="flash_attention_2",
 model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", torch_dtype= torch.bfloat16)
 model.load_adapter(peft_model_id)
@@ -32,10 +34,10 @@ if tokenizer.pad_token_id is None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
 # Define terminators
-terminators = [
-    tokenizer.eos_token_id,
-    tokenizer.convert_tokens_to_ids("<|eot_id|>")
-]
+# terminators = [
+#     tokenizer.eos_token_id,
+#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
+# ]
 
 generation_params = {
     'max_new_tokens': 2000,
@@ -44,8 +46,8 @@ generation_params = {
     'temperature': 0.7,
     'top_p': 0.9,
     # 'top_k': 50,
-    'pad_token_id': tokenizer.pad_token_id,
-    'eos_token_id': terminators,
+    # 'pad_token_id': tokenizer.pad_token_id,
+    # 'eos_token_id': terminators,
 }
 
 
@@ -89,9 +91,10 @@ def inference(query):
     tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
     outputs = model.generate(tokenized_chat, **generation_params)
     decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
-    assistant_response = decoded_outputs[0].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-    response_ = assistant_response.replace('<|eot_id|>', "")
-    return assistant_response
+    # assistant_response = decoded_outputs[0].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
+    response = decoded_outputs[0][tokenized_chat.shape[-1]:]
+    response = tokenizer.decode(response, skip_special_tokens=True)
+    return response
     # outputs = model.generate(tokenized_chat, **generation_params, streamer=streamer)
     # return outputs
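For reference, a minimal sketch of the prompt-stripping approach the new code is reaching for: slice the generated token IDs (outputs[0]) past the prompt length and decode only that tail, rather than slicing the already-decoded string by a token count. The helper name and signature below are illustrative and are not part of app.py; it assumes the same tokenizer, tokenized_chat, and outputs produced inside inference().

def extract_assistant_reply(outputs, tokenized_chat, tokenizer):
    # Hypothetical helper, not from the commit: keep only tokens generated after the prompt.
    prompt_len = tokenized_chat.shape[-1]          # number of prompt tokens
    generated_ids = outputs[0][prompt_len:]        # token IDs the model produced
    # Decode the new tokens, dropping special tokens such as end-of-turn markers.
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

Inside inference(), this would replace the two response assignment lines with a single call: response = extract_assistant_reply(outputs, tokenized_chat, tokenizer).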