bunyaminergen committed
Commit e5dde1f · 1 Parent(s): d17fff8
Files changed (1):
  1. app.py (+58 -33)
app.py CHANGED
@@ -1,44 +1,75 @@
 # Standard library imports
 import os
+import threading
 
-# Related third-party imports
+# Third-party imports
 import gradio as gr
-from huggingface_hub import InferenceClient
+from peft import PeftModel
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 
-client = InferenceClient("bunyaminergen/Qwen2.5-Coder-1.5B-Instruct-Reasoning", token=os.getenv("HF_ACCESS_TOKEN"))
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "bunyaminergen/Qwen2.5-Coder-1.5B-Instruct-Reasoning",
+    use_auth_token=HF_TOKEN,
+    trust_remote_code=True
+)
+
+base_model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+    device_map="auto",
+    torch_dtype="auto",
+    use_auth_token=HF_TOKEN
+)
+model = PeftModel.from_pretrained(
+    base_model,
+    "bunyaminergen/Qwen2.5-Coder-1.5B-Instruct-Reasoning",
+    use_auth_token=HF_TOKEN
+)
+model.eval()
 
 
 def respond(
-    message,
+    message: str,
     history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
+    system_message: str,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
 ):
     messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
+    for u, a in history:
+        if u:
+            messages.append({"role": "user", "content": u})
+        if a:
+            messages.append({"role": "assistant", "content": a})
     messages.append({"role": "user", "content": message})
 
-    response = ""
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=10.0,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+    generation_kwargs = {
+        **inputs,
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "streamer": streamer,
+    }
+    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
 
-        response += token
-        yield response
+    output = ""
+    for chunk in streamer:
+        output += chunk
+        yield output
 
 
 demo = gr.ChatInterface(
@@ -47,13 +78,7 @@ demo = gr.ChatInterface(
         gr.Textbox(value="You are a helpful coding assistant.", label="System message"),
         gr.Slider(minimum=512, maximum=8192, value=2048, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
 )
 
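
Not part of the commit: a minimal sketch of how the updated respond() generator can be driven outside the Gradio UI, assuming app.py defines respond() as in the new version above; the sample prompt and sampling values here are illustrative only.

# Illustrative sketch only (not part of the commit).
# Assumes app.py, as rewritten in this commit, defines the respond() generator.
from app import respond

if __name__ == "__main__":
    final = ""
    # Each value yielded by respond() is the cumulative response so far,
    # streamed from the TextIteratorStreamer while model.generate runs in a background thread.
    for partial in respond(
        message="Write a Python function that reverses a string.",  # sample prompt (illustrative)
        history=[],                                                  # no previous chat turns
        system_message="You are a helpful coding assistant.",
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    ):
        final = partial
    print(final)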