Bhaskar2611 committed on
Commit e6601b8 · verified · 1 Parent(s): fc33bb2

Update app.py

Files changed (1)
  app.py +96 -37
app.py CHANGED
@@ -266,48 +266,106 @@ For more information on `huggingface_hub` Inference API support, please check th
# if __name__ == "__main__":
#     demo.launch()

+ # import gradio as gr
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
+ # import torch
+
+ # # Load once globally
+ # tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
+ # model = AutoModelForCausalLM.from_pretrained(
+ # "Qwen/Qwen2.5-Coder-32B-Instruct",
+ # device_map="auto",
+ # torch_dtype=torch.float16,
+ # )
+
+ # def respond(message, history):
+ # system_prompt = (
+ # "You are a helpful coding assistant specialized in web development. "
+ # "Provide complete code snippets for HTML, CSS, JS, Flask, Node.js etc."
+ # )
+ # # Build input prompt including chat history
+ # chat_history = ""
+ # for user_msg, bot_msg in history:
+ # chat_history += f"User: {user_msg}\nAssistant: {bot_msg}\n"
+ # prompt = f"{system_prompt}\n{chat_history}User: {message}\nAssistant:"
+
+ # inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+ # outputs = model.generate(
+ # **inputs,
+ # max_new_tokens=512,
+ # temperature=0.7,
+ # do_sample=True,
+ # top_p=0.95,
+ # eos_token_id=tokenizer.eos_token_id,
+ # )
+ # generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ # # Extract only the new response part after the prompt
+ # response = generated_text[len(prompt):].strip()
+
+ # # Append current Q/A to history
+ # history.append((message, response))
+ # return "", history
+
+ # demo = gr.ChatInterface(respond, type="messages")
+
+ # if __name__ == "__main__":
+ # demo.launch()
+ import os
import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
-
- # Load once globally
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
- model = AutoModelForCausalLM.from_pretrained(
-     "Qwen/Qwen2.5-Coder-32B-Instruct",
-     device_map="auto",
-     torch_dtype=torch.float16,
+ from huggingface_hub import InferenceClient
+ from dotenv import load_dotenv
+
+ # Load .env variables (make sure to have HF_TOKEN in .env or set as env var)
+ load_dotenv()
+ HF_TOKEN = os.getenv("HF_TOKEN")  # or directly assign your token here as string
+
+ # Initialize InferenceClient with Hugging Face API token
+ client = InferenceClient(
+     model="Qwen/Qwen2.5-Coder-32B-Instruct",
+     token=HF_TOKEN
)

def respond(message, history):
-     system_prompt = (
-         "You are a helpful coding assistant specialized in web development. "
-         "Provide complete code snippets for HTML, CSS, JS, Flask, Node.js etc."
-     )
-     # Build input prompt including chat history
-     chat_history = ""
-     for user_msg, bot_msg in history:
-         chat_history += f"User: {user_msg}\nAssistant: {bot_msg}\n"
-     prompt = f"{system_prompt}\n{chat_history}User: {message}\nAssistant:"
-
-     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=512,
-         temperature=0.7,
-         do_sample=True,
-         top_p=0.95,
-         eos_token_id=tokenizer.eos_token_id,
+     """
+     Chat response generator function streaming from Hugging Face Inference API.
+     """
+     system_message = (
+         "You are a helpful and experienced coding assistant specialized in web development. "
+         "Help the user by generating complete and functional code for building websites. "
+         "You can provide HTML, CSS, JavaScript, and backend code (like Flask, Node.js, etc.) "
+         "based on their requirements."
    )
-     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-     # Extract only the new response part after the prompt
-     response = generated_text[len(prompt):].strip()
-
-     # Append current Q/A to history
-     history.append((message, response))
-     return "", history
-
- demo = gr.ChatInterface(respond, type="messages")
+     max_tokens = 2048
+     temperature = 0.7
+     top_p = 0.95
+
+     # Prepare messages in OpenAI chat format
+     messages = [{"role": "system", "content": system_message}]
+     for user_msg, assistant_msg in history:
+         if user_msg:
+             messages.append({"role": "user", "content": user_msg})
+         if assistant_msg:
+             messages.append({"role": "assistant", "content": assistant_msg})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     # Stream response tokens from Hugging Face Inference API
+     for chunk in client.chat.completions.create(
+         model="Qwen/Qwen2.5-Coder-32B-Instruct",
+         messages=messages,
+         max_tokens=max_tokens,
+         stream=True,
+         temperature=temperature,
+         top_p=top_p,
+     ):
+         token = chunk.choices[0].delta.get("content", "")
+         response += token
+         yield response
+
+ # Create Gradio chat interface
+ demo = gr.ChatInterface(fn=respond, title="Coding Assistant",
+                         description="Ask for web development code help!")

if __name__ == "__main__":
    demo.launch()
@@ -318,3 +376,4 @@ if __name__ == "__main__":



+
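
A minimal sketch for exercising the new streaming respond() generator outside Gradio (not taken from the commit; it assumes the updated app.py above is importable and that HF_TOKEN is available via the environment or a .env file, as the code expects):

# Hypothetical smoke test for the streaming respond() generator in app.py.
# Assumes HF_TOKEN is set (environment variable or .env) and dependencies are installed.
from app import respond

history = []  # ChatInterface passes history as (user, assistant) tuples
question = "Write a minimal Flask app that serves a static index.html"

final = ""
for partial in respond(question, history):
    final = partial  # each yield is the accumulated response so far
print(final)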