import os
import gradio as gr
import requests
import json
# Get the Hugging Face API key from Spaces secrets.
HF_API_KEY = os.getenv("HF_API_KEY")
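# Note: if the secret is not set, os.getenv returns None and the Authorization
# header below becomes "Bearer None", so the Inference API will reject the request.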
# Model endpoints on Hugging Face
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}
# System prompts for each model
SYSTEM_PROMPTS = {
    "Qwen2.5-72B-Instruct": "System: You are a knowledgeable assistant for general inquiries.",
    "Llama3.3-70B-Instruct": "System: You are a research expert assistant specialized in in-depth analysis.",
    "Qwen2.5-Coder-32B-Instruct": "System: You are a coding expert who helps with code-related tasks.",
}

def query_model(prompt, model_endpoint, system_prompt):
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json",
        "Accept": "application/json"
    }
    # Format the prompt to include the system instruction and structure the conversation.
    formatted_prompt = f"{system_prompt}\nUser: {prompt}\nAssistant:"
    # Include the stop sequence so generation halts when the next user turn starts.
    data = {
        "inputs": formatted_prompt,
        "parameters": {
            "max_new_tokens": 512,
            "temperature": 0.6,
            "stop_sequences": ["\nUser:"]
        }
    }
    response = requests.post(model_endpoint, headers=headers, json=data)
    # Uncomment the next line to print raw API responses for debugging.
    # print("Raw response:", response.text)
    try:
        result = response.json()
    except Exception:
        return f"Error: Unable to parse JSON. Response: {response.text}"
    if isinstance(result, dict) and "error" in result:
        return f"Error: {result['error']}"
    try:
        generated_text = result[0].get("generated_text", "No generated_text found in response")
        # Optionally, strip off the prompt if needed:
        # generated_text = generated_text[len(formatted_prompt):].strip()
        return generated_text
    except Exception:
        return f"Error: Unexpected response format: {json.dumps(result)}"

def chat_with_models(user_input, history):
    # Query every configured model with the same user message and combine
    # the answers into a single chat turn.
    responses = []
    for model_name, endpoint in MODEL_ENDPOINTS.items():
        system_prompt = SYSTEM_PROMPTS.get(model_name, "")
        model_response = query_model(user_input, endpoint, system_prompt)
        responses.append(f"**{model_name}**: {model_response}")
    combined_answer = "\n\n".join(responses)
    history.append((user_input, combined_answer))
    return history, history

with gr.Blocks() as demo:
    gr.Markdown("# Multi-LLM Chatbot using Hugging Face Inference API with Stop Sequences")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your Message")
    clear = gr.Button("Clear")

    def clear_chat():
        return [], []

    # Submitting the textbox queries all models; the Clear button resets the chat history.
    msg.submit(fn=chat_with_models, inputs=[msg, chatbot], outputs=[chatbot, chatbot])
    clear.click(fn=clear_chat, outputs=[chatbot, chatbot])

demo.launch()
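
# Illustrative local run, assuming this file is saved as app.py and the token value
# below is replaced with a real Hugging Face token:
#   export HF_API_KEY=hf_xxxxxxxxxxxxxxxx
#   python app.py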