import os
import gradio as gr
import requests
import json

# Get the Hugging Face API key from Spaces secrets.
HF_API_KEY = os.getenv("HF_API_KEY")
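
# Optional hardening sketch (an illustrative addition): fail fast when the
# secret is missing rather than sending unauthenticated requests downstream.
if not HF_API_KEY:
    raise RuntimeError("HF_API_KEY is not set; add it as a Spaces secret or export it locally.")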

# Model endpoints on Hugging Face
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}
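
# Note: the meta-llama model is gated on the Hub, so the token behind
# HF_API_KEY must belong to an account that has accepted the Llama 3.3
# license; otherwise the API returns an authorization error.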

# System prompts for each model
SYSTEM_PROMPTS = {
    "Qwen2.5-72B-Instruct": "System: You are a knowledgeable assistant for general inquiries.",
    "Llama3.3-70B-Instruct": "System: You are a research expert assistant specialized in in-depth analysis.",
    "Qwen2.5-Coder-32B-Instruct": "System: You are a coding expert who helps with code-related tasks.",
}

def query_model(prompt, model_endpoint, system_prompt):
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json",
        "Accept": "application/json"
    }
    # Format the prompt to include the system instruction and structure the conversation.
    formatted_prompt = f"{system_prompt}\nUser: {prompt}\nAssistant:"
    
    # Include a stop sequence so generation halts when the next user turn
    # would start. (Some TGI-backed endpoints may expect this parameter under
    # the name "stop" rather than "stop_sequences".)
    data = {
        "inputs": formatted_prompt,
        "parameters": {
            "max_new_tokens": 512,
            "temperature": 0.6,
            "stop_sequences": ["\nUser:"]
        }
    }
    
    # A timeout prevents a hung endpoint from blocking the request forever.
    response = requests.post(model_endpoint, headers=headers, json=data, timeout=120)
    
    # Uncomment the next line to print raw API responses for debugging.
    # print("Raw response:", response.text)
    
    try:
        result = response.json()
    except Exception:
        return f"Error: Unable to parse JSON. Response: {response.text}"
    
    if isinstance(result, dict) and "error" in result:
        return f"Error: {result['error']}"
    
    try:
        generated_text = result[0].get("generated_text", "No generated_text found in response")
        # The API may echo the prompt (return_full_text defaults to True);
        # strip it off if needed, or set "return_full_text": False in parameters:
        # generated_text = generated_text[len(formatted_prompt):].strip()
        return generated_text
    except Exception:
        return f"Error: Unexpected response format: {json.dumps(result)}"

def chat_with_models(user_input, history):
    responses = []
    for model_name, endpoint in MODEL_ENDPOINTS.items():
        system_prompt = SYSTEM_PROMPTS.get(model_name, "")
        model_response = query_model(user_input, endpoint, system_prompt)
        responses.append(f"**{model_name}**: {model_response}")
    combined_answer = "\n\n".join(responses)
    history = history or []  # the Chatbot value can start out empty/None
    history.append((user_input, combined_answer))
    return history
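
# Quick sanity check outside the UI (illustrative):
#   history = chat_with_models("What is 2+2?", [])
#   print(history[-1][1])  # combined answer from all three models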

with gr.Blocks() as demo:
    gr.Markdown("# Multi-LLM Chatbot using Hugging Face Inference API with Stop Sequences")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your Message")
    clear = gr.Button("Clear")

    def clear_chat():
        return []

    msg.submit(fn=chat_with_models, inputs=[msg, chatbot], outputs=chatbot)
    clear.click(fn=clear_chat, outputs=chatbot)

demo.launch()
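
# To try this locally (assuming a Hugging Face token with Inference API access):
#   export HF_API_KEY=hf_...   # same name as the Spaces secret read above
#   python app.py
# On a Space, set HF_API_KEY as a repository secret in the Space settings.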