# app.py
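"""Multi-LLM collaboration chat.

A Gradio app that sends each user message to three Hugging Face hosted models
in sequence (Qwen2.5-Coder-32B, Qwen2.5-72B, Llama-3.3-70B), streams their
combined answers into a single chat, and persists the exchange per session."""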
import gradio as gr
import os
import requests
import threading
from datetime import datetime
from typing import List, Dict, Any, Generator
from session_manager import SessionManager
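# SessionManager is a local module (not shown here); based on its use below it is
# expected to expose create_session(), load_session(session_id) and
# save_session(session_id, session).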
# Initialize session manager and get HF API key
session_manager = SessionManager()
HF_API_KEY = os.getenv("HF_API_KEY")
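# The token is read from the environment (e.g. a Space secret); without it the
# Inference API calls below will be rejected.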
# Model endpoints configuration
MODEL_ENDPOINTS = {
"Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
"Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
"Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}
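# The three models are queried one after another (Coder -> 72B -> Llama); each
# later call sees the earlier answers through the shared message history.
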
def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
"""Query a single model with the chat history"""
endpoint = MODEL_ENDPOINTS[model_name]
headers = {
"Authorization": f"Bearer {HF_API_KEY}",
"Content-Type": "application/json"
}
# Build full conversation history for context
conversation = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
    # Model-specific prompt formatting with full history
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        )
    }
    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"]
    }
    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            "max_new_tokens": 2048,  # HF text-generation expects max_new_tokens (not max_tokens)
            "temperature": 0.7,
            "stop_sequences": stop_sequences[model_name],
            "return_full_text": False
        }
    }
    try:
        response = requests.post(endpoint, json=payload, headers=headers)
        response.raise_for_status()
        result = response.json()[0]['generated_text']
        # Do NOT remove ** or ## so that Markdown (including LaTeX) is preserved
        result = result.split('<|')[0]  # Remove any remaining special tokens
        return result.strip()  # Return the cleaned response
    except Exception as e:
        return f"{model_name} error: {str(e)}"

def respond(message: str, history: List[List[str]], session_id: str) -> Generator[str, None, None]:
"""Handle sequential model responses with context preservation"""
# Load or initialize session
session = session_manager.load_session(session_id)
if not isinstance(session, dict) or "history" not in session:
session = {"history": []}
# Build context from session history
messages = []
for entry in session["history"]:
if entry["type"] == "user":
messages.append({"role": "user", "content": entry["content"]})
else:
messages.append({"role": "assistant", "content": f"{entry['model']}: {entry['content']}"})
# Add current message
messages.append({"role": "user", "content": message})
session["history"].append({
"timestamp": datetime.now().isoformat(),
"type": "user",
"content": message
})
    # First model
    yield "🔵 Qwen2.5-Coder-32B-Instruct is thinking..."
    response1 = query_model("Qwen2.5-Coder-32B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Qwen2.5-Coder-32B-Instruct",
        "content": response1
    })
    messages.append({"role": "assistant", "content": f"Qwen2.5-Coder-32B-Instruct: {response1}"})
    yield f"🔵 **Qwen2.5-Coder-32B-Instruct**\n{response1}"
    # Second model
    yield f"🔵 **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\n🟣 Qwen2.5-72B-Instruct is thinking..."
    response2 = query_model("Qwen2.5-72B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Qwen2.5-72B-Instruct",
        "content": response2
    })
    messages.append({"role": "assistant", "content": f"Qwen2.5-72B-Instruct: {response2}"})
    yield f"🔵 **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\n🟣 **Qwen2.5-72B-Instruct**\n{response2}"
    # Final model
    yield f"🔵 **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\n🟣 **Qwen2.5-72B-Instruct**\n{response2}\n\n🟡 Llama3.3-70B-Instruct is thinking..."
    response3 = query_model("Llama3.3-70B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Llama3.3-70B-Instruct",
        "content": response3
    })
    messages.append({"role": "assistant", "content": f"Llama3.3-70B-Instruct: {response3}"})

    # Save final session state
    session_manager.save_session(session_id, session)

    # Return final combined response
    yield f"🔵 **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\n🟣 **Qwen2.5-72B-Instruct**\n{response2}\n\n🟡 **Llama3.3-70B-Instruct**\n{response3}"

# Create the Gradio interface
with gr.Blocks() as demo:
gr.Markdown("## Multi-LLM Collaboration Chat")
gr.Markdown(
"""
**LaTeX Support:** You can include inline math like `$E = mc^2$` or block math like:
```
$$
\\int_0^1 x^2 \\, dx
$$
```
and it will be rendered correctly below.
"""
)
with gr.Row():
session_id = gr.State(session_manager.create_session)
new_session = gr.Button("πŸ”„ New Session")
chatbot = gr.Chatbot(height=600)
msg = gr.Textbox(label="Message")
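    # Callback helpers: user() echoes the message into the chat, bot() streams the
    # staged model responses, and on_new_session() resets state and display.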
    def on_new_session():
        new_id = session_manager.create_session()
        return new_id, []

    def user(message, history, session_id):
        return "", history + [[message, None]]

    def bot(history, session_id):
        if history and history[-1][1] is None:
            message = history[-1][0]
            for response in respond(message, history[:-1], session_id):
                history[-1][1] = response
                yield history
    msg.submit(user, [msg, chatbot, session_id], [msg, chatbot]).then(
        bot, [chatbot, session_id], [chatbot]
    )
    new_session.click(on_new_session, None, [session_id, chatbot])

if __name__ == "__main__":
    demo.launch(share=True)