choupijiang / app.py
luminoussg's picture
Update app.py
dcc233f verified
raw
history blame
6.07 kB
import gradio as gr
import os
import threading
from datetime import datetime
from typing import List, Dict, Any, Generator
from session_manager import SessionManager
from huggingface_hub import InferenceClient
# Display name -> hosted inference endpoint URL for every participating model.
MODEL_ENDPOINTS: Dict[str, str] = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}

# Hugging Face token read from the environment; None when the variable is unset.
HF_API_KEY = os.environ.get("HF_API_KEY")

# Single module-wide session manager used by the chat handlers below.
session_manager = SessionManager()
def query_model(model_name: str, messages: List[Dict[str, str]]) -> Generator[str, None, None]:
    """Query a single model with the chat history and stream the response.

    The chat history is flattened into a ``role: content`` transcript and sent,
    together with a model-specific instruction, as one system message.

    Args:
        model_name: Key of ``MODEL_ENDPOINTS`` selecting the model to query.
        messages: Chat history; each item is a dict with ``role`` and
            ``content`` keys.

    Yields:
        Non-empty text fragments of the reply as they arrive. When the model
        name is unknown or the request fails, a single
        ``"<model_name> error: ..."`` string is yielded instead of raising.
    """
    # (introduction, closing request) per model, as plain text only. The
    # chat-completions API applies the model's chat template to the message
    # list, so raw control tokens such as <|im_start|> or <|eot_id|> must not
    # be written into the message content by hand.
    prompt_parts = {
        "Qwen2.5-72B-Instruct": (
            "Collaborate with other experts. Previous discussion:",
            "Respond with your analysis.",
        ),
        "Llama3.3-70B-Instruct": (
            "Build upon this discussion:",
            "Respond with your contribution.",
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            "Technical discussion context:",
            "Respond with your technical perspective.",
        ),
    }

    # Report an unknown model through the same channel as request failures
    # rather than letting a bare KeyError escape into the streaming caller.
    endpoint = MODEL_ENDPOINTS.get(model_name)
    if endpoint is None or model_name not in prompt_parts:
        yield f"{model_name} error: unknown model"
        return

    # Build the full conversation transcript for context.
    conversation = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)
    intro, request = prompt_parts[model_name]
    system_prompt = f"{intro}\n{conversation}\n\n{request}"

    client = InferenceClient(base_url=endpoint, token=HF_API_KEY)
    try:
        stream = client.chat.completions.create(
            messages=[{"role": "system", "content": system_prompt}],
            stream=True,
            max_tokens=2048,
            temperature=0.7,
        )
        for chunk in stream:
            # Some stream chunks carry no choices (e.g. trailing metadata);
            # skip them instead of failing on choices[0].
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            # Empty deltas add nothing and would only trigger redundant UI
            # refreshes downstream.
            if content:
                yield content
    except Exception as e:
        # Deliberate best-effort boundary: surface the failure in the chat
        # so the remaining models can still answer.
        yield f"{model_name} error: {e}"
def respond(message: str, history: List[List[str]], session_id: str) -> Generator[str, None, None]:
    """Handle sequential model responses with context preservation and streaming.

    Each model is queried in turn with the stored session history plus the
    replies already produced in this turn. The updated session is saved once
    all models have answered.

    Args:
        message: The user's new message.
        history: Chat history from the UI. Unused here (context is rebuilt
            from the stored session) but kept for interface compatibility.
        session_id: Identifier of the session to load and save.

    Yields:
        Progressive markdown strings: a "thinking" notice per model, that
        model's growing reply, and finally all replies combined.
    """
    # Load or initialize session
    session = session_manager.load_session(session_id)
    if not isinstance(session, dict) or "history" not in session:
        session = {"history": []}
    entries = session["history"]

    # Build context from session history
    messages = []
    for entry in entries:
        if entry["type"] == "user":
            messages.append({"role": "user", "content": entry["content"]})
        else:
            messages.append({"role": "assistant", "content": f"{entry['model']}: {entry['content']}"})

    # The UI's submit handler may already have stored this message in the
    # session before calling us. Adding it again would duplicate it in both
    # the model context and the saved history.
    already_recorded = (
        bool(entries)
        and entries[-1].get("type") == "user"
        and entries[-1].get("content") == message
    )
    if not already_recorded:
        messages.append({"role": "user", "content": message})
        entries.append({
            "timestamp": datetime.now().isoformat(),
            "type": "user",
            "content": message,
        })

    # Model responses, in speaking order, each with its colour marker
    model_names = ["Qwen2.5-Coder-32B-Instruct", "Qwen2.5-72B-Instruct", "Llama3.3-70B-Instruct"]
    model_colors = ["🔵", "🟣", "🟡"]
    responses = {}

    # Stream responses from each model
    for color, model_name in zip(model_colors, model_names):
        yield f"{color} {model_name} is thinking..."

        full_response = ""
        for chunk in query_model(model_name, messages):
            full_response += chunk
            yield f"{color} **{model_name}**\n{full_response}"

        # Record the finished reply so the next model sees it as context
        entries.append({
            "timestamp": datetime.now().isoformat(),
            "type": "assistant",
            "model": model_name,
            "content": full_response,
        })
        messages.append({"role": "assistant", "content": f"{model_name}: {full_response}"})
        responses[model_name] = full_response

    # Save final session state
    session_manager.save_session(session_id, session)

    # Final combined view of all model replies
    yield "".join(
        f"{color} **{model_name}**\n{responses[model_name]}\n\n"
        for color, model_name in zip(model_colors, model_names)
    )
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Multi-LLM Collaboration Chat")

    with gr.Row():
        # The callable itself (not its result) is passed as the initial value.
        # NOTE(review): Gradio is documented to call a callable initial value
        # on each app load, giving each visitor a new session id; confirm
        # for the installed Gradio version.
        session_id = gr.State(session_manager.create_session)
        new_session = gr.Button("🔄 New Session")

    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(label="Message")
    save_history = gr.Checkbox(label="Save Conversation History", value=True)

    def on_new_session():
        """Create a new session and clear the chat window."""
        new_id = session_manager.create_session()
        return new_id, []

    def user(message, history, session_id, save_history):
        """Queue the submitted message in the chat and optionally persist it.

        Returns the new textbox value (always cleared) and the updated chat
        history, where the new row's reply slot is ``None`` until ``bot``
        fills it in.
        """
        history = history or []

        # A blank submission would otherwise send an empty prompt to every
        # model. Leave the chat unchanged so ``bot`` has nothing to answer.
        if not message or not message.strip():
            return "", history

        if save_history:
            session = session_manager.load_session(session_id)
            # Same tolerance as respond(): start a fresh history when the
            # stored session is missing or malformed instead of crashing.
            if not isinstance(session, dict) or "history" not in session:
                session = {"history": []}
            session["history"].append({
                "timestamp": datetime.now().isoformat(),
                "type": "user",
                "content": message,
            })
            session_manager.save_session(session_id, session)
        return "", history + [[message, None]]

    def bot(history, session_id):
        """Stream the models' reply into the last, still unanswered chat row."""
        if history and history[-1][1] is None:
            message = history[-1][0]
            for response in respond(message, history[:-1], session_id):
                history[-1][1] = response
                yield history

    # First record the user's message, then stream the model replies.
    msg.submit(user, [msg, chatbot, session_id, save_history], [msg, chatbot]).then(
        bot, [chatbot, session_id], [chatbot]
    )
    new_session.click(on_new_session, None, [session_id, chatbot])

if __name__ == "__main__":
    demo.launch(share=True)