Spaces:
Sleeping
Sleeping
File size: 7,067 Bytes
43b5bef c1e5d4c 518be16 c1e5d4c 0d6849e cc3006a 43b5bef cc3006a cf38aa5 43b5bef 42b5787 43b5bef c1e5d4c 8190eb3 c1e5d4c 518be16 c1e5d4c 518be16 77ac272 6733659 8190eb3 c9870b1 5e8e5d5 6733659 c9870b1 6733659 c9870b1 6733659 4766698 c9870b1 c1e5d4c c9870b1 6617dfe 42b5787 c9870b1 6617dfe 518be16 77ac272 c1e5d4c c9870b1 8190eb3 c1e5d4c c9870b1 c1e5d4c fdaf591 8190eb3 57a76f2 0d6849e 57a76f2 73c4292 28f1fca 0d6849e 28f1fca fdaf591 327109c cc3006a 28f1fca fdaf591 f32ce56 fdaf591 327109c cc3006a 28f1fca fdaf591 f32ce56 fdaf591 327109c cc3006a 28f1fca 8190eb3 cc3006a 28f1fca fdaf591 8190eb3 43b5bef 8190eb3 fdaf591 8190eb3 fdaf591 8190eb3 fdaf591 8190eb3 fdaf591 43b5bef c1e5d4c cc3006a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
import gradio as gr
import os
import requests
import threading
from datetime import datetime
from typing import List, Dict, Any, Generator
from session_manager import SessionManager
# Hugging Face API token from the environment (None when HF_API_KEY is unset),
# plus the shared session manager used by the handlers below.
HF_API_KEY = os.environ.get("HF_API_KEY")
session_manager = SessionManager()
# Inference endpoint URL for every supported model, keyed by the display name
# used throughout the app. Each URL is the common base plus the model's repo id.
MODEL_ENDPOINTS = {
    display_name: f"https://api-inference.huggingface.co/models/{repo_id}"
    for display_name, repo_id in (
        ("Qwen2.5-72B-Instruct", "Qwen/Qwen2.5-72B-Instruct"),
        ("Llama3.3-70B-Instruct", "meta-llama/Llama-3.3-70B-Instruct"),
        ("Qwen2.5-Coder-32B-Instruct", "Qwen/Qwen2.5-Coder-32B-Instruct"),
    )
}
def query_model(model_name: str, messages: List[Dict[str, str]], timeout: float = 120.0) -> str:
    """Query a single model with the chat history.

    The conversation is flattened to ``role: content`` lines, embedded in the
    system section of a model-specific prompt template, and POSTed to the
    model's URL from ``MODEL_ENDPOINTS``.

    Args:
        model_name: Key of ``MODEL_ENDPOINTS``; also selects the prompt
            template and the stop sequences.
        messages: Chat history as dicts with ``role`` and ``content`` keys,
            in conversation order.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled endpoint cannot block the chat indefinitely.

    Returns:
        The cleaned generated text, or ``"<model_name> error: <details>"``
        when the request or the response parsing fails.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODEL_ENDPOINTS``.
    """
    endpoint = MODEL_ENDPOINTS[model_name]

    headers = {"Content-Type": "application/json"}
    # Only authenticate when a token is configured; otherwise the header
    # would literally read "Bearer None".
    if HF_API_KEY:
        headers["Authorization"] = f"Bearer {HF_API_KEY}"

    # Build full conversation history for context
    conversation = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)

    # Model-specific prompt formatting with full history
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        ),
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"],
    }

    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            # The text-generation API names these "max_new_tokens" and "stop";
            # the previous "max_tokens" / "stop_sequences" keys are not part
            # of its parameter schema.
            "max_new_tokens": 2048,
            "temperature": 0.7,
            "stop": stop_sequences[model_name],
            "return_full_text": False,
        },
    }

    try:
        response = requests.post(endpoint, json=payload, headers=headers, timeout=timeout)
        response.raise_for_status()
        result = response.json()[0]["generated_text"]

        # Clean up response formatting
        result = result.split("<|")[0]  # Remove any remaining special tokens
        result = result.replace("**", "").replace("##", "")  # Remove markdown emphasis
        return result.strip()  # Remove leading/trailing whitespace
    except Exception as e:
        # Deliberate best-effort boundary: any network, HTTP or parsing
        # failure is reported as text so the chat keeps running.
        return f"{model_name} error: {str(e)}"
def respond(message: str, history: List[List[str]], session_id: str) -> Generator[str, None, None]:
    """Handle sequential model responses with context preservation.

    Loads the stored session, appends the new user message, then queries the
    three models one after another. Each model sees the stored conversation
    plus the replies of the models that ran before it in this turn.

    Args:
        message: The new user message.
        history: Chat history supplied by the UI. Unused here; the context is
            rebuilt from the stored session instead.
        session_id: Identifier passed to ``session_manager`` to load and save
            the session.

    Yields:
        The text to display so far: the finished replies joined by blank
        lines, followed by an "is thinking..." line while a model is queried.
    """
    # (status icon, model name) in the order the models speak.
    pipeline = (
        ("🔵", "Qwen2.5-Coder-32B-Instruct"),
        ("🟣", "Qwen2.5-72B-Instruct"),
        ("🟡", "Llama3.3-70B-Instruct"),
    )

    # Load or initialize session
    session = session_manager.load_session(session_id)
    if not isinstance(session, dict) or "history" not in session:
        session = {"history": []}

    # Build context from session history; assistant entries are prefixed with
    # the model that wrote them so later models can tell the speakers apart.
    messages = []
    for entry in session["history"]:
        if entry["type"] == "user":
            messages.append({"role": "user", "content": entry["content"]})
        else:
            messages.append({"role": "assistant", "content": f"{entry['model']}: {entry['content']}"})

    # Add current message
    messages.append({"role": "user", "content": message})
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "user",
        "content": message,
    })

    finished: List[str] = []  # rendered sections of the models that already replied
    for icon, model_name in pipeline:
        # Show everything produced so far plus a progress line for this model.
        yield "\n\n".join([*finished, f"{icon} {model_name} is thinking..."])

        reply = query_model(model_name, messages)
        session["history"].append({
            "timestamp": datetime.now().isoformat(),
            "type": "assistant",
            "model": model_name,
            "content": reply,
        })
        messages.append({"role": "assistant", "content": f"{model_name}: {reply}"})

        # Persist after every reply so a cancelled or interrupted stream does
        # not lose the user message and the replies gathered so far.
        session_manager.save_session(session_id, session)

        finished.append(f"{icon} **{model_name}**\n{reply}")
        yield "\n\n".join(finished)
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Multi-LLM Collaboration Chat")

    with gr.Row():
        # The callable itself (not its result) is handed to gr.State.
        # NOTE(review): presumably Gradio invokes it on each page load so every
        # visitor starts with a fresh session id — confirm against the
        # installed Gradio version.
        session_id = gr.State(session_manager.create_session)
        # NOTE(review): the label's leading character looks like a mis-encoded
        # emoji — confirm the intended glyph.
        new_session = gr.Button("π New Session")

    # Add latex_delimiters to enable LaTeX rendering.
    # "$$" must be listed before "$": delimiters are matched in order, so a
    # leading "$" rule would catch "$$...$$" as inline math and break display
    # equations.
    chatbot = gr.Chatbot(
        height=600,
        latex_delimiters=[
            {"left": "$$", "right": "$$", "display": True},  # display math
            {"left": "$", "right": "$", "display": False},  # inline math
        ],
    )
    msg = gr.Textbox(label="Message")

    def on_new_session():
        """Create a new session and return its id with an empty chat history."""
        new_id = session_manager.create_session()
        return new_id, []

    def user(message, history, session_id):
        """Clear the textbox and append the message as a pending chat turn."""
        # None marks the turn as awaiting a reply; bot() fills it in.
        return "", history + [[message, None]]

    def bot(history, session_id):
        """Stream the models' combined reply into the pending chat turn."""
        if history and history[-1][1] is None:
            message = history[-1][0]
            # The pending turn is excluded from the history passed to respond().
            for response in respond(message, history[:-1], session_id):
                history[-1][1] = response
                yield history

    # First echo the user's message, then stream the models' replies.
    msg.submit(user, [msg, chatbot, session_id], [msg, chatbot]).then(
        bot, [chatbot, session_id], [chatbot]
    )
    new_session.click(on_new_session, None, [session_id, chatbot])

if __name__ == "__main__":
    demo.launch(share=True)
|