import gradio as gr
import os
import requests
from datetime import datetime
from typing import List, Dict

from session_manager import SessionManager

# Initialize session manager and get HF API key
session_manager = SessionManager()
HF_API_KEY = os.getenv("HF_API_KEY")

# Model endpoints configuration
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}


def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query a single model with the chat history."""
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json",
    }

    # Build full conversation history for context
    conversation = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])

    # Model-specific prompt formatting with full history
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        ),
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"],
    }

    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            "max_new_tokens": 2048,  # HF text-generation parameter ("max_tokens" is the OpenAI-style name)
            "temperature": 0.7,
            # Depending on the serving backend, the accepted key may be "stop" instead
            "stop_sequences": stop_sequences[model_name],
            "return_full_text": False,
        },
    }

    try:
        # Timeout avoids hanging indefinitely on a slow or cold endpoint
        response = requests.post(endpoint, json=payload, headers=headers, timeout=120)
        response.raise_for_status()
        result = response.json()[0]["generated_text"]

        # Clean up response formatting
        result = result.split("<|")[0]                        # Remove any remaining special tokens
        result = result.replace("**", "").replace("##", "")   # Remove markdown
        result = result.strip()                               # Remove leading/trailing whitespace
        return result                                         # Return complete response
    except Exception as e:
        return f"{model_name} error: {str(e)}"


def respond(message: str, history: List[List[str]], session_id: str) -> tuple[str, str]:
    """Handle sequential model responses with context preservation."""
    # Load or initialize session
    session = session_manager.load_session(session_id)
    if not isinstance(session, dict) or "history" not in session:
        session = {"history": []}

    # Build context from session history
    messages = []
    for entry in session["history"]:
        if entry["type"] == "user":
            messages.append({"role": "user", "content": entry["content"]})
        else:
            messages.append({"role": "assistant", "content": f"{entry['model']}: {entry['content']}"})

    # Add current message
    messages.append({"role": "user", "content": message})
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "user",
        "content": message,
    })

    responses = []

    # Get first model's response
    response1 = query_model("Qwen2.5-Coder-32B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Qwen2.5-Coder-32B-Instruct",
        "content": response1,
    })
    messages.append({"role": "assistant", "content": f"Qwen2.5-Coder-32B-Instruct: {response1}"})
    responses.append(f"**Qwen2.5-Coder-32B-Instruct**:\n{response1}")

    # Get second model's response
    response2 = query_model("Qwen2.5-72B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Qwen2.5-72B-Instruct",
        "content": response2,
    })
    messages.append({"role": "assistant", "content": f"Qwen2.5-72B-Instruct: {response2}"})
    responses.append(f"**Qwen2.5-72B-Instruct**:\n{response2}")

    # Get final model's response
    response3 = query_model("Llama3.3-70B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Llama3.3-70B-Instruct",
        "content": response3,
    })
    messages.append({"role": "assistant", "content": f"Llama3.3-70B-Instruct: {response3}"})
    responses.append(f"**Llama3.3-70B-Instruct**:\n{response3}")

    # Save final session state
    session_manager.save_session(session_id, session)

    # Return response as a single tuple for Gradio chat
    return message, "\n\n".join(responses)


# Create the Gradio interface
with gr.Blocks() as demo:
    session_id = gr.State(session_manager.create_session)

    gr.Markdown("## Multi-LLM Collaboration Chat")
    gr.Markdown("A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B")

    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Message")
    clear = gr.Button("Clear")

    def user(message, history, session_id):
        # Append the user's message with a placeholder for the combined bot reply
        return "", history + [[message, None]]

    def bot(history, session_id):
        # Only generate if the last turn has no reply yet
        if history[-1][1] is None:
            message = history[-1][0]
            _, response = respond(message, history[:-1], session_id)
            history[-1][1] = response
            return history
        return history

    msg.submit(user, [msg, chatbot, session_id], [msg, chatbot]).then(
        bot, [chatbot, session_id], [chatbot]
    )
    clear.click(
        lambda: (session_manager.create_session(), None, []),
        None,
        [session_id, msg, chatbot],
        queue=False,
    )

if __name__ == "__main__":
    demo.launch(share=True)