import gradio as gr
import os
import requests
from datetime import datetime
from typing import List, Dict, Generator

# session_manager.py (not shown here) is expected to provide a SessionManager with
# create_session() -> session_id, load_session(session_id) -> dict, and
# save_session(session_id, session), as used below.
from session_manager import SessionManager

# Initialize session manager and read the Hugging Face API key from the environment
session_manager = SessionManager()
HF_API_KEY = os.getenv("HF_API_KEY")

# Model configurations
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}

MODEL_CONTEXT_WINDOWS = {
    "Qwen2.5-72B-Instruct": 128000,
    "Llama3.3-70B-Instruct": 128000,
    "Qwen2.5-Coder-32B-Instruct": 128000,
}

MODEL_MAX_TOKENS = {
    "Qwen2.5-72B-Instruct": 8192,
    "Llama3.3-70B-Instruct": 2048,
    "Qwen2.5-Coder-32B-Instruct": 8192,
}


def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query a single model with the full chat history."""
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json",
    }

    # Build the full conversation history for context
    conversation = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)

    # Model-specific prompt formatting with full history
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        ),
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"],
    }

    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            # The HF Inference API text-generation parameter is max_new_tokens, not max_tokens
            "max_new_tokens": MODEL_MAX_TOKENS[model_name],
            "temperature": 0.6,
            "stop_sequences": stop_sequences[model_name],
            "return_full_text": False,
        },
    }

    try:
        response = requests.post(endpoint, json=payload, headers=headers, timeout=120)
        response.raise_for_status()
        result = response.json()[0]["generated_text"]

        # Clean up response formatting
        result = result.split("<|")[0]                        # drop any trailing special tokens
        result = result.replace("**", "").replace("##", "")   # strip markdown emphasis/headers
        return result.strip()                                 # remove leading/trailing whitespace
    except Exception as e:
        return f"{model_name} error: {str(e)}"


def respond(message: str, history: List[List[str]], session_id: str) -> Generator[str, None, None]:
    """Handle sequential model responses with context preservation."""
    # Load or initialize the session
    session = session_manager.load_session(session_id)
    if not isinstance(session, dict) or "history" not in session:
        session = {"history": []}

    # Rebuild the message context from the stored session history
    messages = []
    for entry in session["history"]:
        if entry["type"] == "user":
            messages.append({"role": "user", "content": entry["content"]})
        else:
            messages.append({"role": "assistant", "content": f"{entry['model']}: {entry['content']}"})

    # Add the current user message
    messages.append({"role": "user", "content": message})
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "user",
        "content": message,
    })

    # First model
    yield "šŸ”µ Qwen2.5-Coder-32B-Instruct is thinking..."
    response1 = query_model("Qwen2.5-Coder-32B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Qwen2.5-Coder-32B-Instruct",
        "content": response1,
    })
    messages.append({"role": "assistant", "content": f"Qwen2.5-Coder-32B-Instruct: {response1}"})
    yield f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}"

    # Second model
    yield f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\nšŸŸ£ Qwen2.5-72B-Instruct is thinking..."
    response2 = query_model("Qwen2.5-72B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Qwen2.5-72B-Instruct",
        "content": response2,
    })
    messages.append({"role": "assistant", "content": f"Qwen2.5-72B-Instruct: {response2}"})
    yield f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\nšŸŸ£ **Qwen2.5-72B-Instruct**\n{response2}"

    # Final model
    yield (
        f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\n"
        f"šŸŸ£ **Qwen2.5-72B-Instruct**\n{response2}\n\n"
        "šŸŸ” Llama3.3-70B-Instruct is thinking..."
    )
    response3 = query_model("Llama3.3-70B-Instruct", messages)
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "assistant",
        "model": "Llama3.3-70B-Instruct",
        "content": response3,
    })
    messages.append({"role": "assistant", "content": f"Llama3.3-70B-Instruct: {response3}"})

    # Save the final session state
    session_manager.save_session(session_id, session)

    # Yield the final combined response
    yield (
        f"šŸ”µ **Qwen2.5-Coder-32B-Instruct**\n{response1}\n\n"
        f"šŸŸ£ **Qwen2.5-72B-Instruct**\n{response2}\n\n"
        f"šŸŸ” **Llama3.3-70B-Instruct**\n{response3}"
    )


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Multi-LLM Collaboration Chat")

    with gr.Row():
        session_id = gr.State(session_manager.create_session)
        new_session = gr.Button("šŸ”„ New Session")

    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(label="Message")

    def on_new_session():
        # Start a fresh session and clear the chat window
        new_id = session_manager.create_session()
        return new_id, []

    def user(message, history, session_id):
        # Append the user message and clear the textbox
        return "", history + [[message, None]]

    def bot(history, session_id):
        # Stream the sequential model responses into the last chat turn
        if history and history[-1][1] is None:
            message = history[-1][0]
            for response in respond(message, history[:-1], session_id):
                history[-1][1] = response
                yield history

    msg.submit(user, [msg, chatbot, session_id], [msg, chatbot]).then(
        bot, [chatbot, session_id], [chatbot]
    )
    new_session.click(on_new_session, None, [session_id, chatbot])


if __name__ == "__main__":
    demo.launch(share=True)