import gradio as gr
import os
import requests
import threading
from typing import List, Dict

# Get the Hugging Face API key from Spaces secrets
HF_API_KEY = os.getenv("HF_API_KEY")

# Model endpoints configuration
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}
def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query a single model on the HF Inference API with the chat history."""
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }

    # Flatten the chat history into a plain "role: content" prompt and cue the assistant turn
    prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)
    prompt += "\nassistant:"

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 1024,     # the Inference API expects max_new_tokens, not max_tokens
            "temperature": 0.7,
            "return_full_text": False,  # return only the completion, not the echoed prompt
            "stop": ["\nuser:", "\nassistant:", "###"]  # TGI-style stop sequences, matching the prompt's role labels
        }
    }

    try:
        response = requests.post(endpoint, json=payload, headers=headers, timeout=120)
        response.raise_for_status()
        return response.json()[0]["generated_text"].strip()
    except Exception as e:
        return f"Error from {model_name}: {e}"
def respond(message: str, history: List[List[str]]) -> str:
    """Query all models concurrently and combine their answers into one reply."""
    # Rebuild the conversation in role/content form; history arrives as
    # [user, assistant] pairs (Gradio's default tuple format)
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Query the models concurrently, one thread per model
    threads = []
    results = {}

    def get_model_response(model_name):
        results[model_name] = query_model(model_name, messages)

    for model_name in MODEL_ENDPOINTS:
        thread = threading.Thread(target=get_model_response, args=(model_name,))
        thread.start()
        threads.append(thread)

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Label each model's reply and return them as a single Markdown message
    responses = []
    for model_name in MODEL_ENDPOINTS:
        responses.append(f"**{model_name}**:\n{results[model_name]}")
    return "\n\n".join(responses)
# Create the Gradio interface
chat_interface = gr.ChatInterface(
    respond,
    title="Multi-LLM Collaboration Chat",
    description="A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B",
    examples=["How can I optimize Python code?", "Explain quantum computing basics"],
    theme="soft",
)

if __name__ == "__main__":
    # share=True is not needed (and is ignored) when running on Spaces
    chat_interface.launch()