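"""Multi-LLM Collaboration Chat Space.

Gradio app that sends each user message to three hosted models in turn via the
Hugging Face Inference API, passing the running discussion to every model so
each response can build on the previous ones.
"""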
import gradio as gr
import os
import requests
from typing import List, Dict

# Get the Hugging Face API key from Spaces secrets
HF_API_KEY = os.getenv("HF_API_KEY")

# Model endpoints configuration
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}
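# Note: the token stored in HF_API_KEY must be allowed to call these endpoints;
# meta-llama/Llama-3.3-70B-Instruct is a gated repository, so the account behind
# the token may need to accept the model license first.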


def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query a single model with the chat history"""
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }

    # Build full conversation history for context
    conversation = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])

    # Model-specific prompt formatting with full history
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        )
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"]
    }

    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            "max_new_tokens": 2048,  # text-generation endpoint expects max_new_tokens, not max_tokens
            "temperature": 0.7,
            "stop_sequences": stop_sequences[model_name],
            "return_full_text": False
        }
    }

    try:
        response = requests.post(endpoint, json=payload, headers=headers)
        response.raise_for_status()
        result = response.json()[0]['generated_text']
        # Clean up response formatting
        result = result.split('<|')[0]  # Remove any remaining special tokens
        result = result.replace('**', '').replace('##', '')  # Remove markdown
        result = result.strip()  # Remove leading/trailing whitespace
        return result  # Return complete response
    except Exception as e:
        return f"{model_name} error: {str(e)}"


def respond(message: str, history: List[List[str]]):
    """Handle sequential model responses with continuous context"""
    # Build full conversation history from previous interactions
    conversation = []
    if history:
        for user_msg, assistant_msg in history:
            conversation.append({"role": "user", "content": user_msg})
            if assistant_msg:
                # Split assistant message into individual model responses
                responses = assistant_msg.split("\n\n")
                for resp in responses:
                    if resp:
                        conversation.append({"role": "assistant", "content": resp})

    # Add current message
    conversation.append({"role": "user", "content": message})

    # Get first model's response; yields accumulate so the chat message keeps
    # all three answers separated by blank lines, matching the "\n\n" split
    # used when rebuilding history above
    response1 = query_model("Qwen2.5-Coder-32B-Instruct", conversation)
    output = f"**Qwen2.5-Coder-32B-Instruct**:\n{response1}"
    yield output

    # Add first response to context
    conversation.append({
        "role": "assistant",
        "content": f"**Qwen2.5-Coder-32B-Instruct**:\n{response1}"
    })

    # Get second model's response
    response2 = query_model("Qwen2.5-72B-Instruct", conversation)
    output += f"\n\n**Qwen2.5-72B-Instruct**:\n{response2}"
    yield output

    # Add second response to context
    conversation.append({
        "role": "assistant",
        "content": f"**Qwen2.5-72B-Instruct**:\n{response2}"
    })

    # Get final model's response
    response3 = query_model("Llama3.3-70B-Instruct", conversation)
    output += f"\n\n**Llama3.3-70B-Instruct**:\n{response3}"
    yield output


# Create the Gradio interface
chat_interface = gr.ChatInterface(
    respond,
    title="Multi-LLM Collaboration Chat",
    description="A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B",
    examples=["How can I optimize Python code?", "Explain quantum computing basics"],
    theme="soft"
)

if __name__ == "__main__":
    chat_interface.launch(share=True)