import gradio as gr
import os
import requests
import threading
from typing import List, Dict

# Get the Hugging Face API key from Spaces secrets
HF_API_KEY = os.getenv("HF_API_KEY")
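
# If the secret is missing, os.getenv returns None and every request below will
# fail with an authorization error. A minimal fail-fast guard (optional sketch,
# assuming the key is meant to be mandatory) could be added here:
#
#     if not HF_API_KEY:
#         raise RuntimeError("HF_API_KEY secret is not set for this Space")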

# Model endpoints configuration
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}
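
# The endpoints above all follow the serverless Inference API pattern
# "https://api-inference.huggingface.co/models/<org>/<model>". Assuming that
# pattern, another hosted model could be added the same way, e.g. (hypothetical
# extra entry, not part of this app):
#
#     MODEL_ENDPOINTS["Mixtral-8x7B-Instruct"] = (
#         "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
#     )
#
# Any new entry also needs a matching prompt template and stop sequences in
# query_model below.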

def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query a single model with the chat history"""
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }
    
    # Model-specific prompt formatting
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>user\n{messages[-1]['content']}<|im_end|>\n<|im_start|>assistant\n"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            f"{messages[-1]['content']}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>user\n{messages[-1]['content']}<|im_end|>\n<|im_start|>assistant\n"
        )
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"]
    }

    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            # The serverless text-generation endpoint expects `max_new_tokens`
            # and `stop` (not the OpenAI-style `max_tokens`/`stop_sequences`)
            "max_new_tokens": 1024,
            "temperature": 0.7,
            "stop": stop_sequences[model_name],
            "return_full_text": False
        }
    }
    
    try:
        # Time out so a stalled endpoint cannot hang its worker thread indefinitely
        response = requests.post(endpoint, json=payload, headers=headers, timeout=120)
        response.raise_for_status()
        result = response.json()[0]['generated_text']
        # Clean up response formatting
        result = result.split('<|')[0]  # Remove any remaining special tokens
        result = result.replace('**', '').replace('##', '')  # Remove markdown
        result = result.strip()  # Remove leading/trailing whitespace
        return result.split('\n\n')[0]  # Return only first paragraph
    except Exception as e:
        return f"{model_name} error: {str(e)}"

def respond(message: str, history: List[List[str]]) -> str:
    """Handle chat responses from all models"""
    # Prepare messages in OpenAI format; only the latest user turn is forwarded,
    # prior chat history is not sent to the models
    messages = [{"role": "user", "content": message}]
    
    # Create threads for concurrent model queries
    threads = []
    results = {}
    
    def get_model_response(model_name):
        results[model_name] = query_model(model_name, messages)
    
    for model_name in MODEL_ENDPOINTS:
        thread = threading.Thread(target=get_model_response, args=(model_name,))
        thread.start()
        threads.append(thread)
    
    # Wait for all threads to complete
    for thread in threads:
        thread.join()
    
    # Format responses from all models
    responses = []
    for model_name, response in results.items():
        responses.append(f"**{model_name}**:\n{response}")
    
    # Format responses with clear separation
    return "\n\n----------------------------------------\n\n".join(responses)

# Create the Gradio interface
chat_interface = gr.ChatInterface(
    respond,
    title="Multi-LLM Collaboration Chat",
    description="A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B",
    examples=["How can I optimize Python code?", "Explain quantum computing basics"],
    theme="soft"
)

if __name__ == "__main__":
    # share=True creates a public link when run locally; on Spaces it is ignored
    chat_interface.launch(share=True)