import gradio as gr
import os
import requests
from typing import Dict, Iterator, List

# Get the Hugging Face API key from Spaces secrets
HF_API_KEY = os.getenv("HF_API_KEY")
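# Optional guard (an added suggestion, assuming the Space secret is named
# HF_API_KEY): warn early if the token is missing, since gated models such as
# Llama-3.3 require authentication.
if not HF_API_KEY:
    print("Warning: HF_API_KEY is not set; requests to gated models will fail.")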

# Model endpoints configuration
MODEL_ENDPOINTS = {
    "Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}
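
# Optional sanity check (an illustrative sketch, not called by the app): send a
# minimal request to one endpoint to confirm the token and endpoint respond.
# The exact response body varies by model and API version.
def check_endpoint(model_name: str = "Qwen2.5-Coder-32B-Instruct") -> bool:
    """Return True if the endpoint answers a one-token request with HTTP 200."""
    resp = requests.post(
        MODEL_ENDPOINTS[model_name],
        headers={"Authorization": f"Bearer {HF_API_KEY}"},
        json={"inputs": "ping", "parameters": {"max_new_tokens": 1}},
        timeout=30,
    )
    return resp.status_code == 200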

def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
    """Query a single model with the chat history"""
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }
    
    # Build full conversation history for context
    conversation = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
    
    # Model-specific prompt formatting with full history
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        )
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"]
    }

    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            "max_tokens": 2048,
            "temperature": 0.7,
            "stop_sequences": stop_sequences[model_name],
            "return_full_text": False
        }
    }
    
    try:
        # Generous timeout: large models on the shared API can queue or cold-start
        response = requests.post(endpoint, json=payload, headers=headers, timeout=120)
        response.raise_for_status()
        result = response.json()[0]['generated_text']
        # Clean up response formatting
        result = result.split('<|')[0]  # Remove any remaining special tokens
        result = result.replace('**', '').replace('##', '')  # Remove markdown
        result = result.strip()  # Remove leading/trailing whitespace
        return result  # Return complete response
    except Exception as e:
        return f"{model_name} error: {str(e)}"

def respond(message: str, history: List[List[str]]) -> Iterator[str]:
    """Stream sequential model responses that share a continuous context"""
    
    # Build full conversation history from previous interactions
    conversation = []
    if history:
        for user_msg, assistant_msg in history:
            conversation.append({"role": "user", "content": user_msg})
            if assistant_msg:
                # Split assistant message into individual model responses
                responses = assistant_msg.split("\n\n")
                for resp in responses:
                    if resp:
                        conversation.append({"role": "assistant", "content": resp})
    
    # Add current message
    conversation.append({"role": "user", "content": message})
    
    # Get first model's response. Each yield replaces the displayed chat
    # message, so accumulate the responses rather than yielding them
    # separately; this also matches the "\n\n" splitting used above when
    # rebuilding history.
    response1 = query_model("Qwen2.5-Coder-32B-Instruct", conversation)
    output = f"**Qwen2.5-Coder-32B-Instruct**:\n{response1}"
    yield output
    
    # Add first response to context
    conversation.append({
        "role": "assistant",
        "content": f"**Qwen2.5-Coder-32B-Instruct**:\n{response1}"
    })
    
    # Get second model's response
    response2 = query_model("Qwen2.5-72B-Instruct", conversation)
    output += f"\n\n**Qwen2.5-72B-Instruct**:\n{response2}"
    yield output
    
    # Add second response to context
    conversation.append({
        "role": "assistant",
        "content": f"**Qwen2.5-72B-Instruct**:\n{response2}"
    })
    
    # Get final model's response
    response3 = query_model("Llama3.3-70B-Instruct", conversation)
    output += f"\n\n**Llama3.3-70B-Instruct**:\n{response3}"
    yield output

# Create the Gradio interface
chat_interface = gr.ChatInterface(
    respond,
    title="Multi-LLM Collaboration Chat",
    description="A group chat with Qwen2.5-72B, Llama3.3-70B, and Qwen2.5-Coder-32B",
    examples=["How can I optimize Python code?", "Explain quantum computing basics"],
    theme="soft"
)

if __name__ == "__main__":
    chat_interface.launch(share=True)