File size: 7,067 Bytes
43b5bef
c1e5d4c
518be16
c1e5d4c
0d6849e
cc3006a
 
43b5bef
cc3006a
 
cf38aa5
43b5bef
42b5787
43b5bef
 
 
 
 
 
c1e5d4c
8190eb3
c1e5d4c
518be16
 
c1e5d4c
518be16
77ac272
6733659
 
 
8190eb3
c9870b1
 
5e8e5d5
6733659
c9870b1
 
6733659
 
 
c9870b1
 
6733659
4766698
c9870b1
 
 
 
 
 
 
 
 
 
c1e5d4c
c9870b1
6617dfe
42b5787
 
c9870b1
 
6617dfe
518be16
77ac272
c1e5d4c
 
 
c9870b1
8190eb3
 
 
 
 
c1e5d4c
c9870b1
c1e5d4c
fdaf591
8190eb3
57a76f2
0d6849e
57a76f2
 
73c4292
28f1fca
 
 
 
 
 
 
 
 
 
0d6849e
 
 
 
 
28f1fca
fdaf591
 
327109c
cc3006a
 
 
 
 
 
28f1fca
fdaf591
f32ce56
fdaf591
 
327109c
cc3006a
 
 
 
 
 
28f1fca
fdaf591
f32ce56
fdaf591
 
327109c
cc3006a
 
 
 
 
 
28f1fca
 
8190eb3
cc3006a
28f1fca
fdaf591
8190eb3
43b5bef
8190eb3
fdaf591
8190eb3
 
fdaf591
 
 
 
8190eb3
 
 
 
 
 
 
 
 
 
 
fdaf591
 
 
 
 
 
 
 
 
 
 
 
 
8190eb3
fdaf591
 
 
 
43b5bef
c1e5d4c
cc3006a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import gradio as gr
import os
import requests
import threading
from datetime import datetime
from typing import List, Dict, Any, Generator
from session_manager import SessionManager

# Shared session store, plus the API token read from the environment
# (HF_API_KEY is None when the variable is unset).
session_manager = SessionManager()
HF_API_KEY = os.environ.get("HF_API_KEY")

# Display name -> model path under the Inference API's /models/ route.
_MODEL_PATHS = {
    "Qwen2.5-72B-Instruct": "Qwen/Qwen2.5-72B-Instruct",
    "Llama3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct",
    "Qwen2.5-Coder-32B-Instruct": "Qwen/Qwen2.5-Coder-32B-Instruct",
}

# Display name -> full endpoint URL, in the same order as _MODEL_PATHS.
MODEL_ENDPOINTS = {
    display_name: "https://api-inference.huggingface.co/models/" + model_path
    for display_name, model_path in _MODEL_PATHS.items()
}

def query_model(model_name: str, messages: List[Dict[str, str]], timeout: float = 120.0) -> str:
    """Query a single model with the chat history.

    The conversation is flattened into ``role: content`` lines and embedded in
    a model-specific system prompt that ends with an assistant lead-in, then
    posted to the model's text-generation endpoint.

    Args:
        model_name: Key into ``MODEL_ENDPOINTS`` selecting the model to call.
        messages: Chat history; each item needs ``role`` and ``content`` keys.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The cleaned generated text. On any request or parsing failure the
        function does not raise; it returns ``"<model_name> error: <details>"``
        so the caller can show the failure in the chat and keep going.

    Raises:
        KeyError: If ``model_name`` is not configured, or a message lacks
            ``role``/``content``.
    """
    endpoint = MODEL_ENDPOINTS[model_name]
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }

    # Build full conversation history for context
    conversation = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])

    # Model-specific prompt formatting with full history
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        )
    }

    # Model-specific stop sequences
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"]
    }

    # The text-generation task schema names the length cap `max_new_tokens`
    # and the stop list `stop` (not `max_tokens` / `stop_sequences`), so use
    # those keys to make sure the settings actually take effect.
    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            "max_new_tokens": 2048,
            "temperature": 0.7,
            "stop": stop_sequences[model_name],
            "return_full_text": False
        }
    }

    # Without a token the header would read "Bearer None"; fail with a clear
    # message instead of sending a request that cannot authenticate.
    if not HF_API_KEY:
        return f"{model_name} error: HF_API_KEY environment variable is not set"

    try:
        # A timeout keeps a stalled endpoint from blocking the chat forever.
        response = requests.post(endpoint, json=payload, headers=headers, timeout=timeout)
        response.raise_for_status()
        result = response.json()[0]['generated_text']
        # Clean up response formatting
        result = result.split('<|')[0]  # Drop everything from the first special-token marker on
        result = result.replace('**', '').replace('##', '')  # Remove markdown emphasis
        result = result.strip()  # Remove leading/trailing whitespace
        return result
    except Exception as e:
        # Deliberately broad: network, HTTP and payload-shape failures are all
        # reported in-band so one failing model does not abort the round.
        return f"{model_name} error: {str(e)}"

# Order in which the models answer, paired with the marker shown in the chat.
_RESPONSE_PIPELINE = (
    ("🔵", "Qwen2.5-Coder-32B-Instruct"),
    ("🟣", "Qwen2.5-72B-Instruct"),
    ("🟡", "Llama3.3-70B-Instruct"),
)


def respond(message: str, history: List[List[str]], session_id: str) -> Generator[str, None, None]:
    """Handle sequential model responses with context preservation.

    Each model in ``_RESPONSE_PIPELINE`` is queried in turn and sees the stored
    session history, the new user message, and the replies of the models that
    answered before it.

    Args:
        message: The new user message.
        history: Chat history from the UI. Not read here; the context comes
            from the persisted session instead. Kept for interface
            compatibility.
        session_id: Identifier passed to the session manager to load and save
            the conversation.

    Yields:
        Progressively longer markdown strings: a "is thinking..." status before
        each model call and the accumulated replies after it. The session is
        saved once all models have answered, just before the final yield.
    """
    # Load or initialize session
    session = session_manager.load_session(session_id)
    if not isinstance(session, dict) or "history" not in session:
        session = {"history": []}

    # Build context from session history
    messages = []
    for entry in session["history"]:
        if entry["type"] == "user":
            messages.append({"role": "user", "content": entry["content"]})
        else:
            messages.append({"role": "assistant", "content": f"{entry['model']}: {entry['content']}"})

    # Add current message
    messages.append({"role": "user", "content": message})
    session["history"].append({
        "timestamp": datetime.now().isoformat(),
        "type": "user",
        "content": message
    })

    # Rendered replies of the models that have already answered this turn
    sections: List[str] = []
    last_index = len(_RESPONSE_PIPELINE) - 1

    for index, (marker, model_name) in enumerate(_RESPONSE_PIPELINE):
        # Status line appended below whatever has been answered so far
        yield "\n\n".join(sections + [f"{marker} {model_name} is thinking..."])

        reply = query_model(model_name, messages)
        session["history"].append({
            "timestamp": datetime.now().isoformat(),
            "type": "assistant",
            "model": model_name,
            "content": reply
        })
        # Later models see this reply attributed to its author
        messages.append({"role": "assistant", "content": f"{model_name}: {reply}"})
        sections.append(f"{marker} **{model_name}**\n{reply}")

        # The last model's reply is emitted after the session is saved
        if index < last_index:
            yield "\n\n".join(sections)

    # Save final session state
    session_manager.save_session(session_id, session)

    # Return final combined response
    yield "\n\n".join(sections)

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Multi-LLM Collaboration Chat")

    with gr.Row():
        # NOTE(review): the bound method is passed uncalled. Gradio documents
        # that a callable State value is invoked on each app load, which would
        # give every page load its own session id — confirm for the installed
        # Gradio version.
        session_id = gr.State(session_manager.create_session)
        new_session = gr.Button("🔄 New Session")

    # Add latex_delimiters to enable LaTeX rendering.
    # "$$" is listed before "$" because the KaTeX auto-render matcher tries
    # delimiters in list order; with "$" first, "$$...$$" would be split as
    # empty inline math instead of one display block.
    chatbot = gr.Chatbot(
        height=600,
        latex_delimiters=[
            {"left": "$$", "right": "$$", "display": True},  # display math
            {"left": "$", "right": "$", "display": False}    # inline math
        ]
    )

    msg = gr.Textbox(label="Message")

    def on_new_session():
        """Create a fresh session and clear the chat window."""
        new_id = session_manager.create_session()
        return new_id, []

    def user(message, history, session_id):
        """Clear the textbox and append the message as a pending chat turn."""
        # The reply slot is None so that bot() can recognise the open turn.
        return "", history + [[message, None]]

    def bot(history, session_id):
        """Stream the models' combined reply into the pending chat turn."""
        if history and history[-1][1] is None:
            message = history[-1][0]
            # Each value from respond() replaces the previous one, so the chat
            # shows the latest combined text while the models take turns.
            for response in respond(message, history[:-1], session_id):
                history[-1][1] = response
                yield history

    # Submitting first records the user turn, then streams the model replies.
    msg.submit(user, [msg, chatbot, session_id], [msg, chatbot]).then(
        bot, [chatbot, session_id], [chatbot]
    )
    new_session.click(on_new_session, None, [session_id, chatbot])

if __name__ == "__main__":
    # share=True asks Gradio for a public share link in addition to the local
    # server (per Gradio's launch() documentation).
    demo.launch(share=True)