Spaces:

luminoussg
/

choupijiang

Sleeping

App Files Files Community

choupijiang / app.py

luminoussg

Update app.py

89c6fc8 verified 4 months ago

raw

history blame

6.61 kB

	import gradio as gr
	import os
	import requests
	import threading
	from datetime import datetime
	from typing import List, Dict, Any, Generator
	from session_manager import SessionManager

	# Shared session store used by respond() to load and save per-session history,
	# plus the Hugging Face token (None when the variable is unset) that
	# query_model() sends as a Bearer credential.
	session_manager = SessionManager()
	HF_API_KEY = os.environ.get("HF_API_KEY")

	# Inference URL for every supported model, keyed by the display name that the
	# rest of the app uses to refer to it.
	_INFERENCE_API_ROOT = "https://api-inference.huggingface.co/models"
	MODEL_ENDPOINTS = {
	    display_name: f"{_INFERENCE_API_ROOT}/{repo_id}"
	    for display_name, repo_id in (
	        ("Qwen2.5-72B-Instruct", "Qwen/Qwen2.5-72B-Instruct"),
	        ("Llama3.3-70B-Instruct", "meta-llama/Llama-3.3-70B-Instruct"),
	        ("Qwen2.5-Coder-32B-Instruct", "Qwen/Qwen2.5-Coder-32B-Instruct"),
	    )
	}

	def query_model(
	    model_name: str,
	    messages: List[Dict[str, str]],
	    timeout: float = 120.0,
	) -> str:
	    """Query a single model with the chat history.

	    The conversation is flattened to ``role: content`` lines, embedded in a
	    model-specific system prompt, and posted to the model's endpoint from
	    ``MODEL_ENDPOINTS``.

	    Args:
	        model_name: Key of ``MODEL_ENDPOINTS``; it also selects the prompt
	            template and the stop sequences.
	        messages: Chat history as dicts with ``role`` and ``content`` keys.
	        timeout: Seconds to wait for the HTTP request before giving up, so a
	            stalled endpoint cannot block the chat indefinitely.

	    Returns:
	        The generated text with special tokens, ``**``/``##`` markers and
	        surrounding whitespace removed, or ``"<model_name> error: <reason>"``
	        when the request or the response parsing fails.

	    Raises:
	        KeyError: If ``model_name`` is not a key of ``MODEL_ENDPOINTS``.
	    """
	    endpoint = MODEL_ENDPOINTS[model_name]
	    headers = {
	        "Authorization": f"Bearer {HF_API_KEY}",
	        "Content-Type": "application/json",
	    }

	    # Build full conversation history for context
	    conversation = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)

	    # Model-specific prompt formatting with full history. The special tokens
	    # must be written literally as "<|...|>"; a backslash before the pipe is
	    # kept by Python and would corrupt the control tokens sent to the model.
	    model_prompts = {
	        "Qwen2.5-72B-Instruct": (
	            f"<|im_start|>system\nCollaborate with other experts. Previous discussion:\n{conversation}<|im_end|>\n"
	            "<|im_start|>assistant\nMy analysis:"
	        ),
	        "Llama3.3-70B-Instruct": (
	            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
	            f"Build upon this discussion:\n{conversation}<|eot_id|>\n"
	            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
	        ),
	        "Qwen2.5-Coder-32B-Instruct": (
	            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
	            "<|im_start|>assistant\nTechnical perspective:"
	        ),
	    }

	    # Model-specific stop sequences
	    stop_sequences = {
	        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
	        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
	        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"],
	    }

	    # NOTE(review): the text-generation endpoint documents "max_new_tokens";
	    # confirm that "max_tokens" and "stop_sequences" are honored as spelled.
	    payload = {
	        "inputs": model_prompts[model_name],
	        "parameters": {
	            "max_tokens": 2048,
	            "temperature": 0.7,
	            "stop_sequences": stop_sequences[model_name],
	            "return_full_text": False,
	        },
	    }

	    try:
	        response = requests.post(
	            endpoint, json=payload, headers=headers, timeout=timeout
	        )
	        response.raise_for_status()
	        result = response.json()[0]['generated_text']
	        # Clean up response formatting
	        result = result.split('<|')[0]  # Drop everything from the first special token on
	        result = result.replace('**', '').replace('##', '')  # Remove markdown markers
	        return result.strip()
	    except Exception as e:
	        # Deliberate best effort: any failure (network, timeout, HTTP status,
	        # unexpected response shape) is reported as text so the caller can
	        # still show the other models' answers.
	        return f"{model_name} error: {str(e)}"

	def respond(message: str, history: List[List[str]], session_id: str) -> str:
	    """Pass the user's message through three models in sequence.

	    Each model sees the stored session history, the new message, and the
	    replies of the models that ran before it. Every turn is appended to the
	    session, which is saved once all three models have answered.

	    Args:
	        message: The text the user just sent.
	        history: Chat history supplied by the UI; not read here because the
	            context is rebuilt from the stored session instead.
	        session_id: Identifier used to load and save the session.

	    Returns:
	        The three labelled replies joined by blank lines.
	    """
	    session = session_manager.load_session(session_id)
	    if not (isinstance(session, dict) and "history" in session):
	        # Anything that is not a usable session is replaced by a fresh one.
	        session = {"history": []}
	    log = session["history"]

	    # Rebuild the model-facing context from what the session already holds;
	    # assistant turns are prefixed with the model that produced them.
	    messages = [
	        {"role": "user", "content": entry["content"]}
	        if entry["type"] == "user"
	        else {"role": "assistant", "content": f"{entry['model']}: {entry['content']}"}
	        for entry in log
	    ]

	    # Record the incoming message in both the context and the session log.
	    messages.append({"role": "user", "content": message})
	    log.append({
	        "timestamp": datetime.now().isoformat(),
	        "type": "user",
	        "content": message,
	    })

	    # Models are consulted in this fixed order; each entry pairs the badge
	    # shown to the user with the model's display name.
	    pipeline = (
	        ("🔵", "Qwen2.5-Coder-32B-Instruct"),
	        ("🟣", "Qwen2.5-72B-Instruct"),
	        ("🟡", "Llama3.3-70B-Instruct"),
	    )

	    responses = []
	    for badge, model in pipeline:
	        reply = query_model(model, messages)
	        log.append({
	            "timestamp": datetime.now().isoformat(),
	            "type": "assistant",
	            "model": model,
	            "content": reply,
	        })
	        # Later models get to see this reply as part of their context.
	        messages.append({"role": "assistant", "content": f"{model}: {reply}"})
	        responses.append(f"{badge} {model}\n{reply}")

	    # Persist the session only after every model has answered.
	    session_manager.save_session(session_id, session)

	    return "\n\n".join(responses)

	# Stylesheet text handed to gr.ChatInterface through its css argument.
	# NOTE(review): nothing in this file emits these class names; presumably they
	# target Gradio's own chat markup -- verify they still match before relying on them.
	css = """
	.message { padding: 15px; margin: 10px 0; border-radius: 10px; }
	.assistant { background: #f8fafc; border-left: 4px solid #3b82f6; }
	.user { background: #eff6ff; border-left: 4px solid #60a5fa; }
	.model-name { font-weight: 600; color: #1e40af; margin-bottom: 8px; }
	.thinking { color: #6b7280; font-style: italic; }
	"""

	# Assemble the chat UI piece by piece, then wire everything into the interface.

	# create_session is passed uncalled; presumably Gradio invokes it to produce
	# the initial per-user state value -- confirm against the gr.State docs.
	_session_state = gr.State(session_manager.create_session)

	# Chat transcript widget. HTML sanitizing is switched off and markdown
	# rendering is on, so model output is displayed as-is.
	_chat_window = gr.Chatbot(
	    height=600,
	    show_label=False,
	    bubble_full_width=False,
	    show_copy_button=True,
	    container=True,
	    sanitize_html=False,
	    render_markdown=True,
	)

	_soft_theme = gr.themes.Soft(
	    primary_hue="blue",
	    secondary_hue="indigo",
	    neutral_hue="slate",
	    font=("Inter", "sans-serif"),
	)

	# Starter prompts offered to the user.
	_example_prompts = [
	    ["Explain how quantum computing works"],
	    ["Write a Python function to find prime numbers"],
	]

	demo = gr.ChatInterface(
	    fn=respond,
	    title="Multi-LLM Collaboration Chat",
	    description="Experience collaborative AI thinking with three powerful language models",
	    examples=_example_prompts,
	    additional_inputs=[_session_state],
	    chatbot=_chat_window,
	    theme=_soft_theme,
	    css=css,
	)

	def _main() -> None:
	    """Start the Gradio app with a public share link requested."""
	    demo.launch(share=True)


	if __name__ == "__main__":
	    _main()