import gradio as gr
import os
import requests
import time
from datetime import datetime
from typing import List, Dict
from session_manager import SessionManager # only if you need sessions
# Initialize session manager and get HF API key (adjust if not using sessions)
session_manager = SessionManager()
HF_API_KEY = os.getenv("HF_API_KEY")
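# Note: on a Hugging Face Space, HF_API_KEY would typically be set as a repository
# secret; locally it can be exported in the shell before launching the app.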
# Model endpoints configuration
MODEL_ENDPOINTS = {
"Qwen2.5-72B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct",
"Llama3.3-70B-Instruct": "https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
"Qwen2.5-Coder-32B-Instruct": "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B-Instruct",
}
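# These are the serverless Inference API endpoints for each model. The meta-llama
# repository is gated on the Hub, so the token in HF_API_KEY needs to have been
# granted access to Llama 3.3 for that endpoint to respond.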
def query_model(model_name: str, messages: List[Dict[str, str]]) -> str:
"""
Query a single model with the conversation so far (list of dicts with 'role' and 'content').
"""
endpoint = MODEL_ENDPOINTS[model_name]
headers = {
"Authorization": f"Bearer {HF_API_KEY}",
"Content-Type": "application/json"
}
# Combine conversation into a single string (simple example)
conversation = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
    # Model-specific prompt formatting
    model_prompts = {
        "Qwen2.5-72B-Instruct": (
            f"<|im_start|>system\nCollaborate with other experts:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nMy analysis:"
        ),
        "Llama3.3-70B-Instruct": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Build on the conversation:\n{conversation}<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\nMy contribution:"
        ),
        "Qwen2.5-Coder-32B-Instruct": (
            f"<|im_start|>system\nTechnical discussion context:\n{conversation}<|im_end|>\n"
            "<|im_start|>assistant\nTechnical perspective:"
        )
    }
    stop_sequences = {
        "Qwen2.5-72B-Instruct": ["<|im_end|>", "<|endoftext|>"],
        "Llama3.3-70B-Instruct": ["<|eot_id|>", "\nuser:"],
        "Qwen2.5-Coder-32B-Instruct": ["<|im_end|>", "<|endoftext|>"]
    }
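    # The strings above hand-roll each model's chat template: ChatML-style
    # <|im_start|>/<|im_end|> tags for the Qwen models and the header/eot tags used
    # by Llama 3.x. An alternative would be tokenizer.apply_chat_template or the
    # chat-completion endpoint, which apply the template server-side.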
    payload = {
        "inputs": model_prompts[model_name],
        "parameters": {
            "max_new_tokens": 1024,  # the text-generation API expects max_new_tokens, not max_tokens
            "temperature": 0.7,
            "stop_sequences": stop_sequences[model_name],
            "return_full_text": False
        }
    }
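    # return_full_text=False asks the Inference API to return only the completion,
    # not the prompt, so the response can be shown in the chat as-is.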
    try:
        # Generous timeout: serverless endpoints can be slow on a cold start
        response = requests.post(endpoint, json=payload, headers=headers, timeout=120)
        response.raise_for_status()
        generated = response.json()[0]["generated_text"]
        # Clean up possible leftover special tokens
        generated = generated.split("<|")[0].strip()
        return generated
    except Exception as e:
        return f"{model_name} error: {str(e)}"
def on_new_session():
"""Create a new session and clear the chat."""
new_id = session_manager.create_session()
return new_id, []
def user_message(user_msg, history, session_id):
"""
After the user hits enter, append the user's message to the conversation.
Return updated conversation so the UI can display it.
"""
if not user_msg.strip():
return "", history # if user didn't type anything
# Append the new user message to the conversation
history.append({"role": "user", "content": user_msg})
return "", history
def bot_reply(history, session_id):
"""
Stream the multi-model response. We rely on the *last* user message in `history`,
then call each model in turn, appending partial updates. Yields updated conversation each time.
"""
if not history or history[-1]["role"] != "user":
return # There's no new user message to respond to
# Optionally load existing session, if you have session logic
session = session_manager.load_session(session_id) if session_id else None
if session is None:
session = {"history": []}
# 1) Qwen2.5-Coder-32B
# Add an assistant message placeholder
history.append({"role": "assistant", "content": "πŸ”΅ Qwen2.5-Coder-32B-Instruct is thinking..."})
yield history
    resp1 = query_model("Qwen2.5-Coder-32B-Instruct", history)
    updated_content = f"πŸ”΅ **Qwen2.5-Coder-32B-Instruct**\n{resp1}"
    history[-1]["content"] = updated_content
    yield history
    # 2) Qwen2.5-72B
    updated_content += "\n\n🟣 Qwen2.5-72B-Instruct is thinking..."
    history[-1]["content"] = updated_content
    yield history
    resp2 = query_model("Qwen2.5-72B-Instruct", history)
    updated_content += f"\n\n🟣 **Qwen2.5-72B-Instruct**\n{resp2}"
    history[-1]["content"] = updated_content
    yield history
    # 3) Llama3.3-70B
    updated_content += "\n\n🟑 Llama3.3-70B-Instruct is thinking..."
    history[-1]["content"] = updated_content
    yield history
    resp3 = query_model("Llama3.3-70B-Instruct", history)
    updated_content += f"\n\n🟑 **Llama3.3-70B-Instruct**\n{resp3}"
    history[-1]["content"] = updated_content
    yield history
    # Save the session, if needed
    session["history"] = history
    session_manager.save_session(session_id, session)
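# Because bot_reply is a generator, each `yield history` pushes a partial update to
# the Chatbot, so the three model responses appear incrementally inside a single
# assistant message.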
def clear_chat():
"""
Clears the Chatbot entirely (set it to an empty list).
"""
return []
# Build the Gradio Blocks interface
with gr.Blocks() as demo:
gr.Markdown("## Multi-LLM Collaboration Chat (Streaming)")
with gr.Row():
session_id = gr.State(session_manager.create_session)
new_session_btn = gr.Button("πŸ”„ New Session")
# Chatbot with "type='messages'" for streaming messages and LaTeX delimiters
chatbot = gr.Chatbot(
type="messages",
height=550,
latex_delimiters=[
{"left": "$", "right": "$", "display": False}, # inline math
{"left": "$$", "right": "$$", "display": True} # display math
]
)
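    # With type="messages", the Chatbot expects a list of {"role", "content"} dicts,
    # which is exactly what user_message and bot_reply build.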
    msg = gr.Textbox(label="Your Message")
    clear_btn = gr.Button("Clear")
    # Wire up the events:
    # 1) On user submit:
    msg.submit(
        fn=user_message,
        inputs=[msg, chatbot, session_id],
        outputs=[msg, chatbot],
        queue=False
    ).then(
        fn=bot_reply,
        inputs=[chatbot, session_id],
        outputs=[chatbot]
    )
    # 2) On "Clear" click, empty the chat:
    clear_btn.click(fn=clear_chat, outputs=chatbot, queue=False)
    # 3) On "New Session" click, get a fresh session ID and clear the chat:
    new_session_btn.click(fn=on_new_session, outputs=[session_id, chatbot], queue=False)
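    # user_message runs with queue=False for an instant echo of the typed message;
    # bot_reply stays on the default queue, which streaming generator callbacks
    # go through in Gradio.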
if __name__ == "__main__":
    demo.launch()