"""Gradio chat front end for a locally launched OpenChat-3.5 ochat server.

Running this module:

1. launches ``ochat.serving.openai_api_server`` as a child process,
2. starts a background watchdog thread that probes the server every
   ``MONITOR_INTERVAL_SECONDS`` seconds and tries to restart it when the
   probe fails, and
3. builds and launches a Gradio Blocks chat UI that forwards each user
   message to the server's ``/v1/chat/completions`` endpoint.
"""

# NOTE(review): import order is preserved from the original (``spaces`` first).
import spaces
import gradio as gr
import torch
import subprocess
import aiohttp
from gradio import State
import asyncio
import json
import threading

BASE_URL = "http://localhost:18888"
HEALTH_URL = f"{BASE_URL}/docs"
CHAT_URL = f"{BASE_URL}/v1/chat/completions"
MONITOR_INTERVAL_SECONDS = 60  # Pause between two health probes.
HEALTH_TIMEOUT_SECONDS = 10  # Upper bound for a single health probe.

# Popen handle of the most recently launched server, or None if none was
# launched from this process yet.
# NOTE(review): assumes the decorated launcher runs in this process so the
# handle persists between calls -- confirm for the target Spaces hardware.
_server_process = None


# Function to start the ochat server
@spaces.GPU
def start_ochat_server():
    """Launch the ochat API server as a child process.

    Prints CUDA diagnostics, then spawns the server unless a process
    previously launched by this function is still alive.

    Returns:
        A human-readable status string describing what happened.
    """
    global _server_process

    cuda_available = torch.cuda.is_available()
    print(f"Is CUDA available: {cuda_available}")
    if cuda_available:
        # Querying the device name raises when no CUDA device exists, so it
        # is only attempted once availability has been confirmed.
        print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

    # Do not stack a second server on top of one that is still running
    # (for example one that is still loading the model).
    if _server_process is not None and _server_process.poll() is None:
        return "ochat server is already running"

    command = [
        "python",
        "-m",
        "ochat.serving.openai_api_server",
        "--model",
        "openchat/openchat_3.5",
    ]

    # Start the server in a separate process
    try:
        _server_process = subprocess.Popen(command)
    except (OSError, subprocess.SubprocessError) as e:
        return f"Failed to start ochat server: {e}"
    return "ochat server started successfully"


async def _server_is_healthy():
    """Return True when the server's docs page answers with HTTP 200."""
    timeout = aiohttp.ClientTimeout(total=HEALTH_TIMEOUT_SECONDS)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(HEALTH_URL) as response:
                return response.status == 200
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Connection refused, reset, or no answer in time: treat as down.
        return False


async def monitor_server():
    """Probe the server forever and try to restart it when a probe fails.

    The coroutine never returns; it is meant to run on its own event loop
    in a background thread.
    """
    while True:
        # Sleep first so a freshly launched server gets time to come up
        # before the first probe.
        await asyncio.sleep(MONITOR_INTERVAL_SECONDS)  # Check every 60 seconds
        if await _server_is_healthy():
            print("Server is running.")
        else:
            print("Server is not running. Attempting to restart...")
            print(start_ochat_server())


def _run_monitor():
    """Thread entry point: drive ``monitor_server`` on a private event loop."""
    # A coroutine function cannot be a thread target directly: calling it
    # only creates a coroutine object that would never be awaited.
    asyncio.run(monitor_server())


print(start_ochat_server())

# Start the monitoring in a separate thread. It is a daemon thread so the
# endless monitoring loop does not keep the interpreter alive at shutdown.
thread = threading.Thread(target=_run_monitor, daemon=True)
thread.start()


# Function to send a message to the ochat server and get a response
async def chat_with_ochat(message):
    """Send one user message to the ochat server and return its reply.

    Args:
        message: Text of the user's message. Only this message is sent;
            earlier turns are not included in the request.

    Returns:
        The assistant's reply text, or a string starting with ``"Error:"``
        when the request fails or the response cannot be interpreted.
    """
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "openchat_3.5",
        "messages": [{"role": "user", "content": message}],
    }

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(CHAT_URL, headers=headers, json=data) as response:
                if response.status != 200:
                    return f"Error: Server responded with status code {response.status}"
                response_data = await response.json()
        except asyncio.TimeoutError:
            # str() of a timeout error is empty, so spell the cause out.
            return "Error: Request to the server timed out"
        except (aiohttp.ClientError, json.JSONDecodeError) as e:
            return f"Error: {e}"

    try:
        return response_data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        return "Error: Unexpected response format from server"


# Create a Gradio Blocks interface with session state
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## vLLM OpenChat-3.5 Interface")
    gr.Markdown(
        "### the vLLM server cannot handle concurrent users in spaces. "
        "If you get an error, run it on docker."
    )
    # The command is kept on one line without a backslash: "\ " is not a
    # valid string escape and would escape the space when pasted in a shell.
    gr.Markdown(
        "This will run better on your own machine: "
        "```docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all "
        "registry.hf.space/macadeliccc-openchat-3-5-chatbot:latest python app.py```"
    )
    message = gr.Textbox(label="Your Message", placeholder="Type your message here")
    chatbot = gr.Chatbot()
    clear = gr.Button("Clear")
    # Session state for chat history. Not wired to any event below; the
    # Chatbot component's own value carries the conversation.
    history = State([])

    async def user(message, history):
        """Append the new user turn to the history and clear the textbox."""
        return "", history + [[message, None]]

    async def bot(history):
        """Fill in the bot reply for the most recent user turn, if any."""
        if history and history[-1] and history[-1][0]:
            user_message = history[-1][0]
            bot_response = await chat_with_ochat(user_message)
            history[-1][1] = bot_response  # Update the last entry with the bot's response
        return history

    # First record the user's turn, then ask the server for the reply.
    message.submit(user, [message, chatbot], [message, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

app.queue()
app.launch()