import gradio as gr
from openai import OpenAI

# Configure the OpenAI client with your custom API endpoint and API key.
client = OpenAI(base_url="http://home.mayhew.cloud:1234/v1", api_key="lm-studio")

# UI text and styling
SYSTEM_PROMPT = "You are an assistant."

DESCRIPTION = '''
<h1>HealthAssistant</h1>
'''

LICENSE = """
"""
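# Optional sanity check: a minimal sketch, assuming an LM Studio (or other
# OpenAI-compatible) server is reachable at the base_url above. Listing the
# served models is a quick way to confirm connectivity before launching the
# UI; uncomment to use.
# for model in client.models.list().data:
#     print(model.id)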

" PLACEHOLDER = """

The "Doctor" is in.

Available 1:00pm - 5:00pm EST

""" css = """ h1 { text-align: center; display: block; } #duplicate-button { margin: auto; color: white; background: #1565c0; border-radius: 100vh; } """ def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int): """ Call the OpenAI ChatCompletion endpoint using the new client and yield streaming responses. Implements logic: - The assistant is forced to begin its answer with " ". - We then wait until a closing "" marker is received. - Only text after "" is displayed as the final answer. Args: message (str): The latest user message. history (list): Conversation history as a list of (user, assistant) tuples. temperature (float): Sampling temperature. max_new_tokens (int): Maximum tokens to generate. Yields: str: Partial cumulative output from the assistant. """ conversation = [] if not history: # Add a system prompt and initial assistant confirmation. conversation.append({"role": "system", "content": SYSTEM_PROMPT}) conversation.append({"role": "assistant", "content": "Understood!"}) for user_msg, assistant_msg in history: conversation.append({"role": "user", "content": user_msg}) conversation.append({"role": "assistant", "content": assistant_msg}) conversation.append({"role": "user", "content": message}) # Force the model to begin its answer with a "" block. conversation.append({"role": "assistant", "content": " "}) full_response = "" # Stores the raw assistant response (including the block). buffer = "" # Accumulates tokens until we detect the closing . display_text = "" # Holds text to display (only text after ). think_detected = False # Immediately yield a "thinking" status message. yield "A.I. Healthcare is Thinking! Please wait, your response will output shortly...\n\n" # Call the API with streaming enabled. response = client.chat.completions.create( model="model-identifier", # Replace with your actual model identifier. messages=conversation, temperature=temperature, max_tokens=max_new_tokens, stream=True, ) # Process streaming responses. for chunk in response: # Extract the new token text from the chunk. delta = chunk.choices[0].delta token_text = delta.content or "" full_response += token_text if not think_detected: # Accumulate tokens until we see the closing marker. buffer += token_text if "" in buffer: think_detected = True # Discard everything up to and including the "" marker. display_text = buffer.split("", 1)[1] yield display_text else: display_text += token_text yield display_text # Append the full (raw) response, including the section, to the conversation history. history.append((message, full_response)) # Create the Chatbot component. chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='HealthAssistant') # Build the Gradio interface. with gr.Blocks(css=css) as demo: gr.Markdown(DESCRIPTION) gr.ChatInterface( fn=chat_with_openai, chatbot=chatbot, fill_height=True, additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False), additional_inputs=[ gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.6, label="Temperature", render=False), gr.Slider(minimum=1024, maximum=4096, step=128, value=2048, label="Max new tokens", render=False), ], examples=[ ['What is, and do I need it?'], ['What medications help manage being invisible?'], ['How do I know if a clown is the right option?'], ['How can I access music in states where it is regulated?'], ], cache_examples=False, ) gr.Markdown(LICENSE) if __name__ == "__main__": demo.launch()