import os
import gradio as gr
from openai import OpenAI
# Configure the OpenAI client with your custom API endpoint and API key.
client = OpenAI(base_url="http://home.mayhew.cloud:1234/v1", api_key="lm-studio")
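# Note: LM Studio's local server does not validate the API key, so any
# placeholder string works here; point base_url at your own endpoint if it differs.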
# UI text and styling
SYSTEM_PROMPT = "You are an assistant."
DESCRIPTION = '''
# HealthAssistant
'''
LICENSE = ""
PLACEHOLDER = """
The "Doctor" is in.
Available 1:00pm - 5:00pm EST
"""
css = """
h1 {
text-align: center;
display: block;
}
#duplicate-button {
margin: auto;
color: white;
background: #1565c0;
border-radius: 100vh;
}
"""
def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int):
"""
Call the OpenAI ChatCompletion endpoint using the new client and yield streaming responses.
Implements logic:
- The assistant is forced to begin its answer with " ".
- We then wait until a closing "" marker is received.
- Only text after "" is displayed as the final answer.
Args:
message (str): The latest user message.
history (list): Conversation history as a list of (user, assistant) tuples.
temperature (float): Sampling temperature.
max_new_tokens (int): Maximum tokens to generate.
Yields:
str: Partial cumulative output from the assistant.
"""
    conversation = []
    if not history:
        # Seed the conversation with the system prompt and an initial
        # assistant acknowledgement.
        conversation.append({"role": "system", "content": SYSTEM_PROMPT})
        conversation.append({"role": "assistant", "content": "Understood!"})
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})
    # Force the model to begin its answer with a "<think>" block by
    # pre-filling a partial assistant turn.
    conversation.append({"role": "assistant", "content": "<think>"})
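    # Note: ending the message list with a partial assistant turn ("prefill")
    # is not part of the official OpenAI API contract; whether generation
    # actually resumes from "<think>" depends on the server's chat template.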
full_response = "" # Stores the raw assistant response (including the block).
buffer = "" # Accumulates tokens until we detect the closing .
display_text = "" # Holds text to display (only text after ).
think_detected = False
# Immediately yield a "thinking" status message.
yield "A.I. Healthcare is Thinking! Please wait, your response will output shortly...\n\n"
    # Call the API with streaming enabled.
    response = client.chat.completions.create(
        model="model-identifier",  # Replace with your actual model identifier.
        messages=conversation,
        temperature=temperature,
        max_tokens=max_new_tokens,
        stream=True,
    )
    # Process the streamed chunks.
    for chunk in response:
        # Some servers emit keep-alive or final chunks with no choices.
        if not chunk.choices:
            continue
        # Extract the new token text from the chunk.
        delta = chunk.choices[0].delta
        token_text = delta.content or ""
        full_response += token_text
        if not think_detected:
            # Accumulate tokens until the closing "</think>" marker appears.
            buffer += token_text
            if "</think>" in buffer:
                think_detected = True
                # Discard everything up to and including the "</think>" marker.
                display_text = buffer.split("</think>", 1)[1]
                yield display_text
        else:
            display_text += token_text
            yield display_text
    # If the model never emitted a closing marker, fall back to the raw text
    # so the user is not left with only the status message.
    if not think_detected:
        yield buffer
    # Keep the raw response (including the <think> section) in the local
    # history list. Note that gr.ChatInterface manages its own history, so
    # this mutation mainly matters if the function is reused elsewhere.
    history.append((message, full_response))
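# A raw completion from a reasoning-tuned model typically looks like
# "<think> ...chain of thought... </think> final answer", which is why
# chat_with_openai() hides everything up to and including "</think>".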
# Create the Chatbot component.
chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='HealthAssistant')
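# Note: recent Gradio releases favor gr.Chatbot(type="messages"); the default
# tuple format used here matches the (user, assistant) pairs built above.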
# Build the Gradio interface.
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=chat_with_openai,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.6, label="Temperature", render=False),
            gr.Slider(minimum=1024, maximum=4096, step=128, value=2048, label="Max new tokens", render=False),
        ],
        examples=[
            ['What is, and do I need it?'],
            ['What medications help manage being invisible?'],
            ['How do I know if a clown is the right option?'],
            ['How can I access music in states where it is regulated?'],
        ],
        cache_examples=False,
    )
    gr.Markdown(LICENSE)
if __name__ == "__main__":
    demo.launch()