import os
import gradio as gr
from openai import OpenAI
# Configure the OpenAI client with your custom API endpoint and API key.
client = OpenAI(base_url="http://home.mayhew.cloud:1234/v1", api_key="lm-studio")
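# Note: LM Studio's local server does not validate the API key, so any
# placeholder string works here; point base_url at your own endpoint if it differs.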
# UI text and styling
SYSTEM_PROMPT = "You are an assistant."
DESCRIPTION = '''
# HealthAssistant
'''
LICENSE = ""
PLACEHOLDER = """
The "Doctor" is in.
Available 1:00pm - 5:00pm EST
"""
css = """
h1 {
text-align: center;
display: block;
}
#duplicate-button {
margin: auto;
color: white;
background: #1565c0;
border-radius: 100vh;
}
"""
def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int):
"""
Call the OpenAI ChatCompletion endpoint using the new client and yield streaming responses.
Implements logic:
- The assistant is forced to begin its answer with " ".
- We then wait until a closing "" marker is received.
- Only text after "" is displayed as the final answer.
Args:
message (str): The latest user message.
history (list): Conversation history as a list of (user, assistant) tuples.
temperature (float): Sampling temperature.
max_new_tokens (int): Maximum tokens to generate.
Yields:
str: Partial cumulative output from the assistant.
"""
    conversation = []
    if not history:
        # Seed the conversation with the system prompt and an initial
        # assistant acknowledgement.
        conversation.append({"role": "system", "content": SYSTEM_PROMPT})
        conversation.append({"role": "assistant", "content": "Understood!"})
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})
    # Force the model to begin its answer with a "<think>" block by
    # pre-filling a partial assistant turn.
    conversation.append({"role": "assistant", "content": "<think>"})
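    # Note: ending the message list with a partial assistant turn ("prefill")
    # is not part of the official OpenAI API contract; whether generation
    # actually resumes from "<think>" depends on the server's chat template.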
full_response = "" # Stores the raw assistant response (including the block).
buffer = "" # Accumulates tokens until we detect the closing .
display_text = "" # Holds text to display (only text after ).
think_detected = False
# Immediately yield a "thinking" status message.
yield "A.I. Healthcare is Thinking! Please wait, your response will output shortly...\n\n"
    # Call the API with streaming enabled.
    response = client.chat.completions.create(
        model="model-identifier",  # Replace with your actual model identifier.
        messages=conversation,
        temperature=temperature,
        max_tokens=max_new_tokens,
        stream=True,
    )
    # Process the streamed chunks.
    for chunk in response:
        # Some servers emit keep-alive or final chunks with no choices.
        if not chunk.choices:
            continue
        # Extract the new token text from the chunk.
        delta = chunk.choices[0].delta
        token_text = delta.content or ""
        full_response += token_text
        if not think_detected:
            # Accumulate tokens until the closing "</think>" marker appears.
            buffer += token_text
            if "</think>" in buffer:
                think_detected = True
                # Discard everything up to and including the "</think>" marker.
                display_text = buffer.split("</think>", 1)[1]
                yield display_text
        else:
            display_text += token_text
            yield display_text
    # If the model never emitted a closing marker, fall back to the raw text
    # so the user is not left with only the status message.
    if not think_detected:
        yield buffer
    # Keep the raw response (including the <think> section) in the local
    # history list. Note that gr.ChatInterface manages its own history, so
    # this mutation mainly matters if the function is reused elsewhere.
    history.append((message, full_response))
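# A raw completion from a reasoning-tuned model typically looks like
# "<think> ...chain of thought... </think> final answer", which is why
# chat_with_openai() hides everything up to and including "</think>".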
# Create the Chatbot component.
chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='HealthAssistant')
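# Note: recent Gradio releases favor gr.Chatbot(type="messages"); the default
# tuple format used here matches the (user, assistant) pairs built above.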
# Build the Gradio interface.
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=chat_with_openai,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.6, label="Temperature", render=False),
            gr.Slider(minimum=1024, maximum=4096, step=128, value=2048, label="Max new tokens", render=False),
        ],
        examples=[
            ['What is, and do I need it?'],
            ['What medications help manage being invisible?'],
            ['How do I know if a clown is the right option?'],
            ['How can I access music in states where it is regulated?'],
        ],
        cache_examples=False,
    )
    gr.Markdown(LICENSE)
if __name__ == "__main__":
    demo.launch()