import gradio as gr
from openai import OpenAI

# Configure the OpenAI client with your custom API endpoint and API key.
client = OpenAI(base_url="http://home.mayhew.cloud:1234/v1", api_key="lm-studio")
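# Note: LM Studio's local server exposes an OpenAI-compatible API and does not
# validate the key, so a placeholder string (here "lm-studio") is accepted.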
# UI text and styling
SYSTEM_PROMPT = "You are an assistant."

DESCRIPTION = '''
<div>
  <h1 style="text-align: center;">HealthAssistant</h1>
</div>
'''

LICENSE = "<p></p>"

PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
  <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">The "Doctor" is in.</h1>
  <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Available 1:00pm - 5:00pm EST</p>
</div>
"""

css = """
h1 {
  text-align: center;
  display: block;
}
#duplicate-button {
  margin: auto;
  color: white;
  background: #1565c0;
  border-radius: 100vh;
}
"""
def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int):
    """
    Call the OpenAI-compatible chat completions endpoint and yield streaming responses.

    Implements <think> logic:
      - The assistant is prompted to begin its answer with "<think> ".
      - Tokens are buffered until a closing "</think>" marker is received.
      - Only text after "</think>" is displayed as the final answer.

    Args:
        message (str): The latest user message.
        history (list): Conversation history as a list of (user, assistant) tuples.
        temperature (float): Sampling temperature.
        max_new_tokens (int): Maximum tokens to generate.

    Yields:
        str: Partial cumulative output from the assistant.
    """
    # Always include the system prompt (and an initial assistant confirmation)
    # so it applies on every turn, not just the first; the (user, assistant)
    # history tuples never carry it forward on their own.
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "assistant", "content": "Understood!"},
    ]
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})
    # Force the model to begin its answer with a "<think>" block.
    conversation.append({"role": "assistant", "content": "<think> "})
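    # Caveat (assumption about the backend): whether a trailing assistant
    # message is actually continued as a "prefill" is server-dependent; the
    # official OpenAI API starts a fresh assistant turn instead. Reasoning
    # models in the DeepSeek-R1 style emit their own <think>...</think> block
    # regardless, which the parsing below also handles.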
    full_response = ""    # Stores the raw assistant response (including the <think> block).
    buffer = ""           # Accumulates tokens until we detect the closing </think>.
    display_text = ""     # Holds text to display (only text after </think>).
    think_detected = False

    # Immediately yield a "thinking" status message.
    yield "A.I. Healthcare is thinking! Please wait; your response will appear shortly...\n\n"
    # Call the API with streaming enabled.
    response = client.chat.completions.create(
        model="model-identifier",  # Replace with your actual model identifier.
        messages=conversation,
        temperature=temperature,
        max_tokens=max_new_tokens,
        stream=True,
    )
    # Process streaming responses.
    for chunk in response:
        # Extract the new token text from the chunk (None for role-only deltas).
        delta = chunk.choices[0].delta
        token_text = delta.content or ""
        full_response += token_text

        if not think_detected:
            # Accumulate tokens until we see the closing </think> marker.
            buffer += token_text
            if "</think>" in buffer:
                think_detected = True
                # Discard everything up to and including the "</think>" marker.
                display_text = buffer.split("</think>", 1)[1]
                yield display_text
        else:
            display_text += token_text
            yield display_text

    # Append the full (raw) response, including the <think> section, to the
    # conversation history.
    history.append((message, full_response))
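
# A minimal, self-contained sketch (illustrative only; not used by the app) of
# the "</think>" splitting performed in chat_with_openai, run against a
# hypothetical, hard-coded token stream.
def _demo_think_split():
    tokens = ["<think> weighing", " options...", "</think>", "Take", " two."]
    buffer, display_text, think_detected = "", "", False
    for token_text in tokens:
        if not think_detected:
            buffer += token_text
            if "</think>" in buffer:
                think_detected = True
                # Everything after the marker is the displayable answer.
                display_text = buffer.split("</think>", 1)[1]
        else:
            display_text += token_text
    return display_text  # -> "Take two."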
# Create the Chatbot component.
chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='HealthAssistant')

# Build the Gradio interface.
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=chat_with_openai,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.6, label="Temperature", render=False),
            gr.Slider(minimum=1024, maximum=4096, step=128, value=2048, label="Max new tokens", render=False),
        ],
        examples=[
            ['What is, and do I need it?'],
            ['What medications help manage being invisible?'],
            ['How do I know if a clown is the right option?'],
            ['How can I access music in states where it is regulated?'],
        ],
        cache_examples=False,
    )
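    # The sliders above are passed positionally as the extra parameters of
    # chat_with_openai (temperature, max_new_tokens) after message and history.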
    gr.Markdown(LICENSE)

if __name__ == "__main__":
    demo.launch()