HealthAssistant

Running

App Files Files

reedmayhew committed on Feb 2

Commit

f98d1cf

verified ·

1 Parent(s): a14d521

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -67

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 DESCRIPTION = '''
 <div>
-  <h1 style="text-align: center;">A.I. Healthcare</h1>
 </div>
 '''
@@ -40,7 +40,7 @@ h1 {
 }
 """
-# Load the tokenizer and model with the updated model name
 tokenizer = AutoTokenizer.from_pretrained("reedmayhew/HealthCare-Reasoning-Assistant-Llama-3.1-8B-HF", device_map="cuda")
 model = AutoModelForCausalLM.from_pretrained("reedmayhew/HealthCare-Reasoning-Assistant-Llama-3.1-8B-HF", device_map="cuda")
@@ -51,28 +51,21 @@ terminators = [
 @spaces.GPU(duration=60)
 def chat_llama3_8b(message: str,
-                   history: list,
-                   temperature: float,
-                   max_new_tokens: int,
-                   confirm: bool) -> str:
     """
-    Generate a streaming response using the Healthcare-Reasoning-Assistant-Llama-3.1-8B-HF model.
     Args:
         message (str): The input message.
-        history (list): The conversation history.
         temperature (float): The temperature for generating the response.
         max_new_tokens (int): The maximum number of new tokens to generate.
-        confirm (bool): Whether the user has confirmed the usage disclaimer.
-    Yields:
-        str: The generated response, streamed token-by-token.
     """
-    # Ensure the user has confirmed the disclaimer
-    if not confirm:
-        return "⚠️ You must confirm that you meet the usage requirements before sending a message."
-    # Prepare the conversation history for the model input
     conversation = []
     for user, assistant in history:
         conversation.extend([
@@ -80,15 +73,14 @@ def chat_llama3_8b(message: str,
             {"role": "assistant", "content": assistant}
         ])
-    # Append the current user message
     conversation.append({"role": "user", "content": message})
-    # Convert the conversation into input ids using the chat template
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
-    # Set up the streamer to stream text output
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
         streamer=streamer,
@@ -101,57 +93,50 @@ def chat_llama3_8b(message: str,
     if temperature == 0:
         generate_kwargs['do_sample'] = False
-    # Launch the generation in a separate thread
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
-    full_response = ""
-    # Simply stream each token as it comes from the model
     for text in streamer:
-        full_response += text
-        yield text
-    # Save the full response (for context in the conversation history)
-    history.append((message, full_response))
-# Custom JavaScript to disable the send button until confirmation is given.
-CUSTOM_JS = """
-<script>
-document.addEventListener("DOMContentLoaded", function() {
-    const interval = setInterval(() => {
-        const checkbox = document.querySelector('input[type="checkbox"][aria-label*="I hereby confirm that I am at least 18 years of age"]');
-        const sendButton = document.querySelector('button[title="Send"]');
-        if (checkbox && sendButton) {
-            sendButton.disabled = !checkbox.checked;
-            checkbox.addEventListener('change', function() {
-                sendButton.disabled = !checkbox.checked;
-            });
-            clearInterval(interval);
-        }
-    }, 500);
-});
-</script>
-"""
-with gr.Blocks(css=css, title="A.I. Healthcare") as demo:
-    gr.Markdown(DESCRIPTION)
-    gr.HTML(CUSTOM_JS)
-    chat_interface = gr.ChatInterface(
         fn=chat_llama3_8b,
-        title="A.I. Healthcare Chat",
-        chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Conversation'),
         additional_inputs=[
-            gr.Checkbox(
-                value=False,
-                label=("I hereby confirm that I am at least 18 years of age (or accompanied by a legal guardian "
-                       "who is at least 18 years old), understand that the information provided by this service "
-                       "is for informational purposes only and is not intended to diagnose or treat any medical condition, "
-                       "and acknowledge that I am solely responsible for verifying any information provided."),
-                elem_id="age_confirm_checkbox"
-            ),
-            gr.Slider(minimum=0.6, maximum=0.6, step=0.1, value=0.6, label="Temperature", visible=False),
-            gr.Slider(minimum=128, maximum=4096, step=64, value=1024, label="Max new tokens", visible=False),
         ],
         examples=[
             ['What are the common symptoms of diabetes?'],
@@ -166,4 +151,4 @@ with gr.Blocks(css=css, title="A.I. Healthcare") as demo:
     gr.Markdown(LICENSE)
 if __name__ == "__main__":
-    demo.launch()

 DESCRIPTION = '''
 <div>
+<h1 style="text-align: center;">A.I. Healthcare</h1>
 </div>
 '''
 }
 """
+# Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("reedmayhew/HealthCare-Reasoning-Assistant-Llama-3.1-8B-HF", device_map="cuda")
 model = AutoModelForCausalLM.from_pretrained("reedmayhew/HealthCare-Reasoning-Assistant-Llama-3.1-8B-HF", device_map="cuda")
 @spaces.GPU(duration=60)
 def chat_llama3_8b(message: str,
+                    history: list,
+                    temperature: float,
+                    max_new_tokens: int
+                   ) -> str:
     """
+    Generate a streaming response using the llama3-8b model.
     Args:
         message (str): The input message.
+        history (list): The conversation history used by ChatInterface.
         temperature (float): The temperature for generating the response.
         max_new_tokens (int): The maximum number of new tokens to generate.
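+    Yields:
+        str: The text to display so far: first a one-time "thinking" notice, then the accumulated response text streamed after "</think>" is detected.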
     """
     conversation = []
     for user, assistant in history:
         conversation.extend([
             {"role": "assistant", "content": assistant}
         ])
+    # Ensure the model starts with "<think>"
     conversation.append({"role": "user", "content": message})
+    conversation.append({"role": "assistant", "content": "<think> "})  # Force <think> at start
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
         streamer=streamer,
     if temperature == 0:
         generate_kwargs['do_sample'] = False
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
+    outputs = []
+    buffer = ""
+    think_detected = False
+    thinking_message_sent = False
+    full_response = ""  # Store the full assistant response
     for text in streamer:
+        buffer += text
+        full_response += text  # Store raw assistant response (includes <think>)
+        # Send the "thinking" message once text starts generating
+        if not thinking_message_sent:
+            thinking_message_sent = True
+            yield "A.I. Healthcare is Thinking...\n\n"
+        # Wait until </think> is detected before streaming output
+        if not think_detected:
+            if "</think>" in buffer:
+                think_detected = True
+                buffer = buffer.split("</think>", 1)[1]  # Remove <think> section
+        else:
+            outputs.append(text)
+            yield "".join(outputs)
+    # Store the full response (including <think>) in history, but only show the user the cleaned response
+    history.append((message, full_response))  # Full assistant response saved for context
+# Gradio block
+chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
+with gr.Blocks(fill_height=True, css=css) as demo:
+    gr.Markdown(DESCRIPTION)
+    gr.ChatInterface(
         fn=chat_llama3_8b,
+        chatbot=chatbot,
+        fill_height=True,
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
+            gr.Slider(minimum=0.6, maximum=0.6, step=0.1, value=0.6, label="Temperature", render=False),
+            gr.Slider(minimum=128, maximum=4096, step=64, value=1024, label="Max new tokens", render=False),
         ],
         examples=[
             ['What are the common symptoms of diabetes?'],
     gr.Markdown(LICENSE)
 if __name__ == "__main__":
+    demo.launch()