Update app.py
app.py CHANGED

@@ -109,84 +109,93 @@ def apply_replacements(text):
 def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int, fast_mode: bool = False):
     """
     Call the OpenAI ChatCompletion endpoint using the new client and yield streaming responses.
-    Implements <think> logic:
-    - The assistant is forced to begin its answer with "<think> ".
-    - We then wait until a closing "</think>" marker is received.
-    - Only text after "</think>" is displayed as the final answer.
 
+    Implements <think> logic and retries if the full response is blank.
+
     Args:
         message (str): The latest user message.
         history (list): Conversation history as a list of (user, assistant) tuples.
         temperature (float): Sampling temperature.
         max_new_tokens (int): Maximum tokens to generate.
-
+
     Yields:
         str: Partial cumulative output from the assistant.
     """
+
     conversation = []
+
     if not history:
-        #
+        # Initialize with system prompt and assistant confirmation.
         conversation.append({"role": "system", "content": SYSTEM_PROMPT})
-        conversation.append({"role": "assistant", "content": "Understood! I will act as the user's healthcare provider
+        conversation.append({"role": "assistant", "content": "Understood! I will act as the user's healthcare provider..."})
+
     for user_msg, assistant_msg in history:
         conversation.append({"role": "user", "content": user_msg})
         conversation.append({"role": "assistant", "content": assistant_msg})
+
     conversation.append({"role": "user", "content": message})
 
     if not fast_mode:
-        #
+        # Indicate that the assistant is thinking.
         yield "HealthAssistant is Thinking! Please wait, your response will output shortly. This may take 30-60 seconds...\n\n"
-
         think_result = think(conversation)
-
-        # Force the model to begin its answer with a "<think>" block.
-        conversation.append({"role": "assistant", "content": "<think>\n"+think_result+"\n</think>"})
+        conversation.append({"role": "assistant", "content": "<think>\n" + think_result + "\n</think>"})
     else:
         yield "HealthAssistant is Thinking! Please wait, your response will output shortly...\n\n"
-
-    # Call the API with streaming enabled.
-    response = client.chat.completions.create(
-        model=ai_model,  # Replace with your actual model identifier.
-        messages=conversation,
-        temperature=temperature,
-        max_tokens=max_new_tokens,
-        stream=True,
-    )
 
-    ... (old lines 156-189 are not rendered in this view; per the new comments, they held the previous token-buffering logic)
+    attempt = 0
+    response = None
+
+    while attempt < 5:
+        attempt += 1
+        response = client.chat.completions.create(
+            model=ai_model,
+            messages=conversation,
+            temperature=temperature,
+            max_tokens=max_new_tokens,
+            stream=True,
+        )
+
+        # Initialize buffers and state flags.
+        buffer = ""
+        pending_buffer = ""
+        display_text = ""
+        think_detected = False
+        full_response = ""
+
+        # Process streaming responses.
+        for chunk in response:
+            delta = chunk.choices[0].delta
+            token_text = delta.content or ""
+            full_response += token_text
+
+            # Handle buffering of tokens as in previous logic.
+            pending_buffer += token_text
+            if len(pending_buffer) >= MIN_FLUSH_SIZE:
+                safe_portion = pending_buffer[:-max_phrase_length] if len(pending_buffer) > max_phrase_length else ""
+                if safe_portion:
+                    display_text += apply_replacements(safe_portion)
+                    yield display_text
+                    pending_buffer = pending_buffer[-max_phrase_length:]
+
+        # Flush remaining text.
+        if pending_buffer:
+            safe_portion = pending_buffer
+            display_text += apply_replacements(safe_portion)
+            yield display_text
+
+        # Check if the full response is valid.
+        if full_response.strip():
+            break  # Exit the loop if the response is not blank.
+
+    # If no valid response was generated after 5 attempts
+    if not full_response.strip():
+        yield "*The assistant did not provide a response. Please try again.*"
+    else:
+        # Apply replacements and append modified response to history.
+        modified_full_response = apply_replacements(full_response)
+        history.append((message, modified_full_response))
+
 
 
 # Create the Chatbot component.
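The substantive change in this hunk is the retry loop: the streaming request is re-issued up to five times, and the first attempt whose accumulated text is non-blank wins. Because the buffers are re-initialized at the top of each attempt, a blank attempt writes nothing to the chat window and the next attempt starts clean. A standalone sketch of the pattern, with a hypothetical stream_completion callable standing in for the real client.chat.completions.create(..., stream=True) call:

def generate_with_retries(stream_completion, max_attempts=5):
    # Re-run a streaming call until it produces non-blank text.
    full_response = ""
    for _ in range(max_attempts):
        # Drain one streamed attempt into a single string.
        full_response = "".join(stream_completion())
        if full_response.strip():
            break  # First non-blank response wins.
    return full_response

# Example: the first attempt streams nothing, the second succeeds.
attempts = iter([[""], ["Hello", ", world"]])
print(generate_with_retries(lambda: next(attempts)))  # prints "Hello, world"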
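The pending_buffer handling exists so apply_replacements never sees a phrase cut in half by a chunk boundary: once the buffer reaches MIN_FLUSH_SIZE, everything except the last max_phrase_length characters is rewritten and displayed, and the held-back tail is carried into the next chunk. A self-contained sketch of the same technique; the constants and the toy replacement rule are illustrative, since the real MIN_FLUSH_SIZE, max_phrase_length, and replacement table are defined elsewhere in app.py:

MIN_FLUSH_SIZE = 32      # illustrative; app.py defines its own value
max_phrase_length = 16   # illustrative; must cover the longest replaced phrase

def apply_replacements(text):
    # Toy stand-in for app.py's phrase rewriter.
    return text.replace("HealthAssistant", "Assistant")

def stream_with_replacements(chunks):
    pending_buffer = ""
    display_text = ""
    for token_text in chunks:
        pending_buffer += token_text
        if len(pending_buffer) >= MIN_FLUSH_SIZE:
            # Flush all but the last max_phrase_length characters, so a phrase
            # split across two chunks is never rewritten in half.
            safe_portion = pending_buffer[:-max_phrase_length] if len(pending_buffer) > max_phrase_length else ""
            if safe_portion:
                display_text += apply_replacements(safe_portion)
                yield display_text
                pending_buffer = pending_buffer[-max_phrase_length:]
    if pending_buffer:
        # Final flush: nothing more is coming, so the whole tail is safe.
        display_text += apply_replacements(pending_buffer)
        yield display_text

# "HealthAssistant" straddles the chunk boundary yet is replaced intact.
for partial in stream_with_replacements(["...Health", "Assistant will see you now."]):
    print(partial)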
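think() itself is outside this hunk; all the diff shows is that its output is wrapped in <think>...</think> and spliced into the conversation as an assistant turn before the final streamed call. For orientation only, one plausible shape for such a helper, assuming the same OpenAI-compatible client and ai_model; every detail below is an assumption, not the Space's actual implementation:

def think(conversation):
    # Hypothetical: ask the model for private reasoning in a separate,
    # non-streamed call; the caller wraps the result in <think> tags.
    response = client.chat.completions.create(
        model=ai_model,
        messages=conversation + [{
            "role": "user",
            "content": "Reason step by step about how to answer, but do not answer yet.",
        }],
        temperature=0.6,
        stream=False,
    )
    return response.choices[0].message.content or ""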
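The trailing context line ("# Create the Chatbot component.") indicates the generator feeds a chat UI. For reference, a sketch of how a streaming generator with this exact signature is typically wired into Gradio's gr.ChatInterface; the Space's real UI code sits outside this hunk, so the widgets and value ranges below are placeholders:

import gradio as gr

demo = gr.ChatInterface(
    fn=chat_with_openai,  # each yield replaces the assistant's partial message
    additional_inputs=[
        gr.Slider(0.0, 1.5, value=0.7, label="Temperature"),
        gr.Slider(128, 4096, value=1024, step=64, label="Max new tokens"),
        gr.Checkbox(value=False, label="Fast mode"),
    ],
)
demo.launch()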