reedmayhew committed on
Commit
40c24da
·
verified ·
1 Parent(s): d1594b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -29
app.py CHANGED
@@ -103,63 +103,61 @@ def chat_with_openai(message: str, history: list, temperature: float, max_new_to
103
  stream=True,
104
  )
105
 
106
- # Buffers and state flags.
107
- buffer = "" # Used before the </think> marker is detected.
108
- pending_buffer = "" # Sliding buffer for safely holding the tail.
 
109
  think_detected = False
110
- full_response = "" # Accumulates the full (raw) response.
111
 
112
- # Suppose these are defined elsewhere in your code:
113
  history = [] # The conversation history.
114
- message = "User message" # The user's message, for example.
115
  # 'response' is assumed to be an iterable of token chunks.
116
 
117
  # Process streaming responses.
118
  for chunk in response:
119
- # Extract the new token text from the chunk.
120
  delta = chunk.choices[0].delta
121
  token_text = delta.content or ""
122
  full_response += token_text
123
 
124
  if not think_detected:
125
- # Accumulate tokens until we see the closing </think> marker.
126
  buffer += token_text
127
  if "</think>" in buffer:
128
  think_detected = True
129
- # Discard everything up to and including the "</think>" marker.
130
  after_think = buffer.split("</think>", 1)[1]
131
- # Initialize the pending_buffer with the text after </think>.
132
  pending_buffer += after_think
133
 
134
- # If pending_buffer is large enough, yield the safe portion.
135
  if len(pending_buffer) > max_phrase_length:
136
- # All except the last max_phrase_length characters are safe to yield.
137
- to_yield = pending_buffer[:-max_phrase_length]
138
- # Apply replacements on the safe portion.
139
- to_yield = apply_replacements(to_yield)
140
- yield to_yield
141
- # Retain the last part in pending_buffer for potential split phrases.
142
  pending_buffer = pending_buffer[-max_phrase_length:]
143
  else:
144
- # Append new token text to pending_buffer.
145
  pending_buffer += token_text
146
-
147
- # If pending_buffer is longer than max_phrase_length, yield the safe portion.
148
  if len(pending_buffer) > max_phrase_length:
149
- # Extract the part that is definitely not part of a split phrase.
150
- to_yield = pending_buffer[:-max_phrase_length]
151
- to_yield = apply_replacements(to_yield)
152
- yield to_yield
153
- # Keep the last max_phrase_length characters in pending_buffer.
154
  pending_buffer = pending_buffer[-max_phrase_length:]
155
 
156
  # After processing all chunks, flush any remaining text in pending_buffer.
157
  if pending_buffer:
158
- to_yield = apply_replacements(pending_buffer)
159
- yield to_yield
 
160
 
161
- # Append the full (raw) response, including the <think> section, to the conversation history.
162
- # If you want the conversation history to reflect the replacements, apply them to full_response.
163
  modified_full_response = apply_replacements(full_response)
164
  history.append((message, modified_full_response))
165
 
 
103
  stream=True,
104
  )
105
 
106
+ # Initialize buffers and state flags.
107
+ buffer = "" # Accumulates tokens until the </think> marker is found.
108
+ pending_buffer = "" # Holds the tail end of text that may contain a partial phrase.
109
+ display_text = "" # Cumulative text that has been finalized and yielded.
110
  think_detected = False
111
+ full_response = "" # Accumulates the full raw response (without replacements applied).
112
 
113
+ # These are assumed to be defined in your application.
114
  history = [] # The conversation history.
115
+ message = "User message" # The users message (or any identifier).
116
  # 'response' is assumed to be an iterable of token chunks.
117
 
118
  # Process streaming responses.
119
  for chunk in response:
120
+ # Extract the new token text from the current chunk.
121
  delta = chunk.choices[0].delta
122
  token_text = delta.content or ""
123
  full_response += token_text
124
 
125
  if not think_detected:
126
+ # Accumulate tokens until the </think> marker is detected.
127
  buffer += token_text
128
  if "</think>" in buffer:
129
  think_detected = True
130
+ # Discard everything up to and including the </think> marker.
131
  after_think = buffer.split("</think>", 1)[1]
132
+ # Start the pending buffer with the text after the marker.
133
  pending_buffer += after_think
134
 
135
+ # If pending_buffer is large enough, extract the safe portion.
136
  if len(pending_buffer) > max_phrase_length:
137
+ safe_portion = pending_buffer[:-max_phrase_length]
138
+ safe_portion = apply_replacements(safe_portion)
139
+ display_text += safe_portion
140
+ yield display_text
141
+ # Retain only the last max_phrase_length characters in pending_buffer.
 
142
  pending_buffer = pending_buffer[-max_phrase_length:]
143
  else:
144
+ # Already passed the </think> marker, so work with pending_buffer.
145
  pending_buffer += token_text
 
 
146
  if len(pending_buffer) > max_phrase_length:
147
+ safe_portion = pending_buffer[:-max_phrase_length]
148
+ safe_portion = apply_replacements(safe_portion)
149
+ display_text += safe_portion
150
+ yield display_text
 
151
  pending_buffer = pending_buffer[-max_phrase_length:]
152
 
153
  # After processing all chunks, flush any remaining text in pending_buffer.
154
  if pending_buffer:
155
+ safe_portion = apply_replacements(pending_buffer)
156
+ display_text += safe_portion
157
+ yield display_text
158
 
159
+ # Optionally, store the full response (including the <think> section) into the conversation history.
160
+ # If you want the history to reflect the replacements, apply them here.
161
  modified_full_response = apply_replacements(full_response)
162
  history.append((message, modified_full_response))
163