Update app.py
app.py CHANGED
@@ -44,6 +44,24 @@ h1 {
 }
 """
 
+# List of (phrase, replacement) pairs.
+replacements = [
+    ("a healthcare provider", "me"),
+    # Add more pairs as needed.
+]
+
+# Calculate the maximum length of any phrase.
+max_phrase_length = max(len(phrase) for phrase, _ in replacements)
+
+def apply_replacements(text):
+    """
+    Replace all specified phrases in the text.
+    """
+    for phrase, replacement in replacements:
+        text = text.replace(phrase, replacement)
+    return text
+
+
 def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int):
     """
     Call the OpenAI ChatCompletion endpoint using the new client and yield streaming responses.
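As a quick sanity check, this is what the new helper does to a typical model sentence (the input string here is illustrative, not from the commit):

```python
# Illustrative input; exercises apply_replacements from the hunk above.
text = "You should discuss these symptoms with a healthcare provider."
print(apply_replacements(text))
# -> You should discuss these symptoms with me.
```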
@@ -73,11 +91,6 @@ def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int):
     # Force the model to begin its answer with a "<think>" block.
     conversation.append({"role": "assistant", "content": "<think> "})
 
-    full_response = ""  # Stores the raw assistant response (including the <think> block).
-    buffer = ""  # Accumulates tokens until we detect the closing </think>.
-    display_text = ""  # Holds text to display (only text after </think>).
-    think_detected = False
-
     # Immediately yield a "thinking" status message.
     yield "HealthAssistant is Thinking! Please wait, your response will output shortly...\n\n"
 
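The priming line above assumes `conversation` was already assembled from the chat history earlier in the function; that assembly is not part of this diff. A minimal sketch of what it plausibly looks like (SYSTEM_PROMPT and the loop shape are assumptions):

```python
# Hypothetical assembly of `conversation`; only the final priming line
# actually appears in this diff. SYSTEM_PROMPT is an assumed constant.
conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
for user_msg, assistant_msg in history:
    conversation.append({"role": "user", "content": user_msg})
    conversation.append({"role": "assistant", "content": assistant_msg})
conversation.append({"role": "user", "content": message})
# Seed the assistant turn so the model opens inside a <think> block,
# which the streaming loop below detects and strips.
conversation.append({"role": "assistant", "content": "<think> "})
```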
@@ -90,27 +103,61 @@ def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int):
         stream=True,
     )
 
+    # Buffers and state flags.
+    buffer = ""  # Accumulates tokens until the </think> marker is detected.
+    pending_buffer = ""  # Sliding buffer that safely holds back the tail of the output.
+    think_detected = False
+    full_response = ""  # Accumulates the full (raw) response.
+
     # Process streaming responses.
     for chunk in response:
         # Extract the new token text from the chunk.
         delta = chunk.choices[0].delta
         token_text = delta.content or ""
         full_response += token_text
 
         if not think_detected:
             # Accumulate tokens until we see the closing </think> marker.
             buffer += token_text
             if "</think>" in buffer:
                 think_detected = True
                 # Discard everything up to and including the "</think>" marker.
-                display_text = buffer.split("</think>", 1)[1]
-                yield display_text
+                after_think = buffer.split("</think>", 1)[1]
+                # Initialize pending_buffer with the text after </think>.
+                pending_buffer += after_think
+
+                # If pending_buffer is large enough, yield the safe portion.
+                if len(pending_buffer) > max_phrase_length:
+                    # All except the last max_phrase_length characters are safe to yield.
+                    to_yield = pending_buffer[:-max_phrase_length]
+                    # Apply replacements on the safe portion.
+                    to_yield = apply_replacements(to_yield)
+                    yield to_yield
+                    # Retain the last part in pending_buffer for potential split phrases.
+                    pending_buffer = pending_buffer[-max_phrase_length:]
         else:
-            display_text += token_text
-            yield display_text
-
+            # Append new token text to pending_buffer.
+            pending_buffer += token_text
+
+            # If pending_buffer is longer than max_phrase_length, yield the safe portion.
+            if len(pending_buffer) > max_phrase_length:
+                # Extract the part that is definitely not part of a split phrase.
+                to_yield = pending_buffer[:-max_phrase_length]
+                to_yield = apply_replacements(to_yield)
+                yield to_yield
+                # Keep the last max_phrase_length characters in pending_buffer.
+                pending_buffer = pending_buffer[-max_phrase_length:]
+
+    # After processing all chunks, flush any remaining text in pending_buffer.
+    if pending_buffer:
+        to_yield = apply_replacements(pending_buffer)
+        yield to_yield
+
     # Append the full (raw) response, including the <think> section, to the conversation history.
-    history.append((message, full_response))
+    # If you want the conversation history to reflect the replacements, apply them to full_response.
+    modified_full_response = apply_replacements(full_response)
+    history.append((message, modified_full_response))
+
 
 # Create the Chatbot component.
 chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='HealthAssistant')
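The hunk above streams through a hold-back buffer: text is only emitted once it can no longer be the start of a not-yet-complete phrase, so replacements still fire when the model splits "a healthcare provider" across several tokens. Below is a self-contained sketch of the idea, runnable without the app. Note one deliberate change from the commit: replacements here are applied to the whole buffer before cutting, which also catches a fully received phrase that straddles the emit boundary (the commit applies them to the emitted part only).

```python
# Runnable, API-free sketch of the hold-back streaming idea.
# Assumes no replacement string can itself complete another phrase
# when the buffer is re-scanned on a later pass.
replacements = [("a healthcare provider", "me")]
max_phrase_length = max(len(phrase) for phrase, _ in replacements)

def apply_replacements(text: str) -> str:
    for phrase, replacement in replacements:
        text = text.replace(phrase, replacement)
    return text

def stream_with_replacements(chunks):
    pending = ""
    for token_text in chunks:
        # Replace over the whole buffer first, then hold back
        # max_phrase_length - 1 characters: the longest prefix of a
        # phrase that could still be completed by future tokens.
        pending = apply_replacements(pending + token_text)
        if len(pending) > max_phrase_length - 1:
            cut = len(pending) - (max_phrase_length - 1)
            yield pending[:cut]
            pending = pending[cut:]
    # Flush whatever remains once the stream ends.
    if pending:
        yield pending

# The target phrase is split across three chunks, yet the joined
# output still comes out rewritten:
chunks = ["Ask a health", "care prov", "ider about dosage."]
print("".join(stream_with_replacements(chunks)))  # Ask me about dosage.
```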
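For completeness, one plausible way to connect the generator and the Chatbot component into a UI. The commit does not show this wiring, so the slider ranges, defaults, and labels below are assumptions:

```python
# Assumed wiring; not part of this commit. gr.ChatInterface passes
# (message, history) plus each additional input to chat_with_openai.
import gradio as gr

demo = gr.ChatInterface(
    fn=chat_with_openai,
    chatbot=chatbot,
    additional_inputs=[
        gr.Slider(0.0, 1.0, value=0.6, label="Temperature"),
        gr.Slider(64, 4096, value=1024, step=64, label="Max new tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```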