Update app.py
app.py CHANGED

@@ -109,84 +109,93 @@ def apply_replacements(text):
 def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int, fast_mode: bool = False):
     """
     Call the OpenAI ChatCompletion endpoint using the new client and yield streaming responses.
-    Implements <think> logic:
-    - The assistant is forced to begin its answer with "<think> ".
-    - We then wait until a closing "</think>" marker is received.
-    - Only text after "</think>" is displayed as the final answer.
 
+    Implements <think> logic and retries if the full response is blank.
+
     Args:
         message (str): The latest user message.
         history (list): Conversation history as a list of (user, assistant) tuples.
         temperature (float): Sampling temperature.
         max_new_tokens (int): Maximum tokens to generate.
-
+
     Yields:
         str: Partial cumulative output from the assistant.
     """
+
     conversation = []
+
     if not history:
-        #
+        # Initialize with system prompt and assistant confirmation.
         conversation.append({"role": "system", "content": SYSTEM_PROMPT})
-        conversation.append({"role": "assistant", "content": "Understood! I will act as the user's healthcare provider
+        conversation.append({"role": "assistant", "content": "Understood! I will act as the user's healthcare provider..."})
+
     for user_msg, assistant_msg in history:
         conversation.append({"role": "user", "content": user_msg})
         conversation.append({"role": "assistant", "content": assistant_msg})
+
     conversation.append({"role": "user", "content": message})
 
     if not fast_mode:
-        #
+        # Indicate that the assistant is thinking.
         yield "HealthAssistant is Thinking! Please wait, your response will output shortly. This may take 30-60 seconds...\n\n"
-
         think_result = think(conversation)
-
-        # Force the model to begin its answer with a "<think>" block.
-        conversation.append({"role": "assistant", "content": "<think>\n"+think_result+"\n</think>"})
+        conversation.append({"role": "assistant", "content": "<think>\n" + think_result + "\n</think>"})
     else:
         yield "HealthAssistant is Thinking! Please wait, your response will output shortly...\n\n"
-
-    # Call the API with streaming enabled.
-    response = client.chat.completions.create(
-        model=ai_model,  # Replace with your actual model identifier.
-        messages=conversation,
-        temperature=temperature,
-        max_tokens=max_new_tokens,
-        stream=True,
-    )
 
-    ... (old lines 156-189 are not rendered in this view; per the new comments, they held the previous token-buffering logic)
+    attempt = 0
+    response = None
+
+    while attempt < 5:
+        attempt += 1
+        response = client.chat.completions.create(
+            model=ai_model,
+            messages=conversation,
+            temperature=temperature,
+            max_tokens=max_new_tokens,
+            stream=True,
+        )
+
+        # Initialize buffers and state flags.
+        buffer = ""
+        pending_buffer = ""
+        display_text = ""
+        think_detected = False
+        full_response = ""
+
+        # Process streaming responses.
+        for chunk in response:
+            delta = chunk.choices[0].delta
+            token_text = delta.content or ""
+            full_response += token_text
+
+            # Handle buffering of tokens as in previous logic.
+            pending_buffer += token_text
+            if len(pending_buffer) >= MIN_FLUSH_SIZE:
+                safe_portion = pending_buffer[:-max_phrase_length] if len(pending_buffer) > max_phrase_length else ""
+                if safe_portion:
+                    display_text += apply_replacements(safe_portion)
+                    yield display_text
+                    pending_buffer = pending_buffer[-max_phrase_length:]
+
+        # Flush remaining text.
+        if pending_buffer:
+            safe_portion = pending_buffer
+            display_text += apply_replacements(safe_portion)
+            yield display_text
+
+        # Check if the full response is valid.
+        if full_response.strip():
+            break  # Exit the loop if the response is not blank.
+
+    # If no valid response was generated after 5 attempts
+    if not full_response.strip():
+        yield "*The assistant did not provide a response. Please try again.*"
+    else:
+        # Apply replacements and append modified response to history.
+        modified_full_response = apply_replacements(full_response)
+        history.append((message, modified_full_response))
+
 
 
 # Create the Chatbot component.
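The substantive change in this hunk is the retry loop: the streaming request is re-issued up to five times, and the first attempt whose accumulated text is non-blank wins. Because the buffers are re-initialized at the top of each attempt, a blank attempt writes nothing to the chat window and the next attempt starts clean. A standalone sketch of the pattern, with a hypothetical stream_completion callable standing in for the real client.chat.completions.create(..., stream=True) call:

def generate_with_retries(stream_completion, max_attempts=5):
    # Re-run a streaming call until it produces non-blank text.
    full_response = ""
    for _ in range(max_attempts):
        # Drain one streamed attempt into a single string.
        full_response = "".join(stream_completion())
        if full_response.strip():
            break  # First non-blank response wins.
    return full_response

# Example: the first attempt streams nothing, the second succeeds.
attempts = iter([[""], ["Hello", ", world"]])
print(generate_with_retries(lambda: next(attempts)))  # prints "Hello, world"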
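The pending_buffer handling exists so apply_replacements never sees a phrase cut in half by a chunk boundary: once the buffer reaches MIN_FLUSH_SIZE, everything except the last max_phrase_length characters is rewritten and displayed, and the held-back tail is carried into the next chunk. A self-contained sketch of the same technique; the constants and the toy replacement rule are illustrative, since the real MIN_FLUSH_SIZE, max_phrase_length, and replacement table are defined elsewhere in app.py:

MIN_FLUSH_SIZE = 32      # illustrative; app.py defines its own value
max_phrase_length = 16   # illustrative; must cover the longest replaced phrase

def apply_replacements(text):
    # Toy stand-in for app.py's phrase rewriter.
    return text.replace("HealthAssistant", "Assistant")

def stream_with_replacements(chunks):
    pending_buffer = ""
    display_text = ""
    for token_text in chunks:
        pending_buffer += token_text
        if len(pending_buffer) >= MIN_FLUSH_SIZE:
            # Flush all but the last max_phrase_length characters, so a phrase
            # split across two chunks is never rewritten in half.
            safe_portion = pending_buffer[:-max_phrase_length] if len(pending_buffer) > max_phrase_length else ""
            if safe_portion:
                display_text += apply_replacements(safe_portion)
                yield display_text
                pending_buffer = pending_buffer[-max_phrase_length:]
    if pending_buffer:
        # Final flush: nothing more is coming, so the whole tail is safe.
        display_text += apply_replacements(pending_buffer)
        yield display_text

# "HealthAssistant" straddles the chunk boundary yet is replaced intact.
for partial in stream_with_replacements(["...Health", "Assistant will see you now."]):
    print(partial)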
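think() itself is outside this hunk; all the diff shows is that its output is wrapped in <think>...</think> and spliced into the conversation as an assistant turn before the final streamed call. For orientation only, one plausible shape for such a helper, assuming the same OpenAI-compatible client and ai_model; every detail below is an assumption, not the Space's actual implementation:

def think(conversation):
    # Hypothetical: ask the model for private reasoning in a separate,
    # non-streamed call; the caller wraps the result in <think> tags.
    response = client.chat.completions.create(
        model=ai_model,
        messages=conversation + [{
            "role": "user",
            "content": "Reason step by step about how to answer, but do not answer yet.",
        }],
        temperature=0.6,
        stream=False,
    )
    return response.choices[0].message.content or ""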
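The trailing context line ("# Create the Chatbot component.") indicates the generator feeds a chat UI. For reference, a sketch of how a streaming generator with this exact signature is typically wired into Gradio's gr.ChatInterface; the Space's real UI code sits outside this hunk, so the widgets and value ranges below are placeholders:

import gradio as gr

demo = gr.ChatInterface(
    fn=chat_with_openai,  # each yield replaces the assistant's partial message
    additional_inputs=[
        gr.Slider(0.0, 1.5, value=0.7, label="Temperature"),
        gr.Slider(128, 4096, value=1024, step=64, label="Max new tokens"),
        gr.Checkbox(value=False, label="Fast mode"),
    ],
)
demo.launch()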