reedmayhew committed on
Commit 79a8b0c · verified · 1 Parent(s): dba1c38

Update app.py

Files changed (1):
  1. app.py +64 -55
app.py CHANGED

@@ -109,84 +109,93 @@ def apply_replacements(text):
 def chat_with_openai(message: str, history: list, temperature: float, max_new_tokens: int, fast_mode: bool = False):
     """
     Call the OpenAI ChatCompletion endpoint using the new client and yield streaming responses.
-    Implements <think> logic:
-    - The assistant is forced to begin its answer with "<think> ".
-    - We then wait until a closing "</think>" marker is received.
-    - Only text after "</think>" is displayed as the final answer.

+    Implements <think> logic and retries if the full response is blank.
+
     Args:
         message (str): The latest user message.
         history (list): Conversation history as a list of (user, assistant) tuples.
         temperature (float): Sampling temperature.
         max_new_tokens (int): Maximum tokens to generate.
-
+
     Yields:
         str: Partial cumulative output from the assistant.
     """
+
     conversation = []
+
     if not history:
-        # Add a system prompt and initial assistant confirmation.
+        # Initialize with system prompt and assistant confirmation.
         conversation.append({"role": "system", "content": SYSTEM_PROMPT})
-        conversation.append({"role": "assistant", "content": "Understood! I will act as the user's healthcare provider, not hesitate with providing detailed responses, or refer them to another healthcare provider unless they ask for one or need a prescription, test, or resource unavailable via telehealth. I will provide detailed responses with markdown in the form of bullet point lists."})
+        conversation.append({"role": "assistant", "content": "Understood! I will act as the user's healthcare provider..."})
+
     for user_msg, assistant_msg in history:
         conversation.append({"role": "user", "content": user_msg})
         conversation.append({"role": "assistant", "content": assistant_msg})
+
     conversation.append({"role": "user", "content": message})

     if not fast_mode:
-        # Immediately yield a "thinking" status message.
+        # Indicate that the assistant is thinking.
         yield "HealthAssistant is Thinking! Please wait, your response will output shortly. This may take 30-60 seconds...\n\n"
-
         think_result = think(conversation)
-
-        # Force the model to begin its answer with a "<think>" block.
-        conversation.append({"role": "assistant", "content": "<think>\n"+think_result+"\n</think>"})
+        conversation.append({"role": "assistant", "content": "<think>\n" + think_result + "\n</think>"})
     else:
         yield "HealthAssistant is Thinking! Please wait, your response will output shortly...\n\n"
-
-    # Call the API with streaming enabled.
-    response = client.chat.completions.create(
-        model=ai_model,  # Replace with your actual model identifier.
-        messages=conversation,
-        temperature=temperature,
-        max_tokens=max_new_tokens,
-        stream=True,
-    )

-    # Initialize buffers and state flags.
-    buffer = ""  # Accumulates tokens until the </think> marker is found.
-    pending_buffer = ""  # Holds the tail end of text that may contain a partial phrase.
-    display_text = ""  # Cumulative text that has been finalized and yielded.
-    think_detected = False
-    full_response = ""  # Accumulates the full raw response (without replacements applied).
-
-    # Process streaming responses.
-    for chunk in response:
-        # Extract the new token text from the current chunk.
-        delta = chunk.choices[0].delta
-        token_text = delta.content or ""
-        full_response += token_text
-
-        # After the </think> marker, add tokens to pending_buffer.
-        pending_buffer += token_text
-        if len(pending_buffer) >= MIN_FLUSH_SIZE:
-            safe_portion = pending_buffer[:-max_phrase_length] if len(pending_buffer) > max_phrase_length else ""
-            if safe_portion:
-                display_text += apply_replacements(safe_portion)
-                yield display_text
-            pending_buffer = pending_buffer[-max_phrase_length:]
-
-    # After processing all tokens, flush any remaining text.
-    if pending_buffer:
-        safe_portion = pending_buffer  # flush whatever remains
-        display_text += apply_replacements(safe_portion)
-        yield display_text
-
-    # Append the full (raw) response, including the <think> section, to the conversation history.
-    # If you want the history to reflect the replacements, apply them here.
-    modified_full_response = apply_replacements(full_response)
-    history.append((message, modified_full_response))
+    attempt = 0
+    response = None
+
+    while attempt < 5:
+        attempt += 1
+        response = client.chat.completions.create(
+            model=ai_model,
+            messages=conversation,
+            temperature=temperature,
+            max_tokens=max_new_tokens,
+            stream=True,
+        )
+
+        # Initialize buffers and state flags.
+        buffer = ""
+        pending_buffer = ""
+        display_text = ""
+        think_detected = False
+        full_response = ""
+
+        # Process streaming responses.
+        for chunk in response:
+            delta = chunk.choices[0].delta
+            token_text = delta.content or ""
+            full_response += token_text
+
+            # Handle buffering of tokens as in previous logic.
+            pending_buffer += token_text
+            if len(pending_buffer) >= MIN_FLUSH_SIZE:
+                safe_portion = pending_buffer[:-max_phrase_length] if len(pending_buffer) > max_phrase_length else ""
+                if safe_portion:
+                    display_text += apply_replacements(safe_portion)
+                    yield display_text
+                pending_buffer = pending_buffer[-max_phrase_length:]
+
+        # Flush remaining text.
+        if pending_buffer:
+            safe_portion = pending_buffer
+            display_text += apply_replacements(safe_portion)
+            yield display_text
+
+        # Check if the full response is valid.
+        if full_response.strip():
+            break  # Exit the loop if the response is not blank.
+
+    # If no valid response was generated after 5 attempts
+    if not full_response.strip():
+        yield "*The assistant did not provide a response. Please try again.*"
+    else:
+        # Apply replacements and append modified response to history.
+        modified_full_response = apply_replacements(full_response)
+        history.append((message, modified_full_response))
+


 # Create the Chatbot component.
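
The core of this commit is the retry loop: the streaming request is re-issued up to five times whenever the accumulated `full_response` is blank, and a fallback message is yielded if every attempt comes back empty. A minimal sketch of that pattern, with a hypothetical `stream_completion` callable standing in for the OpenAI client call (all names in this block are illustrative, not taken from app.py):

```python
from typing import Callable, Iterator

MAX_ATTEMPTS = 5  # app.py hard-codes the same limit in `while attempt < 5`


def generate_with_retry(stream_completion: Callable[[], Iterator[str]]) -> Iterator[str]:
    """Yield cumulative partial output, re-requesting if a stream ends blank."""
    full_response = ""
    for attempt in range(1, MAX_ATTEMPTS + 1):
        full_response = ""
        display_text = ""
        for token_text in stream_completion():  # fresh streaming request per attempt
            full_response += token_text
            display_text += token_text
            yield display_text  # cumulative text, as chat_with_openai() yields it
        if full_response.strip():
            break  # non-blank response: stop retrying
    if not full_response.strip():
        yield "*The assistant did not provide a response. Please try again.*"
```

Because the per-attempt buffers are re-initialized at the top of the loop, each retry restarts the visible output from scratch, which matches the reset of `display_text` inside the new `while` block.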
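The buffered flush inside the streaming loop is carried over from the previous version, and the reason for the `[:-max_phrase_length]` slice is easy to miss: `apply_replacements()` rewrites multi-character phrases, so the last `max_phrase_length` characters are always held back in case a phrase straddles two chunks. A sketch of that rule, with assumed values for the two globals that app.py defines elsewhere:

```python
MIN_FLUSH_SIZE = 64      # assumed value; defined elsewhere in app.py
max_phrase_length = 16   # assumed: longest phrase apply_replacements() rewrites


def split_for_flush(pending_buffer: str) -> tuple[str, str]:
    """Return (safe_to_emit, held_back), mirroring the loop body in app.py."""
    if len(pending_buffer) < MIN_FLUSH_SIZE:
        return "", pending_buffer  # not enough text yet; keep buffering
    safe_portion = (
        pending_buffer[:-max_phrase_length]
        if len(pending_buffer) > max_phrase_length
        else ""
    )
    # The emitted portion gets replacements applied; the tail is re-examined later.
    return safe_portion, pending_buffer[-max_phrase_length:]
```

The end-of-stream `if pending_buffer:` block then flushes the held-back tail in one piece, so no text is lost.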
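The hunk ends just above `# Create the Chatbot component.`, so the UI wiring is outside this diff. Since `chat_with_openai()` is a generator whose successive yields replace the pending reply, it would typically be mounted in Gradio along these lines (an assumption about the surrounding code, not part of the commit):

```python
import gradio as gr

# Hypothetical wiring; app.py's actual component setup is not shown in this hunk.
demo = gr.ChatInterface(
    fn=chat_with_openai,  # each yielded string overwrites the streamed reply
    additional_inputs=[
        gr.Slider(0.0, 1.5, value=0.7, label="Temperature"),
        gr.Slider(64, 4096, value=1024, step=64, label="Max new tokens"),
        gr.Checkbox(value=False, label="Fast mode"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```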