Update app.py
app.py CHANGED
@@ -103,63 +103,61 @@ def chat_with_openai(message: str, history: list, temperature: float, max_new_to
         stream=True,
     )

+    # Initialize buffers and state flags.
+    buffer = ""  # Accumulates tokens until the </think> marker is found.
+    pending_buffer = ""  # Holds the tail end of text that may contain a partial phrase.
+    display_text = ""  # Cumulative text that has been finalized and yielded.
     think_detected = False
+    full_response = ""  # Accumulates the full raw response (without replacements applied).

+    # These are assumed to be defined in your application.
     history = []  # The conversation history.
+    message = "User message"  # The user’s message (or any identifier).
     # 'response' is assumed to be an iterable of token chunks.

     # Process streaming responses.
     for chunk in response:
+        # Extract the new token text from the current chunk.
         delta = chunk.choices[0].delta
         token_text = delta.content or ""
         full_response += token_text

         if not think_detected:
+            # Accumulate tokens until the </think> marker is detected.
             buffer += token_text
             if "</think>" in buffer:
                 think_detected = True
+                # Discard everything up to and including the </think> marker.
                 after_think = buffer.split("</think>", 1)[1]
+                # Start the pending buffer with the text after the marker.
                 pending_buffer += after_think

+                # If pending_buffer is large enough, extract the safe portion.
                 if len(pending_buffer) > max_phrase_length:
+                    safe_portion = pending_buffer[:-max_phrase_length]
+                    safe_portion = apply_replacements(safe_portion)
+                    display_text += safe_portion
+                    yield display_text
+                    # Retain only the last max_phrase_length characters in pending_buffer.
                     pending_buffer = pending_buffer[-max_phrase_length:]
         else:
+            # Already passed the </think> marker, so work with pending_buffer.
             pending_buffer += token_text
             if len(pending_buffer) > max_phrase_length:
+                safe_portion = pending_buffer[:-max_phrase_length]
+                safe_portion = apply_replacements(safe_portion)
+                display_text += safe_portion
+                yield display_text
                 pending_buffer = pending_buffer[-max_phrase_length:]

     # After processing all chunks, flush any remaining text in pending_buffer.
     if pending_buffer:
+        safe_portion = apply_replacements(pending_buffer)
+        display_text += safe_portion
+        yield display_text

+    # Optionally, store the full response (including the <think> section) into the conversation history.
+    # If you want the history to reflect the replacements, apply them here.
     modified_full_response = apply_replacements(full_response)
     history.append((message, modified_full_response))
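
For context, the buffering logic added in this hunk can be exercised end to end with a small self-contained sketch. Everything below is hypothetical scaffolding: apply_replacements and max_phrase_length are only referenced by the diff, so minimal stand-ins are defined here, and a hard-coded chunk list stands in for the OpenAI-style response stream. The two yield blocks of the diff are folded into one check, but the technique is the same: because pending_buffer always retains the last max_phrase_length characters, a phrase split across stream chunks ("ca" + "t" below) is still replaced before the text is finalized.

REPLACEMENTS = {"cat": "dog"}  # Hypothetical phrase table for the sketch.
max_phrase_length = max(len(phrase) for phrase in REPLACEMENTS)

def apply_replacements(text: str) -> str:
    # Minimal stand-in: plain substring replacement for each phrase.
    for old, new in REPLACEMENTS.items():
        text = text.replace(old, new)
    return text

def stream_with_replacements(chunks):
    buffer = ""          # Accumulates tokens until </think> is seen.
    pending_buffer = ""  # Tail that may still contain a partial phrase.
    display_text = ""    # Finalized text yielded so far.
    think_detected = False
    for token_text in chunks:
        if not think_detected:
            buffer += token_text
            if "</think>" in buffer:
                think_detected = True
                # Keep only the text after the </think> marker.
                pending_buffer += buffer.split("</think>", 1)[1]
        else:
            pending_buffer += token_text
        if think_detected and len(pending_buffer) > max_phrase_length:
            # Finalize everything except a tail that could hold a partial phrase.
            safe_portion = apply_replacements(pending_buffer[:-max_phrase_length])
            display_text += safe_portion
            yield display_text
            pending_buffer = pending_buffer[-max_phrase_length:]
    if pending_buffer:
        # The stream is done, so the tail cannot grow any further; flush it.
        display_text += apply_replacements(pending_buffer)
        yield display_text

chunks = ["<think>plan</think>", "I saw a ca", "t today"]
for partial in stream_with_replacements(chunks):
    print(partial)
# Final line printed: "I saw a dog today" -- "cat" was split across chunks.

One caveat: the holdback protects phrases that arrive split across chunks, but an occurrence that happens to straddle the yield cut itself (the boundary between safe_portion and the retained tail) can still slip through unreplaced, so a whitespace-aligned cut or a larger holdback is a safer variant.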
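The signature in the hunk header (message, history, temperature, max_new_tokens) matches the shape gr.ChatInterface passes to a streaming chat function, so the generator is presumably wired up roughly as follows. This is a sketch assuming the app uses Gradio and that the real chat_with_openai yields partial strings as shown in the diff; the slider ranges and defaults are made up.

import gradio as gr

# Sketch: gr.ChatInterface treats a generator fn as a stream and re-renders
# the reply with each yielded string.
demo = gr.ChatInterface(
    fn=chat_with_openai,  # The function modified in this commit.
    additional_inputs=[
        gr.Slider(0.0, 2.0, value=0.7, label="temperature"),
        gr.Slider(64, 4096, value=1024, step=64, label="max_new_tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()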