bibibi12345 committed on
Commit
bee94f0
·
1 Parent(s): fb7432b

testing openai fake streaming and reasoning

Browse files
Files changed (3) hide show
  1. app/api_helpers.py +277 -154
  2. app/message_processing.py +95 -3
  3. app/routes/chat_api.py +87 -110
app/api_helpers.py CHANGED
@@ -2,17 +2,25 @@ import json
2
  import time
3
  import math
4
  import asyncio
5
- from typing import List, Dict, Any, Callable, Union
6
- from fastapi.responses import JSONResponse, StreamingResponse
7
 
 
8
  from google.auth.transport.requests import Request as AuthRequest
9
- from google.genai import types
10
- from google import genai # Needed if _execute_gemini_call uses genai.Client directly
 
 
11
 
12
- # Local module imports
13
- from models import OpenAIRequest, OpenAIMessage # Changed from relative
14
- from message_processing import deobfuscate_text, convert_to_openai_format, convert_chunk_to_openai, create_final_chunk # Changed from relative
15
- import config as app_config # Changed from relative
 
 
 
 
 
16
 
17
  def create_openai_error_response(status_code: int, message: str, error_type: str) -> Dict[str, Any]:
18
  return {
@@ -44,171 +52,286 @@ def create_generation_config(request: OpenAIRequest) -> Dict[str, Any]:
44
  ]
45
  return config
46
 
47
- def is_response_valid(response):
48
- if response is None:
49
- print("DEBUG: Response is None, therefore invalid.")
50
- return False
51
-
52
- # Check for direct text attribute
53
- if hasattr(response, 'text') and isinstance(response.text, str) and response.text.strip():
54
- # print("DEBUG: Response valid due to response.text")
55
- return True
56
-
57
- # Check candidates for text content
58
  if hasattr(response, 'candidates') and response.candidates:
59
- for candidate in response.candidates: # Iterate through all candidates
60
- if hasattr(candidate, 'text') and isinstance(candidate.text, str) and candidate.text.strip():
61
- # print(f"DEBUG: Response valid due to candidate.text in candidate")
62
- return True
63
  if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
64
  for part in candidate.content.parts:
65
- if hasattr(part, 'text') and isinstance(part.text, str) and part.text.strip():
66
- # print(f"DEBUG: Response valid due to part.text in candidate's content part")
67
- return True
68
-
69
- # Removed prompt_feedback as a sole criterion for validity.
70
- # It should only be valid if actual text content is found.
71
- # Block reasons will be checked explicitly by callers if they need to treat it as an error for retries.
72
- print("DEBUG: Response is invalid, no usable text content found by is_response_valid.")
73
  return False
74
 
75
- async def fake_stream_generator(client_instance, model_name: str, prompt: Union[types.Content, List[types.Content]], current_gen_config: Dict[str, Any], request_obj: OpenAIRequest, is_auto_attempt: bool):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  response_id = f"chatcmpl-{int(time.time())}"
77
- async def fake_stream_inner():
78
- print(f"FAKE STREAMING: Making non-streaming request to Gemini API (Model: {model_name})")
79
- api_call_task = asyncio.create_task(
80
- client_instance.aio.models.generate_content(
81
- model=model_name, contents=prompt, config=current_gen_config
 
 
 
 
 
 
82
  )
83
  )
84
- while not api_call_task.done():
85
- keep_alive_data = {
86
- "id": "chatcmpl-keepalive", "object": "chat.completion.chunk", "created": int(time.time()),
87
- "model": request_obj.model, "choices": [{"delta": {"content": ""}, "index": 0, "finish_reason": None}]
88
- }
89
- yield f"data: {json.dumps(keep_alive_data)}\n\n"
90
- await asyncio.sleep(app_config.FAKE_STREAMING_INTERVAL_SECONDS)
91
- try:
92
- response = api_call_task.result()
93
-
94
- # Check for safety blocks first, as this should trigger a retry in auto-mode
95
- if hasattr(response, 'prompt_feedback') and \
96
- hasattr(response.prompt_feedback, 'block_reason') and \
97
- response.prompt_feedback.block_reason:
98
- block_message = f"Response blocked by safety filter: {response.prompt_feedback.block_reason}"
99
- if hasattr(response.prompt_feedback, 'block_reason_message') and response.prompt_feedback.block_reason_message:
100
- block_message = f"Response blocked by safety filter: {response.prompt_feedback.block_reason_message} (Reason: {response.prompt_feedback.block_reason})"
101
- print(f"DEBUG: {block_message} (in fake_stream_generator)") # Log this specific condition
102
- raise ValueError(block_message) # This will be caught by the except Exception as e below it
103
-
104
- if not is_response_valid(response): # is_response_valid now only checks for actual text
105
- raise ValueError(f"Invalid/empty response in fake stream (no text content): {str(response)[:200]}")
106
-
107
- full_text = ""
108
- if hasattr(response, 'text'):
109
- full_text = response.text or "" # Coalesce None to empty string
110
- elif hasattr(response, 'candidates') and response.candidates:
111
- # Typically, we focus on the first candidate for non-streaming synthesis
112
- candidate = response.candidates[0]
113
- if hasattr(candidate, 'text'):
114
- full_text = candidate.text or "" # Coalesce None to empty string
115
- elif hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
116
- # Ensure parts are iterated and text is joined correctly even if some parts have no text or part.text is None
117
- texts = []
118
- for part in candidate.content.parts:
119
- if hasattr(part, 'text') and part.text is not None: # Check part.text exists and is not None
120
- texts.append(part.text)
121
- full_text = "".join(texts)
122
- if request_obj.model.endswith("-encrypt-full"):
123
- full_text = deobfuscate_text(full_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- chunk_size = max(20, math.ceil(len(full_text) / 10))
126
- for i in range(0, len(full_text), chunk_size):
127
- chunk_text = full_text[i:i+chunk_size]
128
- delta_data = {
129
- "id": response_id, "object": "chat.completion.chunk", "created": int(time.time()),
130
- "model": request_obj.model, "choices": [{"index": 0, "delta": {"content": chunk_text}, "finish_reason": None}]
131
- }
132
- yield f"data: {json.dumps(delta_data)}\n\n"
133
- await asyncio.sleep(0.05)
134
- yield create_final_chunk(request_obj.model, response_id)
135
  yield "data: [DONE]\n\n"
136
- except Exception as e:
137
- err_msg = f"Error in fake_stream_generator: {str(e)}"
138
- print(err_msg)
139
- err_resp = create_openai_error_response(500, err_msg, "server_error")
140
- # It's good practice to log the JSON payload here too for consistency,
141
- # though the main concern was the true streaming path.
142
- json_payload_for_fake_stream_error = json.dumps(err_resp)
143
- # Log the error JSON that WOULD have been sent if not in auto-mode or if this was the final error handler.
144
- print(f"DEBUG: Internal error in fake_stream_generator. JSON error for handler: {json_payload_for_fake_stream_error}")
145
- if not is_auto_attempt:
146
- yield f"data: {json_payload_for_fake_stream_error}\n\n"
147
- yield "data: [DONE]\n\n"
148
- raise e # Re-raise the original exception e
149
- return fake_stream_inner()
150
 
151
  async def execute_gemini_call(
152
- current_client: Any, # Should be genai.Client or similar AsyncClient
153
- model_to_call: str,
154
  prompt_func: Callable[[List[OpenAIMessage]], Union[types.Content, List[types.Content]]],
155
- gen_config_for_call: Dict[str, Any],
156
- request_obj: OpenAIRequest, # Pass the whole request object
157
  is_auto_attempt: bool = False
158
  ):
159
  actual_prompt_for_call = prompt_func(request_obj.messages)
160
-
 
 
 
 
 
 
 
 
 
161
  if request_obj.stream:
162
  if app_config.FAKE_STREAMING_ENABLED:
163
- return StreamingResponse(
164
- await fake_stream_generator(current_client, model_to_call, actual_prompt_for_call, gen_config_for_call, request_obj, is_auto_attempt=is_auto_attempt),
165
- media_type="text/event-stream"
166
- )
167
-
168
- response_id_for_stream = f"chatcmpl-{int(time.time())}"
169
- cand_count_stream = request_obj.n or 1
170
-
171
- async def _stream_generator_inner_for_execute(): # Renamed to avoid potential clashes
172
  try:
173
- for c_idx_call in range(cand_count_stream):
174
- async for chunk_item_call in await current_client.aio.models.generate_content_stream(
175
- model=model_to_call, contents=actual_prompt_for_call, config=gen_config_for_call
176
- ):
177
- yield convert_chunk_to_openai(chunk_item_call, request_obj.model, response_id_for_stream, c_idx_call)
178
  yield create_final_chunk(request_obj.model, response_id_for_stream, cand_count_stream)
179
  yield "data: [DONE]\n\n"
180
- except Exception as e_stream_call:
181
- print(f"Streaming Error in _execute_gemini_call: {e_stream_call}")
182
-
183
- error_message_str = str(e_stream_call)
184
- # Truncate very long error messages to prevent excessively large JSON payloads.
185
- if len(error_message_str) > 1024: # Max length for the error string
186
- error_message_str = error_message_str[:1024] + "..."
187
-
188
- err_resp_content_call = create_openai_error_response(500, error_message_str, "server_error")
189
- json_payload_for_error = json.dumps(err_resp_content_call)
190
- # Log the error JSON that WOULD have been sent if not in auto-mode or if this was the final error handler.
191
- print(f"DEBUG: Internal error in _stream_generator_inner_for_execute. JSON error for handler: {json_payload_for_error}")
192
- if not is_auto_attempt: # is_auto_attempt is from execute_gemini_call's scope
193
- yield f"data: {json_payload_for_error}\n\n"
194
- yield "data: [DONE]\n\n"
195
- raise e_stream_call # Re-raise the original exception
196
- return StreamingResponse(_stream_generator_inner_for_execute(), media_type="text/event-stream")
197
- else:
198
- response_obj_call = await current_client.aio.models.generate_content(
199
- model=model_to_call, contents=actual_prompt_for_call, config=gen_config_for_call
200
- )
201
-
202
- # Check for safety blocks first for non-streaming calls
203
- if hasattr(response_obj_call, 'prompt_feedback') and \
204
- hasattr(response_obj_call.prompt_feedback, 'block_reason') and \
205
- response_obj_call.prompt_feedback.block_reason:
206
- block_message = f"Response blocked by safety filter: {response_obj_call.prompt_feedback.block_reason}"
207
- if hasattr(response_obj_call.prompt_feedback, 'block_reason_message') and response_obj_call.prompt_feedback.block_reason_message:
208
- block_message = f"Response blocked by safety filter: {response_obj_call.prompt_feedback.block_reason_message} (Reason: {response_obj_call.prompt_feedback.block_reason})"
209
- print(f"DEBUG: {block_message} (in execute_gemini_call non-streaming)") # Log this specific condition
210
- raise ValueError(block_message)
211
-
212
- if not is_response_valid(response_obj_call): # is_response_valid now only checks for actual text
213
- raise ValueError("Invalid/empty response from non-streaming Gemini call (no text content).")
214
  return JSONResponse(content=convert_to_openai_format(response_obj_call, request_obj.model))
 
2
  import time
3
  import math
4
  import asyncio
5
+ import base64 # Added for tokenizer logic
6
+ from typing import List, Dict, Any, Callable, Union, Optional
7
 
8
+ from fastapi.responses import JSONResponse, StreamingResponse
9
  from google.auth.transport.requests import Request as AuthRequest
10
+ from google.genai import types
11
+ from google.genai.types import HttpOptions # Added for tokenizer logic
12
+ from google import genai
13
+ from openai import AsyncOpenAI
14
 
15
+ from models import OpenAIRequest, OpenAIMessage
16
+ from message_processing import (
17
+ deobfuscate_text,
18
+ convert_to_openai_format,
19
+ convert_chunk_to_openai,
20
+ create_final_chunk,
21
+ split_text_by_completion_tokens # Added
22
+ )
23
+ import config as app_config
24
 
25
  def create_openai_error_response(status_code: int, message: str, error_type: str) -> Dict[str, Any]:
26
  return {
 
52
  ]
53
  return config
54
 
55
+ def is_gemini_response_valid(response: Any) -> bool:
56
+ if response is None: return False
57
+ if hasattr(response, 'text') and isinstance(response.text, str) and response.text.strip(): return True
 
 
 
 
 
 
 
 
58
  if hasattr(response, 'candidates') and response.candidates:
59
+ for candidate in response.candidates:
60
+ if hasattr(candidate, 'text') and isinstance(candidate.text, str) and candidate.text.strip(): return True
 
 
61
  if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
62
  for part in candidate.content.parts:
63
+ if hasattr(part, 'text') and isinstance(part.text, str) and part.text.strip(): return True
 
 
 
 
 
 
 
64
  return False
65
 
66
+ async def _base_fake_stream_engine(
67
+ api_call_task_creator: Callable[[], asyncio.Task],
68
+ extract_text_from_response_func: Callable[[Any], str], # To get the *full* text before splitting
69
+ response_id: str,
70
+ sse_model_name: str,
71
+ is_auto_attempt: bool,
72
+ is_valid_response_func: Callable[[Any], bool],
73
+ process_text_func: Optional[Callable[[str, str], str]] = None,
74
+ check_block_reason_func: Optional[Callable[[Any], None]] = None,
75
+ # New parameters for pre-split content
76
+ reasoning_text_to_yield: Optional[str] = None,
77
+ actual_content_text_to_yield: Optional[str] = None
78
+ ):
79
+ api_call_task = api_call_task_creator()
80
+
81
+ while not api_call_task.done():
82
+ keep_alive_data = {"id": "chatcmpl-keepalive", "object": "chat.completion.chunk", "created": int(time.time()), "model": sse_model_name, "choices": [{"delta": {"reasoning_content": ""}, "index": 0, "finish_reason": None}]}
83
+ yield f"data: {json.dumps(keep_alive_data)}\n\n"
84
+ await asyncio.sleep(app_config.FAKE_STREAMING_INTERVAL_SECONDS)
85
+
86
+ try:
87
+ full_api_response = await api_call_task
88
+
89
+ if check_block_reason_func:
90
+ check_block_reason_func(full_api_response)
91
+
92
+ if not is_valid_response_func(full_api_response):
93
+ raise ValueError(f"Invalid/empty response in fake stream for model {sse_model_name} (validation failed): {str(full_api_response)[:200]}")
94
+
95
+ # Determine content to chunk
96
+ content_to_chunk = ""
97
+ if actual_content_text_to_yield is not None:
98
+ content_to_chunk = actual_content_text_to_yield
99
+ if process_text_func: # Process only the actual content part if pre-split
100
+ content_to_chunk = process_text_func(content_to_chunk, sse_model_name)
101
+ else: # Fallback to old method if no pre-split content provided
102
+ content_to_chunk = extract_text_from_response_func(full_api_response)
103
+ if process_text_func:
104
+ content_to_chunk = process_text_func(content_to_chunk, sse_model_name)
105
+
106
+ # Yield reasoning chunk first if available
107
+ if reasoning_text_to_yield:
108
+ reasoning_delta_data = {
109
+ "id": response_id, "object": "chat.completion.chunk", "created": int(time.time()),
110
+ "model": sse_model_name, "choices": [{"index": 0, "delta": {"reasoning_content": reasoning_text_to_yield}, "finish_reason": None}]
111
+ }
112
+ yield f"data: {json.dumps(reasoning_delta_data)}\n\n"
113
+ await asyncio.sleep(0.05) # Small delay after reasoning
114
+
115
+ # Chunk and yield the main content
116
+ chunk_size = max(20, math.ceil(len(content_to_chunk) / 10)) if content_to_chunk else 0
117
+
118
+ if not content_to_chunk and content_to_chunk != "":
119
+ empty_delta_data = {"id": response_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": sse_model_name, "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": None}]}
120
+ yield f"data: {json.dumps(empty_delta_data)}\n\n"
121
+ else:
122
+ for i in range(0, len(content_to_chunk), chunk_size):
123
+ chunk_text = content_to_chunk[i:i+chunk_size]
124
+ content_delta_data = {"id": response_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": sse_model_name, "choices": [{"index": 0, "delta": {"content": chunk_text}, "finish_reason": None}]}
125
+ yield f"data: {json.dumps(content_delta_data)}\n\n"
126
+ if len(content_to_chunk) > chunk_size: await asyncio.sleep(0.05)
127
+
128
+ yield create_final_chunk(sse_model_name, response_id)
129
+ yield "data: [DONE]\n\n"
130
+
131
+ except Exception as e:
132
+ err_msg_detail = f"Error in _base_fake_stream_engine (model: '{sse_model_name}'): {type(e).__name__} - {str(e)}"
133
+ print(f"ERROR: {err_msg_detail}")
134
+ sse_err_msg_display = str(e)
135
+ if len(sse_err_msg_display) > 512: sse_err_msg_display = sse_err_msg_display[:512] + "..."
136
+ err_resp_for_sse = create_openai_error_response(500, sse_err_msg_display, "server_error")
137
+ json_payload_for_fake_stream_error = json.dumps(err_resp_for_sse)
138
+ if not is_auto_attempt:
139
+ yield f"data: {json_payload_for_fake_stream_error}\n\n"
140
+ yield "data: [DONE]\n\n"
141
+ raise
142
+
143
+ def gemini_fake_stream_generator(
144
+ gemini_model_instance: genai.GenerativeModel,
145
+ prompt_for_api_call: Union[types.Content, List[types.Content]],
146
+ gen_config_for_api_call: Dict[str, Any],
147
+ request_obj: OpenAIRequest,
148
+ is_auto_attempt: bool
149
+ ):
150
+ print(f"FAKE STREAMING (Gemini): Prep for '{request_obj.model}' (API model: '{gemini_model_instance.model_name}')")
151
+ def _create_gemini_api_task() -> asyncio.Task:
152
+ return asyncio.create_task(gemini_model_instance.generate_content_async(contents=prompt_for_api_call, generation_config=gen_config_for_api_call))
153
+ def _extract_gemini_text(response: Any) -> str:
154
+ # ... (extraction logic as before) ...
155
+ full_text = ""
156
+ if hasattr(response, 'text') and response.text is not None: full_text = response.text
157
+ elif hasattr(response, 'candidates') and response.candidates:
158
+ candidate = response.candidates[0]
159
+ if hasattr(candidate, 'text') and candidate.text is not None: full_text = candidate.text
160
+ elif hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
161
+ texts = [part.text for part in candidate.content.parts if hasattr(part, 'text') and part.text is not None]
162
+ full_text = "".join(texts)
163
+ return full_text
164
+ def _process_gemini_text(text: str, sse_model_name: str) -> str:
165
+ if sse_model_name.endswith("-encrypt-full"): return deobfuscate_text(text)
166
+ return text
167
+ def _check_gemini_block(response: Any):
168
+ if hasattr(response, 'prompt_feedback') and hasattr(response.prompt_feedback, 'block_reason') and response.prompt_feedback.block_reason:
169
+ block_message = f"Response blocked by Gemini safety filter: {response.prompt_feedback.block_reason}"
170
+ if hasattr(response.prompt_feedback, 'block_reason_message') and response.prompt_feedback.block_reason_message: block_message += f" (Message: {response.prompt_feedback.block_reason_message})"
171
+ raise ValueError(block_message)
172
+ response_id = f"chatcmpl-{int(time.time())}"
173
+ return _base_fake_stream_engine(
174
+ api_call_task_creator=_create_gemini_api_task,
175
+ extract_text_from_response_func=_extract_gemini_text,
176
+ process_text_func=_process_gemini_text,
177
+ check_block_reason_func=_check_gemini_block,
178
+ is_valid_response_func=is_gemini_response_valid,
179
+ response_id=response_id, sse_model_name=request_obj.model,
180
+ keep_alive_interval_seconds=app_config.FAKE_STREAMING_INTERVAL_SECONDS,
181
+ is_auto_attempt=is_auto_attempt
182
+ # reasoning_text_to_yield and actual_content_text_to_yield are not used for Gemini
183
+ )
184
+
185
+ async def openai_fake_stream_generator( # Changed to async to await the tokenizer
186
+ openai_client: AsyncOpenAI,
187
+ openai_params: Dict[str, Any],
188
+ openai_extra_body: Dict[str, Any],
189
+ request_obj: OpenAIRequest,
190
+ is_auto_attempt: bool,
191
+ # New params for tokenizer
192
+ gcp_credentials: Any,
193
+ gcp_project_id: str,
194
+ gcp_location: str,
195
+ base_model_id_for_tokenizer: str
196
+ ):
197
+ api_model_name = openai_params.get("model", "unknown-openai-model")
198
+ print(f"FAKE STREAMING (OpenAI): Prep for '{request_obj.model}' (API model: '{api_model_name}') with reasoning split.")
199
+
200
  response_id = f"chatcmpl-{int(time.time())}"
201
+
202
+ # This task creator now involves the full API call and subsequent token splitting.
203
+ # The _base_fake_stream_engine will then use the pre-split text.
204
+ async def _openai_api_call_and_split_task_creator_wrapper():
205
+ # This inner async function will be what the asyncio.Task runs.
206
+ # It first makes the API call, then does the sync tokenization in a thread.
207
+
208
+ # 1. Make the non-streaming API call
209
+ _api_call_task = asyncio.create_task(
210
+ openai_client.chat.completions.create(
211
+ **openai_params, extra_body=openai_extra_body, stream=False
212
  )
213
  )
214
+ raw_response = await _api_call_task # This is the openai.types.chat.ChatCompletion object
215
+
216
+ # 2. Extract full content and usage for splitting
217
+ full_content_from_api = ""
218
+ if raw_response.choices and raw_response.choices[0].message and raw_response.choices[0].message.content is not None:
219
+ full_content_from_api = raw_response.choices[0].message.content
220
+
221
+ vertex_completion_tokens = 0
222
+ if raw_response.usage and raw_response.usage.completion_tokens is not None:
223
+ vertex_completion_tokens = raw_response.usage.completion_tokens
224
+
225
+ reasoning_text = ""
226
+ actual_content_text = full_content_from_api # Default if split fails or not applicable
227
+
228
+ if full_content_from_api and vertex_completion_tokens > 0:
229
+ # 3. Perform synchronous tokenization and splitting in a separate thread
230
+ reasoning_text, actual_content_text, _ = await asyncio.to_thread(
231
+ split_text_by_completion_tokens, # Use imported function
232
+ gcp_credentials, gcp_project_id, gcp_location,
233
+ base_model_id_for_tokenizer, # The base model for the tokenizer
234
+ full_content_from_api,
235
+ vertex_completion_tokens
236
+ )
237
+ if reasoning_text:
238
+ print(f"DEBUG_FAKE_REASONING_SPLIT: Success. Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content_text)}")
239
+
240
+ # We pass the raw_response and the split text to the base engine.
241
+ # The base engine still needs the raw_response for initial validation,
242
+ # but will use the pre-split text for yielding chunks.
243
+ return raw_response, reasoning_text, actual_content_text
244
+
245
+ # The main generator logic starts here:
246
+ # Initial keep-alive loop
247
+ temp_task_for_keepalive_check = asyncio.create_task(_openai_api_call_and_split_task_creator_wrapper())
248
+ while not temp_task_for_keepalive_check.done():
249
+ keep_alive_data = {"id": "chatcmpl-keepalive", "object": "chat.completion.chunk", "created": int(time.time()), "model": request_obj.model, "choices": [{"delta": {"content": ""}, "index": 0, "finish_reason": None}]}
250
+ yield f"data: {json.dumps(keep_alive_data)}\n\n"
251
+ await asyncio.sleep(app_config.FAKE_STREAMING_INTERVAL_SECONDS)
252
+
253
+ try:
254
+ # Get the results from our wrapper task
255
+ full_api_response, separated_reasoning_text, separated_actual_content_text = await temp_task_for_keepalive_check
256
+
257
+ # Define OpenAI specific helpers for _base_fake_stream_engine
258
+ def _extract_openai_full_text(response: Any) -> str: # Still needed for initial validation if used
259
+ if response.choices and response.choices[0].message and response.choices[0].message.content is not None:
260
+ return response.choices[0].message.content
261
+ return ""
262
+ def _is_openai_response_valid(response: Any) -> bool:
263
+ return bool(response.choices and response.choices[0].message is not None)
264
+
265
+ # Now, iterate through the base engine using the results
266
+ async for chunk in _base_fake_stream_engine(
267
+ api_call_task_creator=lambda: asyncio.create_task(asyncio.sleep(0, result=full_api_response)), # Dummy task, result already known
268
+ extract_text_from_response_func=_extract_openai_full_text, # For potential use by is_valid_response_func
269
+ is_valid_response_func=_is_openai_response_valid,
270
+ response_id=response_id,
271
+ sse_model_name=request_obj.model,
272
+ keep_alive_interval_seconds=0, # Keep-alive handled above for the combined op
273
+ is_auto_attempt=is_auto_attempt,
274
+ reasoning_text_to_yield=separated_reasoning_text,
275
+ actual_content_text_to_yield=separated_actual_content_text
276
+ ):
277
+ yield chunk
278
 
279
+ except Exception as e_outer: # Catch errors from the _openai_api_call_and_split_task_creator_wrapper or subsequent base engine
280
+ err_msg_detail = f"Error in openai_fake_stream_generator outer (model: '{request_obj.model}'): {type(e_outer).__name__} - {str(e_outer)}"
281
+ print(f"ERROR: {err_msg_detail}")
282
+ sse_err_msg_display = str(e_outer)
283
+ if len(sse_err_msg_display) > 512: sse_err_msg_display = sse_err_msg_display[:512] + "..."
284
+ err_resp_sse = create_openai_error_response(500, sse_err_msg_display, "server_error")
285
+ json_payload_error = json.dumps(err_resp_sse)
286
+ if not is_auto_attempt:
287
+ yield f"data: {json_payload_error}\n\n"
 
288
  yield "data: [DONE]\n\n"
289
+ # No re-raise here as we've handled sending the error via SSE.
290
+ # If auto-mode needs to retry, the exception from the inner task would have been raised before this point.
291
+
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  async def execute_gemini_call(
294
+ current_client: Any, model_to_call: str,
 
295
  prompt_func: Callable[[List[OpenAIMessage]], Union[types.Content, List[types.Content]]],
296
+ gen_config_for_call: Dict[str, Any], request_obj: OpenAIRequest,
 
297
  is_auto_attempt: bool = False
298
  ):
299
  actual_prompt_for_call = prompt_func(request_obj.messages)
300
+ gemini_model_instance: Optional[genai.GenerativeModel] = None
301
+ if hasattr(current_client, 'get_model') and callable(getattr(current_client, 'get_model')):
302
+ try: gemini_model_instance = current_client.get_model(model_name=model_to_call)
303
+ except Exception as e: raise ValueError(f"Could not get Gemini model '{model_to_call}' Express: {e}") from e
304
+ elif isinstance(current_client, genai.GenerativeModel):
305
+ if model_to_call not in current_client.model_name: print(f"WARNING: Mismatch! model_to_call='{model_to_call}', client.model_name='{current_client.model_name}'")
306
+ gemini_model_instance = current_client
307
+ else: raise ValueError(f"Unsupported current_client for Gemini: {type(current_client)}")
308
+ if not gemini_model_instance: raise ValueError(f"Failed to get GeminiModel for '{model_to_call}'.")
309
+
310
  if request_obj.stream:
311
  if app_config.FAKE_STREAMING_ENABLED:
312
+ return StreamingResponse(gemini_fake_stream_generator(gemini_model_instance, actual_prompt_for_call, gen_config_for_call, request_obj, is_auto_attempt), media_type="text/event-stream")
313
+ response_id_for_stream, cand_count_stream = f"chatcmpl-{int(time.time())}", request_obj.n or 1
314
+ async def _gemini_real_stream_generator_inner():
 
 
 
 
 
 
315
  try:
316
+ async for chunk_item_call in gemini_model_instance.generate_content_async(contents=actual_prompt_for_call, generation_config=gen_config_for_call, stream=True):
317
+ yield convert_chunk_to_openai(chunk_item_call, request_obj.model, response_id_for_stream, 0)
 
 
 
318
  yield create_final_chunk(request_obj.model, response_id_for_stream, cand_count_stream)
319
  yield "data: [DONE]\n\n"
320
+ except Exception as e:
321
+ # ... (error handling as before) ...
322
+ err_msg_detail_stream = f"Streaming Error (Gemini model: '{gemini_model_instance.model_name}'): {type(e).__name__} - {str(e)}"
323
+ print(f"ERROR: {err_msg_detail_stream}")
324
+ s_err = str(e); s_err = s_err[:1024]+"..." if len(s_err)>1024 else s_err
325
+ err_resp = create_openai_error_response(500,s_err,"server_error")
326
+ j_err = json.dumps(err_resp)
327
+ if not is_auto_attempt: yield f"data: {j_err}\n\n"; yield "data: [DONE]\n\n"
328
+ raise e
329
+ return StreamingResponse(_gemini_real_stream_generator_inner(), media_type="text/event-stream")
330
+ else:
331
+ response_obj_call = await gemini_model_instance.generate_content_async(contents=actual_prompt_for_call, generation_config=gen_config_for_call)
332
+ if hasattr(response_obj_call, 'prompt_feedback') and hasattr(response_obj_call.prompt_feedback, 'block_reason') and response_obj_call.prompt_feedback.block_reason:
333
+ block_msg = f"Blocked (Gemini): {response_obj_call.prompt_feedback.block_reason}"
334
+ if hasattr(response_obj_call.prompt_feedback,'block_reason_message') and response_obj_call.prompt_feedback.block_reason_message: block_msg+=f" ({response_obj_call.prompt_feedback.block_reason_message})"
335
+ raise ValueError(block_msg)
336
+ if not is_gemini_response_valid(response_obj_call): raise ValueError(f"Invalid non-streaming Gemini response for '{gemini_model_instance.model_name}'. Resp: {str(response_obj_call)[:200]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  return JSONResponse(content=convert_to_openai_format(response_obj_call, request_obj.model))
app/message_processing.py CHANGED
@@ -3,10 +3,12 @@ import re
3
  import json
4
  import time
5
  import urllib.parse
6
- from typing import List, Dict, Any, Union, Literal # Optional removed
7
 
8
  from google.genai import types
9
- from models import OpenAIMessage, ContentPartText, ContentPartImage # Changed from relative
 
 
10
 
11
  # Define supported roles for Gemini API
12
  SUPPORTED_ROLES = ["user", "model"]
@@ -519,4 +521,94 @@ def create_final_chunk(model: str, response_id: str, candidate_count: int = 1) -
519
  "model": model,
520
  "choices": choices
521
  }
522
- return f"data: {json.dumps(final_chunk)}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import json
4
  import time
5
  import urllib.parse
6
+ from typing import List, Dict, Any, Union, Literal
7
 
8
  from google.genai import types
9
+ from google.genai.types import HttpOptions as GenAIHttpOptions # Renamed to avoid conflict if HttpOptions is used elsewhere
10
+ from google import genai as google_genai_client # For instantiating client in tokenizer
11
+ from models import OpenAIMessage, ContentPartText, ContentPartImage
12
 
13
  # Define supported roles for Gemini API
14
  SUPPORTED_ROLES = ["user", "model"]
 
521
  "model": model,
522
  "choices": choices
523
  }
524
+ return f"data: {json.dumps(final_chunk)}\n\n"
525
+
526
+ def split_text_by_completion_tokens(
527
+ gcp_creds: Any,
528
+ gcp_proj_id: str,
529
+ gcp_loc: str,
530
+ model_id_for_tokenizer: str,
531
+ full_text_to_tokenize: str,
532
+ num_completion_tokens_from_usage: int
533
+ ) -> tuple[str, str, List[str]]:
534
+ """
535
+ Splits a given text into reasoning and actual content based on a number of completion tokens.
536
+ Uses Google's tokenizer. This is a synchronous function.
537
+ Args:
538
+ gcp_creds: GCP credentials.
539
+ gcp_proj_id: GCP project ID.
540
+ gcp_loc: GCP location.
541
+ model_id_for_tokenizer: The base model ID (e.g., "gemini-1.5-pro") for the tokenizer.
542
+ full_text_to_tokenize: The full text string from the LLM.
543
+ num_completion_tokens_from_usage: The number of tokens designated as 'completion' by the LLM's usage stats.
544
+ Returns:
545
+ A tuple: (reasoning_text_str, actual_content_text_str, all_decoded_token_strings_list)
546
+ """
547
+ if not full_text_to_tokenize: # Handle empty input early
548
+ return "", "", []
549
+
550
+ try:
551
+ # This client is specifically for tokenization. Uses GenAIHttpOptions for api_version.
552
+ sync_tokenizer_client = google_genai_client.Client(
553
+ vertexai=True, credentials=gcp_creds, project=gcp_proj_id, location=gcp_loc,
554
+ http_options=GenAIHttpOptions(api_version="v1") # v1 is generally for compute_tokens
555
+ )
556
+
557
+ token_compute_response = sync_tokenizer_client.models.compute_tokens(
558
+ model=model_id_for_tokenizer, contents=full_text_to_tokenize
559
+ )
560
+
561
+ all_final_token_strings = []
562
+ if token_compute_response.tokens_info:
563
+ for token_info_item in token_compute_response.tokens_info:
564
+ for api_token_bytes in token_info_item.tokens:
565
+ # Attempt to decode from base64 first, as Vertex sometimes returns b64 encoded tokens.
566
+ # Fallback to direct UTF-8 decoding if b64 fails.
567
+ intermediate_str = ""
568
+ try:
569
+ # Vertex's tokens via compute_tokens for some models are plain UTF-8 strings,
570
+ # but sometimes they might be base64 encoded representations of bytes.
571
+ # The provided code in chat_api.py does a b64decode on a utf-8 string.
572
+ # Let's assume api_token_bytes is indeed bytes that represent a b64 string of the *actual* token bytes.
573
+ # This seems overly complex based on typical SDKs, but following existing pattern.
574
+ # More commonly, api_token_bytes would *be* the token bytes directly.
575
+ # If api_token_bytes is already text:
576
+ if isinstance(api_token_bytes, str):
577
+ intermediate_str = api_token_bytes
578
+ else: # Assuming it's bytes
579
+ intermediate_str = api_token_bytes.decode('utf-8', errors='replace')
580
+
581
+ final_token_text = ""
582
+ # Attempt to decode what we think is a base64 string
583
+ b64_decoded_bytes = base64.b64decode(intermediate_str)
584
+ final_token_text = b64_decoded_bytes.decode('utf-8', errors='replace')
585
+ except Exception:
586
+ # If b64decode fails, assume intermediate_str was the actual token text
587
+ final_token_text = intermediate_str
588
+ all_final_token_strings.append(final_token_text)
589
+
590
+ if not all_final_token_strings: # Should not happen if full_text_to_tokenize was not empty
591
+ # print(f"DEBUG_TOKEN_SPLIT: No tokens found for: '{full_text_to_tokenize[:50]}...'")
592
+ return "", full_text_to_tokenize, []
593
+
594
+ # Validate num_completion_tokens_from_usage
595
+ if not (0 < num_completion_tokens_from_usage <= len(all_final_token_strings)):
596
+ # print(f"WARNING_TOKEN_SPLIT: num_completion_tokens_from_usage ({num_completion_tokens_from_usage}) is invalid or out of bounds for total client-tokenized tokens ({len(all_final_token_strings)}). Full text returned as 'content'.")
597
+ # Return the text as re-joined by our tokenizer, not the original full_text_to_tokenize,
598
+ # as the tokenization process itself might subtly alter it (e.g. space handling, special chars).
599
+ return "", "".join(all_final_token_strings), all_final_token_strings
600
+
601
+ # Split tokens
602
+ completion_part_tokens = all_final_token_strings[-num_completion_tokens_from_usage:]
603
+ reasoning_part_tokens = all_final_token_strings[:-num_completion_tokens_from_usage]
604
+
605
+ reasoning_output_str = "".join(reasoning_part_tokens)
606
+ completion_output_str = "".join(completion_part_tokens)
607
+
608
+ # print(f"DEBUG_TOKEN_SPLIT: Reasoning: '{reasoning_output_str[:50]}...', Content: '{completion_output_str[:50]}...'")
609
+ return reasoning_output_str, completion_output_str, all_final_token_strings
610
+
611
+ except Exception as e_tok:
612
+ print(f"ERROR: Tokenizer failed in split_text_by_completion_tokens: {e_tok}")
613
+ # Fallback: no reasoning, original full text as content, empty token list
614
+ return "", full_text_to_tokenize, []
app/routes/chat_api.py CHANGED
@@ -22,12 +22,14 @@ from model_loader import get_vertex_models, get_vertex_express_models # Import f
22
  from message_processing import (
23
  create_gemini_prompt,
24
  create_encrypted_gemini_prompt,
25
- create_encrypted_full_gemini_prompt
 
26
  )
27
  from api_helpers import (
28
  create_generation_config,
29
  create_openai_error_response,
30
- execute_gemini_call
 
31
  )
32
 
33
  router = APIRouter()
@@ -222,72 +224,83 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
222
  }
223
 
224
  if request.stream:
225
- async def openai_stream_generator():
226
- try:
227
- stream_response = await openai_client.chat.completions.create(
228
- **openai_params,
229
- extra_body=openai_extra_body
230
- )
231
- async for chunk in stream_response:
232
- try:
233
- chunk_as_dict = chunk.model_dump(exclude_unset=True, exclude_none=True)
234
-
235
- # Safely navigate and check for thought flag
236
- choices = chunk_as_dict.get('choices')
237
- if choices and isinstance(choices, list) and len(choices) > 0:
238
- delta = choices[0].get('delta')
239
- if delta and isinstance(delta, dict):
240
- extra_content = delta.get('extra_content')
241
- if isinstance(extra_content, dict):
242
- google_content = extra_content.get('google')
243
- if isinstance(google_content, dict) and google_content.get('thought') is True:
244
- # This is a thought chunk, modify chunk_as_dict's delta in place
245
- reasoning_text = delta.get('content')
246
- if reasoning_text is not None:
247
- delta['reasoning_content'] = reasoning_text
248
-
249
- if 'content' in delta:
250
- del delta['content']
251
-
252
- # Always delete extra_content for thought chunks
253
- if 'extra_content' in delta:
254
- del delta['extra_content']
255
-
256
- # Yield the (potentially modified) dictionary as JSON
257
- print(chunk_as_dict)
258
- yield f"data: {json.dumps(chunk_as_dict)}\n\n"
259
-
260
- except Exception as chunk_processing_error: # Catch errors from dict manipulation or json.dumps
261
- error_msg_chunk = f"Error processing or serializing OpenAI chunk for {request.model}: {str(chunk_processing_error)}. Chunk: {str(chunk)[:200]}"
262
- print(f"ERROR: {error_msg_chunk}")
263
- # Truncate
264
- if len(error_msg_chunk) > 1024:
265
- error_msg_chunk = error_msg_chunk[:1024] + "..."
266
- error_response_chunk = create_openai_error_response(500, error_msg_chunk, "server_error")
267
- json_payload_for_chunk_error = json.dumps(error_response_chunk) # Ensure json is imported
268
- print(f"DEBUG: Yielding chunk processing error JSON payload (OpenAI path): {json_payload_for_chunk_error}")
269
- yield f"data: {json_payload_for_chunk_error}\n\n"
270
- yield "data: [DONE]\n\n"
271
- return # Stop further processing for this request
272
- yield "data: [DONE]\n\n"
273
- except Exception as stream_error:
274
- original_error_message = str(stream_error)
275
- # Truncate very long error messages
276
- if len(original_error_message) > 1024:
277
- original_error_message = original_error_message[:1024] + "..."
278
-
279
- error_msg_stream = f"Error during OpenAI client streaming for {request.model}: {original_error_message}"
280
- print(f"ERROR: {error_msg_stream}")
281
-
282
- error_response_content = create_openai_error_response(500, error_msg_stream, "server_error")
283
- json_payload_for_stream_error = json.dumps(error_response_content)
284
- print(f"DEBUG: Yielding stream error JSON payload (OpenAI path): {json_payload_for_stream_error}")
285
- yield f"data: {json_payload_for_stream_error}\n\n"
286
- yield "data: [DONE]\n\n"
287
- return StreamingResponse(openai_stream_generator(), media_type="text/event-stream")
288
- else: # Not streaming
 
 
 
 
 
 
 
 
289
  try:
 
 
290
  response = await openai_client.chat.completions.create(
 
291
  **openai_params,
292
  extra_body=openai_extra_body
293
  )
@@ -312,55 +325,19 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
312
  if isinstance(vertex_completion_tokens, int) and vertex_completion_tokens > 0:
313
  full_content = message_dict.get('content')
314
  if isinstance(full_content, str) and full_content:
315
-
316
- def _get_token_strings_and_split_texts_sync(creds, proj_id, loc, model_id_for_tokenizer, text_to_tokenize, num_completion_tokens_from_usage):
317
- sync_tokenizer_client = genai.Client(
318
- vertexai=True, credentials=creds, project=proj_id, location=loc,
319
- http_options=HttpOptions(api_version="v1")
320
- )
321
- if not text_to_tokenize: return "", text_to_tokenize, [] # No reasoning, original content, empty token list
322
-
323
- token_compute_response = sync_tokenizer_client.models.compute_tokens(
324
- model=model_id_for_tokenizer, contents=text_to_tokenize
325
- )
326
-
327
- all_final_token_strings = []
328
- if token_compute_response.tokens_info:
329
- for token_info_item in token_compute_response.tokens_info:
330
- for api_token_bytes in token_info_item.tokens:
331
- intermediate_str = api_token_bytes.decode('utf-8', errors='replace')
332
- final_token_text = ""
333
- try:
334
- b64_decoded_bytes = base64.b64decode(intermediate_str)
335
- final_token_text = b64_decoded_bytes.decode('utf-8', errors='replace')
336
- except Exception:
337
- final_token_text = intermediate_str
338
- all_final_token_strings.append(final_token_text)
339
-
340
- if not all_final_token_strings: # Should not happen if text_to_tokenize is not empty
341
- return "", text_to_tokenize, []
342
-
343
- if not (0 < num_completion_tokens_from_usage <= len(all_final_token_strings)):
344
- print(f"WARNING_TOKEN_SPLIT: num_completion_tokens_from_usage ({num_completion_tokens_from_usage}) is invalid for total client-tokenized tokens ({len(all_final_token_strings)}). Returning full content as 'content'.")
345
- return "", "".join(all_final_token_strings), all_final_token_strings
346
-
347
- completion_part_tokens = all_final_token_strings[-num_completion_tokens_from_usage:]
348
- reasoning_part_tokens = all_final_token_strings[:-num_completion_tokens_from_usage]
349
-
350
- reasoning_output_str = "".join(reasoning_part_tokens)
351
- completion_output_str = "".join(completion_part_tokens)
352
-
353
- return reasoning_output_str, completion_output_str, all_final_token_strings
354
-
355
  model_id_for_tokenizer = base_model_name
356
 
357
  reasoning_text, actual_content, dbg_all_tokens = await asyncio.to_thread(
358
- _get_token_strings_and_split_texts_sync,
359
- rotated_credentials, PROJECT_ID, LOCATION,
360
- model_id_for_tokenizer, full_content, vertex_completion_tokens
 
 
 
 
361
  )
362
 
363
- message_dict['content'] = actual_content # Set the new content (potentially from joined tokens)
364
  if reasoning_text: # Only add reasoning_content if it's not empty
365
  message_dict['reasoning_content'] = reasoning_text
366
  print(f"DEBUG_REASONING_SPLIT_DIRECT_JOIN: Successful. Reasoning len: {len(reasoning_text)}. Content len: {len(actual_content)}")
 
22
  from message_processing import (
23
  create_gemini_prompt,
24
  create_encrypted_gemini_prompt,
25
+ create_encrypted_full_gemini_prompt,
26
+ split_text_by_completion_tokens # Added
27
  )
28
  from api_helpers import (
29
  create_generation_config,
30
  create_openai_error_response,
31
+ execute_gemini_call,
32
+ openai_fake_stream_generator # Added
33
  )
34
 
35
  router = APIRouter()
 
224
  }
225
 
226
  if request.stream:
227
+ if app_config.FAKE_STREAMING_ENABLED:
228
+ print(f"INFO: OpenAI Fake Streaming (SSE Simulation) ENABLED for model '{request.model}'.")
229
+ # openai_params already has "stream": True from initial setup,
230
+ # but openai_fake_stream_generator will make a stream=False call internally.
231
+ # Call the now async generator
232
+ return StreamingResponse(
233
+ await openai_fake_stream_generator( # Added await
234
+ openai_client=openai_client,
235
+ openai_params=openai_params,
236
+ openai_extra_body=openai_extra_body,
237
+ request_obj=request,
238
+ is_auto_attempt=False,
239
+ # --- New parameters for tokenizer and reasoning split ---
240
+ gcp_credentials=rotated_credentials,
241
+ gcp_project_id=PROJECT_ID, # This is rotated_project_id
242
+ gcp_location=LOCATION, # This is "global"
243
+ base_model_id_for_tokenizer=base_model_name # Stripped model ID for tokenizer
244
+ ),
245
+ media_type="text/event-stream"
246
+ )
247
+ else: # Regular OpenAI streaming
248
+ print(f"INFO: OpenAI True Streaming ENABLED for model '{request.model}'.")
249
+ async def openai_true_stream_generator(): # Renamed to avoid conflict
250
+ try:
251
+ # Ensure stream=True is explicitly passed for real streaming
252
+ openai_params_for_true_stream = {**openai_params, "stream": True}
253
+ stream_response = await openai_client.chat.completions.create(
254
+ **openai_params_for_true_stream,
255
+ extra_body=openai_extra_body
256
+ )
257
+ async for chunk in stream_response:
258
+ try:
259
+ chunk_as_dict = chunk.model_dump(exclude_unset=True, exclude_none=True)
260
+
261
+ choices = chunk_as_dict.get('choices')
262
+ if choices and isinstance(choices, list) and len(choices) > 0:
263
+ delta = choices[0].get('delta')
264
+ if delta and isinstance(delta, dict):
265
+ extra_content = delta.get('extra_content')
266
+ if isinstance(extra_content, dict):
267
+ google_content = extra_content.get('google')
268
+ if isinstance(google_content, dict) and google_content.get('thought') is True:
269
+ reasoning_text = delta.get('content')
270
+ if reasoning_text is not None:
271
+ delta['reasoning_content'] = reasoning_text
272
+ if 'content' in delta: del delta['content']
273
+ if 'extra_content' in delta: del delta['extra_content']
274
+
275
+ # print(f"DEBUG OpenAI Stream Chunk: {chunk_as_dict}") # Potential verbose log
276
+ yield f"data: {json.dumps(chunk_as_dict)}\n\n"
277
+
278
+ except Exception as chunk_processing_error:
279
+ error_msg_chunk = f"Error processing/serializing OpenAI chunk for {request.model}: {str(chunk_processing_error)}. Chunk: {str(chunk)[:200]}"
280
+ print(f"ERROR: {error_msg_chunk}")
281
+ if len(error_msg_chunk) > 1024: error_msg_chunk = error_msg_chunk[:1024] + "..."
282
+ error_response_chunk = create_openai_error_response(500, error_msg_chunk, "server_error")
283
+ json_payload_for_chunk_error = json.dumps(error_response_chunk)
284
+ yield f"data: {json_payload_for_chunk_error}\n\n"
285
+ yield "data: [DONE]\n\n"
286
+ return
287
+ yield "data: [DONE]\n\n"
288
+ except Exception as stream_error:
289
+ original_error_message = str(stream_error)
290
+ if len(original_error_message) > 1024: original_error_message = original_error_message[:1024] + "..."
291
+ error_msg_stream = f"Error during OpenAI client true streaming for {request.model}: {original_error_message}"
292
+ print(f"ERROR: {error_msg_stream}")
293
+ error_response_content = create_openai_error_response(500, error_msg_stream, "server_error")
294
+ json_payload_for_stream_error = json.dumps(error_response_content)
295
+ yield f"data: {json_payload_for_stream_error}\n\n"
296
+ yield "data: [DONE]\n\n"
297
+ return StreamingResponse(openai_true_stream_generator(), media_type="text/event-stream")
298
+ else: # Not streaming (is_openai_direct_model and not request.stream)
299
  try:
300
+ # Ensure stream=False is explicitly passed for non-streaming
301
+ openai_params_for_non_stream = {**openai_params, "stream": False}
302
  response = await openai_client.chat.completions.create(
303
+ **openai_params_for_non_stream,
304
  **openai_params,
305
  extra_body=openai_extra_body
306
  )
 
325
  if isinstance(vertex_completion_tokens, int) and vertex_completion_tokens > 0:
326
  full_content = message_dict.get('content')
327
  if isinstance(full_content, str) and full_content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  model_id_for_tokenizer = base_model_name
329
 
330
  reasoning_text, actual_content, dbg_all_tokens = await asyncio.to_thread(
331
+ split_text_by_completion_tokens, # Use imported function
332
+ rotated_credentials,
333
+ PROJECT_ID,
334
+ LOCATION,
335
+ model_id_for_tokenizer,
336
+ full_content,
337
+ vertex_completion_tokens
338
  )
339
 
340
+ message_dict['content'] = actual_content
341
  if reasoning_text: # Only add reasoning_content if it's not empty
342
  message_dict['reasoning_content'] = reasoning_text
343
  print(f"DEBUG_REASONING_SPLIT_DIRECT_JOIN: Successful. Reasoning len: {len(reasoning_text)}. Content len: {len(actual_content)}")