Spaces:
Running
Running
Commit
·
bee94f0
1
Parent(s):
fb7432b
testing openai fake streaming and reasoning
Browse files
- app/api_helpers.py +277 -154
- app/message_processing.py +95 -3
- app/routes/chat_api.py +87 -110
app/api_helpers.py
CHANGED
@@ -2,17 +2,25 @@ import json
|
|
2 |
import time
|
3 |
import math
|
4 |
import asyncio
|
5 |
-
|
6 |
-
from
|
7 |
|
|
|
8 |
from google.auth.transport.requests import Request as AuthRequest
|
9 |
-
from google.genai import types
|
10 |
-
from google import
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
from
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def create_openai_error_response(status_code: int, message: str, error_type: str) -> Dict[str, Any]:
|
18 |
return {
|
@@ -44,171 +52,286 @@ def create_generation_config(request: OpenAIRequest) -> Dict[str, Any]:
|
|
44 |
]
|
45 |
return config
|
46 |
|
47 |
-
def
|
48 |
-
if response is None:
|
49 |
-
|
50 |
-
return False
|
51 |
-
|
52 |
-
# Check for direct text attribute
|
53 |
-
if hasattr(response, 'text') and isinstance(response.text, str) and response.text.strip():
|
54 |
-
# print("DEBUG: Response valid due to response.text")
|
55 |
-
return True
|
56 |
-
|
57 |
-
# Check candidates for text content
|
58 |
if hasattr(response, 'candidates') and response.candidates:
|
59 |
-
for candidate in response.candidates:
|
60 |
-
if hasattr(candidate, 'text') and isinstance(candidate.text, str) and candidate.text.strip():
|
61 |
-
# print(f"DEBUG: Response valid due to candidate.text in candidate")
|
62 |
-
return True
|
63 |
if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
|
64 |
for part in candidate.content.parts:
|
65 |
-
if hasattr(part, 'text') and isinstance(part.text, str) and part.text.strip():
|
66 |
-
# print(f"DEBUG: Response valid due to part.text in candidate's content part")
|
67 |
-
return True
|
68 |
-
|
69 |
-
# Removed prompt_feedback as a sole criterion for validity.
|
70 |
-
# It should only be valid if actual text content is found.
|
71 |
-
# Block reasons will be checked explicitly by callers if they need to treat it as an error for retries.
|
72 |
-
print("DEBUG: Response is invalid, no usable text content found by is_response_valid.")
|
73 |
return False
|
74 |
|
75 |
-
async def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
response_id = f"chatcmpl-{int(time.time())}"
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
)
|
83 |
)
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
yield create_final_chunk(request_obj.model, response_id)
|
135 |
yield "data: [DONE]\n\n"
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
err_resp = create_openai_error_response(500, err_msg, "server_error")
|
140 |
-
# It's good practice to log the JSON payload here too for consistency,
|
141 |
-
# though the main concern was the true streaming path.
|
142 |
-
json_payload_for_fake_stream_error = json.dumps(err_resp)
|
143 |
-
# Log the error JSON that WOULD have been sent if not in auto-mode or if this was the final error handler.
|
144 |
-
print(f"DEBUG: Internal error in fake_stream_generator. JSON error for handler: {json_payload_for_fake_stream_error}")
|
145 |
-
if not is_auto_attempt:
|
146 |
-
yield f"data: {json_payload_for_fake_stream_error}\n\n"
|
147 |
-
yield "data: [DONE]\n\n"
|
148 |
-
raise e # Re-raise the original exception e
|
149 |
-
return fake_stream_inner()
|
150 |
|
151 |
async def execute_gemini_call(
|
152 |
-
current_client: Any,
|
153 |
-
model_to_call: str,
|
154 |
prompt_func: Callable[[List[OpenAIMessage]], Union[types.Content, List[types.Content]]],
|
155 |
-
gen_config_for_call: Dict[str, Any],
|
156 |
-
request_obj: OpenAIRequest, # Pass the whole request object
|
157 |
is_auto_attempt: bool = False
|
158 |
):
|
159 |
actual_prompt_for_call = prompt_func(request_obj.messages)
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
if request_obj.stream:
|
162 |
if app_config.FAKE_STREAMING_ENABLED:
|
163 |
-
return StreamingResponse(
|
164 |
-
|
165 |
-
|
166 |
-
)
|
167 |
-
|
168 |
-
response_id_for_stream = f"chatcmpl-{int(time.time())}"
|
169 |
-
cand_count_stream = request_obj.n or 1
|
170 |
-
|
171 |
-
async def _stream_generator_inner_for_execute(): # Renamed to avoid potential clashes
|
172 |
try:
|
173 |
-
for
|
174 |
-
|
175 |
-
model=model_to_call, contents=actual_prompt_for_call, config=gen_config_for_call
|
176 |
-
):
|
177 |
-
yield convert_chunk_to_openai(chunk_item_call, request_obj.model, response_id_for_stream, c_idx_call)
|
178 |
yield create_final_chunk(request_obj.model, response_id_for_stream, cand_count_stream)
|
179 |
yield "data: [DONE]\n\n"
|
180 |
-
except Exception as
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
else:
|
198 |
-
response_obj_call = await current_client.aio.models.generate_content(
|
199 |
-
model=model_to_call, contents=actual_prompt_for_call, config=gen_config_for_call
|
200 |
-
)
|
201 |
-
|
202 |
-
# Check for safety blocks first for non-streaming calls
|
203 |
-
if hasattr(response_obj_call, 'prompt_feedback') and \
|
204 |
-
hasattr(response_obj_call.prompt_feedback, 'block_reason') and \
|
205 |
-
response_obj_call.prompt_feedback.block_reason:
|
206 |
-
block_message = f"Response blocked by safety filter: {response_obj_call.prompt_feedback.block_reason}"
|
207 |
-
if hasattr(response_obj_call.prompt_feedback, 'block_reason_message') and response_obj_call.prompt_feedback.block_reason_message:
|
208 |
-
block_message = f"Response blocked by safety filter: {response_obj_call.prompt_feedback.block_reason_message} (Reason: {response_obj_call.prompt_feedback.block_reason})"
|
209 |
-
print(f"DEBUG: {block_message} (in execute_gemini_call non-streaming)") # Log this specific condition
|
210 |
-
raise ValueError(block_message)
|
211 |
-
|
212 |
-
if not is_response_valid(response_obj_call): # is_response_valid now only checks for actual text
|
213 |
-
raise ValueError("Invalid/empty response from non-streaming Gemini call (no text content).")
|
214 |
return JSONResponse(content=convert_to_openai_format(response_obj_call, request_obj.model))
|
|
|
2 |
import time
|
3 |
import math
|
4 |
import asyncio
|
5 |
+
import base64 # Added for tokenizer logic
|
6 |
+
from typing import List, Dict, Any, Callable, Union, Optional
|
7 |
|
8 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
9 |
from google.auth.transport.requests import Request as AuthRequest
|
10 |
+
from google.genai import types
|
11 |
+
from google.genai.types import HttpOptions # Added for tokenizer logic
|
12 |
+
from google import genai
|
13 |
+
from openai import AsyncOpenAI
|
14 |
|
15 |
+
from models import OpenAIRequest, OpenAIMessage
|
16 |
+
from message_processing import (
|
17 |
+
deobfuscate_text,
|
18 |
+
convert_to_openai_format,
|
19 |
+
convert_chunk_to_openai,
|
20 |
+
create_final_chunk,
|
21 |
+
split_text_by_completion_tokens # Added
|
22 |
+
)
|
23 |
+
import config as app_config
|
24 |
|
25 |
def create_openai_error_response(status_code: int, message: str, error_type: str) -> Dict[str, Any]:
|
26 |
return {
|
|
|
52 |
]
|
53 |
return config
|
54 |
|
55 |
+
def _has_usable_text(obj: Any) -> bool:
    """Return True when ``obj.text`` is a string with non-whitespace content.

    The attribute is read defensively: ``hasattr`` only swallows
    ``AttributeError``, so a ``text`` property that raises something else
    (e.g. ``ValueError`` when a response carries no parts) would otherwise
    escape the validator instead of simply counting as "no text".
    """
    try:
        text = getattr(obj, 'text', None)
    except (ValueError, IndexError):
        return False
    return isinstance(text, str) and bool(text.strip())


def is_gemini_response_valid(response: Any) -> bool:
    """Report whether a Gemini response carries any non-blank text.

    Text is looked for, in order, on the response itself, on each candidate,
    and on each part of each candidate's content.

    Args:
        response: The response object to inspect; may be ``None``.

    Returns:
        ``True`` as soon as one non-blank text string is found, else ``False``.
    """
    if response is None:
        return False
    if _has_usable_text(response):
        return True

    for candidate in getattr(response, 'candidates', None) or []:
        if _has_usable_text(candidate):
            return True
        content = getattr(candidate, 'content', None)
        for part in getattr(content, 'parts', None) or []:
            if _has_usable_text(part):
                return True
    return False
|
65 |
|
66 |
+
async def _base_fake_stream_engine(
    api_call_task_creator: Callable[[], asyncio.Task],
    extract_text_from_response_func: Callable[[Any], str],  # To get the *full* text before splitting
    response_id: str,
    sse_model_name: str,
    is_auto_attempt: bool,
    is_valid_response_func: Callable[[Any], bool],
    process_text_func: Optional[Callable[[str, str], str]] = None,
    check_block_reason_func: Optional[Callable[[Any], None]] = None,
    # Pre-split content (optional)
    reasoning_text_to_yield: Optional[str] = None,
    actual_content_text_to_yield: Optional[str] = None,
    keep_alive_interval_seconds: Optional[float] = None,
):
    """Simulate streaming: wait for a non-streaming call, then emit it as SSE chunks.

    While the task returned by ``api_call_task_creator`` is pending, keep-alive
    chunks are yielded. Once it finishes, the response is block-checked and
    validated, an optional reasoning chunk is emitted, the content is sliced
    into roughly ten pieces (at least 20 characters each), and the stream is
    closed with the final chunk and ``[DONE]``.

    Args:
        api_call_task_creator: Zero-argument callable returning the awaitable
            task (must expose ``done()``) that resolves to the API response.
        extract_text_from_response_func: Pulls the full text out of the
            response; only used when ``actual_content_text_to_yield`` is None.
        response_id: ``id`` placed on every content chunk.
        sse_model_name: ``model`` placed on every chunk; also passed to
            ``process_text_func``.
        is_auto_attempt: When True, failures are re-raised without emitting an
            error event on the stream.
        is_valid_response_func: Returns False to reject the response.
        process_text_func: Optional post-processing of the content text.
        check_block_reason_func: Optional hook that raises if the response
            was blocked.
        reasoning_text_to_yield: Optional text emitted first as
            ``reasoning_content``.
        actual_content_text_to_yield: Optional pre-extracted content text.
        keep_alive_interval_seconds: Seconds between keep-alive chunks.
            ``None`` uses ``app_config.FAKE_STREAMING_INTERVAL_SECONDS``; a
            value <= 0 disables keep-alives and simply awaits the task.

    Yields:
        SSE-formatted ``data: ...`` strings.

    Raises:
        Exception: Any failure while awaiting or processing the response is
            re-raised after the (optional) error event has been yielded.
    """
    if keep_alive_interval_seconds is None:
        keep_alive_interval_seconds = app_config.FAKE_STREAMING_INTERVAL_SECONDS

    def _content_chunk(delta: Dict[str, Any]) -> str:
        # One SSE line carrying a single-choice delta for this response.
        payload = {"id": response_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": sse_model_name, "choices": [{"index": 0, "delta": delta, "finish_reason": None}]}
        return f"data: {json.dumps(payload)}\n\n"

    api_call_task = api_call_task_creator()
    try:
        if keep_alive_interval_seconds > 0:
            while not api_call_task.done():
                keep_alive_data = {"id": "chatcmpl-keepalive", "object": "chat.completion.chunk", "created": int(time.time()), "model": sse_model_name, "choices": [{"delta": {"reasoning_content": ""}, "index": 0, "finish_reason": None}]}
                yield f"data: {json.dumps(keep_alive_data)}\n\n"
                await asyncio.sleep(keep_alive_interval_seconds)

        try:
            full_api_response = await api_call_task

            if check_block_reason_func:
                check_block_reason_func(full_api_response)

            if not is_valid_response_func(full_api_response):
                raise ValueError(f"Invalid/empty response in fake stream for model {sse_model_name} (validation failed): {str(full_api_response)[:200]}")

            # Prefer pre-split content; otherwise extract it from the response.
            if actual_content_text_to_yield is not None:
                content_to_chunk = actual_content_text_to_yield
            else:
                content_to_chunk = extract_text_from_response_func(full_api_response)
            if process_text_func:
                content_to_chunk = process_text_func(content_to_chunk, sse_model_name)

            # Reasoning goes out first, in a single chunk.
            if reasoning_text_to_yield:
                yield _content_chunk({"reasoning_content": reasoning_text_to_yield})
                await asyncio.sleep(0.05)  # Small delay after reasoning

            if not content_to_chunk:
                # Empty (or None) content: emit one empty delta. Falling into
                # the slicing loop here would call range() with a step of 0.
                yield _content_chunk({"content": ""})
            else:
                chunk_size = max(20, math.ceil(len(content_to_chunk) / 10))
                multiple_chunks = len(content_to_chunk) > chunk_size
                for start in range(0, len(content_to_chunk), chunk_size):
                    yield _content_chunk({"content": content_to_chunk[start:start + chunk_size]})
                    if multiple_chunks:
                        await asyncio.sleep(0.05)

            yield create_final_chunk(sse_model_name, response_id)
            yield "data: [DONE]\n\n"

        except Exception as e:
            err_msg_detail = f"Error in _base_fake_stream_engine (model: '{sse_model_name}'): {type(e).__name__} - {str(e)}"
            print(f"ERROR: {err_msg_detail}")
            sse_err_msg_display = str(e)
            if len(sse_err_msg_display) > 512:
                sse_err_msg_display = sse_err_msg_display[:512] + "..."
            err_resp_for_sse = create_openai_error_response(500, sse_err_msg_display, "server_error")
            json_payload_for_fake_stream_error = json.dumps(err_resp_for_sse)
            if not is_auto_attempt:
                yield f"data: {json_payload_for_fake_stream_error}\n\n"
                yield "data: [DONE]\n\n"
            raise
    finally:
        # If the consumer went away mid-stream, don't leave the call running.
        if not api_call_task.done():
            api_call_task.cancel()
|
142 |
+
|
143 |
+
def gemini_fake_stream_generator(
    gemini_model_instance: genai.GenerativeModel,
    prompt_for_api_call: Union[types.Content, List[types.Content]],
    gen_config_for_api_call: Dict[str, Any],
    request_obj: OpenAIRequest,
    is_auto_attempt: bool
):
    """Build the fake-streaming generator for a Gemini model.

    Wires Gemini-specific helpers (task creation, text extraction, text
    post-processing, block detection, validation) into
    ``_base_fake_stream_engine`` and returns the resulting async generator.

    Args:
        gemini_model_instance: Model object whose ``generate_content_async``
            performs the non-streaming call.
        prompt_for_api_call: Contents passed to the model.
        gen_config_for_api_call: Generation config passed to the model.
        request_obj: Incoming request; its ``model`` is used as the SSE model name.
        is_auto_attempt: Forwarded to the engine (suppresses error events).

    Returns:
        The async generator produced by ``_base_fake_stream_engine``.
    """
    print(f"FAKE STREAMING (Gemini): Prep for '{request_obj.model}' (API model: '{gemini_model_instance.model_name}')")

    def _create_gemini_api_task() -> asyncio.Task:
        return asyncio.create_task(gemini_model_instance.generate_content_async(contents=prompt_for_api_call, generation_config=gen_config_for_api_call))

    def _extract_gemini_text(response: Any) -> str:
        # Prefer response.text, then the first candidate's text, then the
        # concatenation of the first candidate's text parts.
        full_text = ""
        if hasattr(response, 'text') and response.text is not None:
            full_text = response.text
        elif hasattr(response, 'candidates') and response.candidates:
            candidate = response.candidates[0]
            if hasattr(candidate, 'text') and candidate.text is not None:
                full_text = candidate.text
            elif hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
                texts = [part.text for part in candidate.content.parts if hasattr(part, 'text') and part.text is not None]
                full_text = "".join(texts)
        return full_text

    def _process_gemini_text(text: str, sse_model_name: str) -> str:
        # Only the "-encrypt-full" model variants get their output deobfuscated.
        if sse_model_name.endswith("-encrypt-full"):
            return deobfuscate_text(text)
        return text

    def _check_gemini_block(response: Any):
        # Raise ValueError when prompt_feedback reports a block reason.
        if hasattr(response, 'prompt_feedback') and hasattr(response.prompt_feedback, 'block_reason') and response.prompt_feedback.block_reason:
            block_message = f"Response blocked by Gemini safety filter: {response.prompt_feedback.block_reason}"
            if hasattr(response.prompt_feedback, 'block_reason_message') and response.prompt_feedback.block_reason_message:
                block_message += f" (Message: {response.prompt_feedback.block_reason_message})"
            raise ValueError(block_message)

    response_id = f"chatcmpl-{int(time.time())}"
    # No keep-alive interval is passed: the engine reads
    # app_config.FAKE_STREAMING_INTERVAL_SECONDS itself, and passing a keyword
    # its signature does not declare raises TypeError at call time.
    # reasoning_text_to_yield / actual_content_text_to_yield are not used for Gemini.
    return _base_fake_stream_engine(
        api_call_task_creator=_create_gemini_api_task,
        extract_text_from_response_func=_extract_gemini_text,
        process_text_func=_process_gemini_text,
        check_block_reason_func=_check_gemini_block,
        is_valid_response_func=is_gemini_response_valid,
        response_id=response_id,
        sse_model_name=request_obj.model,
        is_auto_attempt=is_auto_attempt,
    )
|
184 |
+
|
185 |
+
async def openai_fake_stream_generator(  # async: awaits the API call and the tokenizer
    openai_client: AsyncOpenAI,
    openai_params: Dict[str, Any],
    openai_extra_body: Dict[str, Any],
    request_obj: OpenAIRequest,
    is_auto_attempt: bool,
    # Params for the tokenizer
    gcp_credentials: Any,
    gcp_project_id: str,
    gcp_location: str,
    base_model_id_for_tokenizer: str
):
    """Fake-stream an OpenAI-style completion, split into reasoning and content.

    Runs one non-streaming chat completion, splits its text with
    ``split_text_by_completion_tokens`` (in a worker thread), and replays the
    result through ``_base_fake_stream_engine``. Keep-alive chunks are yielded
    while the call-plus-split task is pending.

    Args:
        openai_client: Client used for ``chat.completions.create``.
        openai_params: Keyword arguments for the completion call.
        openai_extra_body: Passed as ``extra_body`` to the completion call.
        request_obj: Incoming request; its ``model`` labels every SSE chunk.
        is_auto_attempt: When True, no error event is written to the stream.
        gcp_credentials: Credentials forwarded to the tokenizer.
        gcp_project_id: Project ID forwarded to the tokenizer.
        gcp_location: Location forwarded to the tokenizer.
        base_model_id_for_tokenizer: Model ID forwarded to the tokenizer.

    Yields:
        SSE-formatted ``data: ...`` strings. Failures are reported on the
        stream (unless ``is_auto_attempt``) and are not re-raised.
    """
    api_model_name = openai_params.get("model", "unknown-openai-model")
    print(f"FAKE STREAMING (OpenAI): Prep for '{request_obj.model}' (API model: '{api_model_name}') with reasoning split.")

    response_id = f"chatcmpl-{int(time.time())}"

    async def _call_api_and_split():
        """Make the API call, then split its text into (reasoning, content)."""
        # 1. Non-streaming API call (an openai ChatCompletion object).
        raw_response = await openai_client.chat.completions.create(
            **openai_params, extra_body=openai_extra_body, stream=False
        )

        # 2. Extract full content and usage for splitting.
        full_content_from_api = ""
        if raw_response.choices and raw_response.choices[0].message and raw_response.choices[0].message.content is not None:
            full_content_from_api = raw_response.choices[0].message.content

        vertex_completion_tokens = 0
        if raw_response.usage and raw_response.usage.completion_tokens is not None:
            vertex_completion_tokens = raw_response.usage.completion_tokens

        reasoning_text = ""
        actual_content_text = full_content_from_api  # Default if split fails or is not applicable

        if full_content_from_api and vertex_completion_tokens > 0:
            # 3. The tokenizer is synchronous, so run it off the event loop.
            reasoning_text, actual_content_text, _ = await asyncio.to_thread(
                split_text_by_completion_tokens,
                gcp_credentials, gcp_project_id, gcp_location,
                base_model_id_for_tokenizer,
                full_content_from_api,
                vertex_completion_tokens
            )
            if reasoning_text:
                print(f"DEBUG_FAKE_REASONING_SPLIT: Success. Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content_text)}")

        # The raw response is kept for validation; the split text is what gets streamed.
        return raw_response, reasoning_text, actual_content_text

    combined_task = asyncio.create_task(_call_api_and_split())
    try:
        # Keep the connection alive while the combined operation runs.
        while not combined_task.done():
            keep_alive_data = {"id": "chatcmpl-keepalive", "object": "chat.completion.chunk", "created": int(time.time()), "model": request_obj.model, "choices": [{"delta": {"content": ""}, "index": 0, "finish_reason": None}]}
            yield f"data: {json.dumps(keep_alive_data)}\n\n"
            await asyncio.sleep(app_config.FAKE_STREAMING_INTERVAL_SECONDS)

        try:
            full_api_response, separated_reasoning_text, separated_actual_content_text = await combined_task

            def _extract_openai_full_text(response: Any) -> str:
                if response.choices and response.choices[0].message and response.choices[0].message.content is not None:
                    return response.choices[0].message.content
                return ""

            def _is_openai_response_valid(response: Any) -> bool:
                return bool(response.choices and response.choices[0].message is not None)

            def _already_resolved_response():
                # The result is already known. A pre-resolved future reports
                # done() immediately, so the engine emits no extra keep-alive
                # and adds no delay.
                resolved = asyncio.get_running_loop().create_future()
                resolved.set_result(full_api_response)
                return resolved

            async for chunk in _base_fake_stream_engine(
                api_call_task_creator=_already_resolved_response,
                extract_text_from_response_func=_extract_openai_full_text,
                is_valid_response_func=_is_openai_response_valid,
                response_id=response_id,
                sse_model_name=request_obj.model,
                # The handler below is the single place that reports errors on
                # the stream. Telling the engine this is an auto attempt makes
                # it re-raise without emitting its own error event, which
                # would otherwise be duplicated here.
                is_auto_attempt=True,
                reasoning_text_to_yield=separated_reasoning_text,
                actual_content_text_to_yield=separated_actual_content_text
            ):
                yield chunk

        except Exception as e_outer:  # Errors from the API call / split task or from the base engine
            err_msg_detail = f"Error in openai_fake_stream_generator outer (model: '{request_obj.model}'): {type(e_outer).__name__} - {str(e_outer)}"
            print(f"ERROR: {err_msg_detail}")
            sse_err_msg_display = str(e_outer)
            if len(sse_err_msg_display) > 512:
                sse_err_msg_display = sse_err_msg_display[:512] + "..."
            err_resp_sse = create_openai_error_response(500, sse_err_msg_display, "server_error")
            json_payload_error = json.dumps(err_resp_sse)
            if not is_auto_attempt:
                yield f"data: {json_payload_error}\n\n"
                yield "data: [DONE]\n\n"
            # No re-raise: the error has been reported on the stream.
            # NOTE(review): with is_auto_attempt=True the failure is only
            # logged and the stream ends without output, so the caller cannot
            # observe it. Confirm this is what auto-mode retry expects.
    finally:
        # Don't leave the API call running if the consumer closed the stream.
        if not combined_task.done():
            combined_task.cancel()
|
291 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
async def execute_gemini_call(
    current_client: Any, model_to_call: str,
    prompt_func: Callable[[List[OpenAIMessage]], Union[types.Content, List[types.Content]]],
    gen_config_for_call: Dict[str, Any], request_obj: OpenAIRequest,
    is_auto_attempt: bool = False
):
    """Run one Gemini call and wrap the result as a FastAPI response.

    Resolves a model object from ``current_client``, then either fake-streams,
    real-streams, or performs a plain non-streaming call depending on
    ``request_obj.stream`` and ``app_config.FAKE_STREAMING_ENABLED``.

    Args:
        current_client: Either an object exposing ``get_model`` or a
            ``genai.GenerativeModel`` instance.
        model_to_call: Name of the model to use.
        prompt_func: Builds the prompt contents from the request messages.
        gen_config_for_call: Generation config forwarded to the model.
        request_obj: The incoming OpenAI-style request.
        is_auto_attempt: When True, streaming errors are not written to the stream.

    Returns:
        A ``StreamingResponse`` for streaming requests, else a ``JSONResponse``.

    Raises:
        ValueError: If no model can be resolved, the response is blocked, or
            the non-streaming response has no usable text.
    """
    actual_prompt_for_call = prompt_func(request_obj.messages)

    # Resolve the model object from whichever kind of client was given.
    gemini_model_instance: Optional[genai.GenerativeModel] = None
    if hasattr(current_client, 'get_model') and callable(getattr(current_client, 'get_model')):
        try:
            gemini_model_instance = current_client.get_model(model_name=model_to_call)
        except Exception as e:
            raise ValueError(f"Could not get Gemini model '{model_to_call}' Express: {e}") from e
    elif isinstance(current_client, genai.GenerativeModel):
        if model_to_call not in current_client.model_name:
            print(f"WARNING: Mismatch! model_to_call='{model_to_call}', client.model_name='{current_client.model_name}'")
        gemini_model_instance = current_client
    else:
        raise ValueError(f"Unsupported current_client for Gemini: {type(current_client)}")
    if not gemini_model_instance:
        raise ValueError(f"Failed to get GeminiModel for '{model_to_call}'.")

    if not request_obj.stream:
        # Plain call: block check first, then text validation.
        response_obj_call = await gemini_model_instance.generate_content_async(contents=actual_prompt_for_call, generation_config=gen_config_for_call)
        feedback = getattr(response_obj_call, 'prompt_feedback', None)
        if getattr(feedback, 'block_reason', None):
            block_msg = f"Blocked (Gemini): {feedback.block_reason}"
            if getattr(feedback, 'block_reason_message', None):
                block_msg += f" ({feedback.block_reason_message})"
            raise ValueError(block_msg)
        if not is_gemini_response_valid(response_obj_call):
            raise ValueError(f"Invalid non-streaming Gemini response for '{gemini_model_instance.model_name}'. Resp: {str(response_obj_call)[:200]}")
        return JSONResponse(content=convert_to_openai_format(response_obj_call, request_obj.model))

    if app_config.FAKE_STREAMING_ENABLED:
        fake_stream = gemini_fake_stream_generator(gemini_model_instance, actual_prompt_for_call, gen_config_for_call, request_obj, is_auto_attempt)
        return StreamingResponse(fake_stream, media_type="text/event-stream")

    response_id_for_stream = f"chatcmpl-{int(time.time())}"
    cand_count_stream = request_obj.n or 1

    async def _gemini_real_stream_generator_inner():
        try:
            # NOTE(review): the call result is iterated directly with
            # `async for`; if generate_content_async is a coroutine function
            # it would need to be awaited first - confirm against the SDK in use.
            async for chunk_item_call in gemini_model_instance.generate_content_async(contents=actual_prompt_for_call, generation_config=gen_config_for_call, stream=True):
                yield convert_chunk_to_openai(chunk_item_call, request_obj.model, response_id_for_stream, 0)
            yield create_final_chunk(request_obj.model, response_id_for_stream, cand_count_stream)
            yield "data: [DONE]\n\n"
        except Exception as e:
            err_msg_detail_stream = f"Streaming Error (Gemini model: '{gemini_model_instance.model_name}'): {type(e).__name__} - {str(e)}"
            print(f"ERROR: {err_msg_detail_stream}")
            s_err = str(e)
            if len(s_err) > 1024:
                s_err = s_err[:1024] + "..."
            j_err = json.dumps(create_openai_error_response(500, s_err, "server_error"))
            if not is_auto_attempt:
                yield f"data: {j_err}\n\n"
                yield "data: [DONE]\n\n"
            raise

    return StreamingResponse(_gemini_real_stream_generator_inner(), media_type="text/event-stream")
|
app/message_processing.py
CHANGED
@@ -3,10 +3,12 @@ import re
|
|
3 |
import json
|
4 |
import time
|
5 |
import urllib.parse
|
6 |
-
from typing import List, Dict, Any, Union, Literal
|
7 |
|
8 |
from google.genai import types
|
9 |
-
from
|
|
|
|
|
10 |
|
11 |
# Define supported roles for Gemini API
|
12 |
SUPPORTED_ROLES = ["user", "model"]
|
@@ -519,4 +521,94 @@ def create_final_chunk(model: str, response_id: str, candidate_count: int = 1) -
|
|
519 |
"model": model,
|
520 |
"choices": choices
|
521 |
}
|
522 |
-
return f"data: {json.dumps(final_chunk)}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import json
|
4 |
import time
|
5 |
import urllib.parse
|
6 |
+
from typing import List, Dict, Any, Union, Literal
|
7 |
|
8 |
from google.genai import types
|
9 |
+
from google.genai.types import HttpOptions as GenAIHttpOptions # Renamed to avoid conflict if HttpOptions is used elsewhere
|
10 |
+
from google import genai as google_genai_client # For instantiating client in tokenizer
|
11 |
+
from models import OpenAIMessage, ContentPartText, ContentPartImage
|
12 |
|
13 |
# Define supported roles for Gemini API
|
14 |
SUPPORTED_ROLES = ["user", "model"]
|
|
|
521 |
"model": model,
|
522 |
"choices": choices
|
523 |
}
|
524 |
+
return f"data: {json.dumps(final_chunk)}\n\n"
|
525 |
+
|
526 |
+
def _token_to_bytes(api_token: Any) -> bytes:
    """Return the raw bytes of one tokenizer token.

    The token (str or bytes) is first treated as base64 text; if it does not
    decode as base64 it is taken to be the token itself.
    """
    if isinstance(api_token, str):
        token_text = api_token
        raw_bytes = api_token.encode('utf-8')
    else:
        raw_bytes = bytes(api_token)
        token_text = raw_bytes.decode('utf-8', errors='replace')
    try:
        # binascii.Error (bad base64) and the non-ASCII-str error are both ValueError.
        return base64.b64decode(token_text)
    except ValueError:
        return raw_bytes


def split_text_by_completion_tokens(
    gcp_creds: Any,
    gcp_proj_id: str,
    gcp_loc: str,
    model_id_for_tokenizer: str,
    full_text_to_tokenize: str,
    num_completion_tokens_from_usage: int
) -> tuple[str, str, List[str]]:
    """
    Splits a given text into reasoning and actual content based on a number of completion tokens.
    Uses Google's tokenizer. This is a synchronous function.

    The last ``num_completion_tokens_from_usage`` tokens form the content and
    everything before them forms the reasoning. Token bytes are joined before
    being decoded, so a multi-byte character that the tokenizer split across
    several tokens is reassembled instead of turning into replacement characters.

    Args:
        gcp_creds: GCP credentials.
        gcp_proj_id: GCP project ID.
        gcp_loc: GCP location.
        model_id_for_tokenizer: The base model ID (e.g., "gemini-1.5-pro") for the tokenizer.
        full_text_to_tokenize: The full text string from the LLM.
        num_completion_tokens_from_usage: The number of tokens designated as 'completion' by the LLM's usage stats.
    Returns:
        A tuple: (reasoning_text_str, actual_content_text_str, all_decoded_token_strings_list).
        If the token count is out of range, reasoning is empty and the content is the
        re-joined tokenizer output. On any tokenizer failure, reasoning is empty, the
        content is the original text, and the token list is empty.
    """
    if not full_text_to_tokenize:  # Handle empty input early
        return "", "", []

    def _join_decoded(parts: List[bytes]) -> str:
        return b"".join(parts).decode('utf-8', errors='replace')

    try:
        # This client is specifically for tokenization. Uses GenAIHttpOptions for api_version.
        sync_tokenizer_client = google_genai_client.Client(
            vertexai=True, credentials=gcp_creds, project=gcp_proj_id, location=gcp_loc,
            http_options=GenAIHttpOptions(api_version="v1")
        )
        token_compute_response = sync_tokenizer_client.models.compute_tokens(
            model=model_id_for_tokenizer, contents=full_text_to_tokenize
        )

        all_token_bytes: List[bytes] = []
        for token_info_item in token_compute_response.tokens_info or []:
            for api_token in token_info_item.tokens or []:
                all_token_bytes.append(_token_to_bytes(api_token))

        # Per-token strings are still reported for callers that inspect them.
        all_final_token_strings = [tok.decode('utf-8', errors='replace') for tok in all_token_bytes]

        if not all_final_token_strings:  # Should not happen if full_text_to_tokenize was not empty
            return "", full_text_to_tokenize, []

        if not (0 < num_completion_tokens_from_usage <= len(all_token_bytes)):
            # Return the text as re-joined from the tokenizer output rather
            # than the original input, since tokenization may subtly alter it.
            return "", _join_decoded(all_token_bytes), all_final_token_strings

        split_at = len(all_token_bytes) - num_completion_tokens_from_usage
        reasoning_output_str = _join_decoded(all_token_bytes[:split_at])
        completion_output_str = _join_decoded(all_token_bytes[split_at:])
        return reasoning_output_str, completion_output_str, all_final_token_strings

    except Exception as e_tok:
        print(f"ERROR: Tokenizer failed in split_text_by_completion_tokens: {e_tok}")
        # Fallback: no reasoning, original full text as content, empty token list
        return "", full_text_to_tokenize, []
|
app/routes/chat_api.py
CHANGED
@@ -22,12 +22,14 @@ from model_loader import get_vertex_models, get_vertex_express_models # Import f
|
|
22 |
from message_processing import (
|
23 |
create_gemini_prompt,
|
24 |
create_encrypted_gemini_prompt,
|
25 |
-
create_encrypted_full_gemini_prompt
|
|
|
26 |
)
|
27 |
from api_helpers import (
|
28 |
create_generation_config,
|
29 |
create_openai_error_response,
|
30 |
-
execute_gemini_call
|
|
|
31 |
)
|
32 |
|
33 |
router = APIRouter()
|
@@ -222,72 +224,83 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
|
|
222 |
}
|
223 |
|
224 |
if request.stream:
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
try:
|
|
|
|
|
290 |
response = await openai_client.chat.completions.create(
|
|
|
291 |
**openai_params,
|
292 |
extra_body=openai_extra_body
|
293 |
)
|
@@ -312,55 +325,19 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
|
|
312 |
if isinstance(vertex_completion_tokens, int) and vertex_completion_tokens > 0:
|
313 |
full_content = message_dict.get('content')
|
314 |
if isinstance(full_content, str) and full_content:
|
315 |
-
|
316 |
-
def _get_token_strings_and_split_texts_sync(creds, proj_id, loc, model_id_for_tokenizer, text_to_tokenize, num_completion_tokens_from_usage):
|
317 |
-
sync_tokenizer_client = genai.Client(
|
318 |
-
vertexai=True, credentials=creds, project=proj_id, location=loc,
|
319 |
-
http_options=HttpOptions(api_version="v1")
|
320 |
-
)
|
321 |
-
if not text_to_tokenize: return "", text_to_tokenize, [] # No reasoning, original content, empty token list
|
322 |
-
|
323 |
-
token_compute_response = sync_tokenizer_client.models.compute_tokens(
|
324 |
-
model=model_id_for_tokenizer, contents=text_to_tokenize
|
325 |
-
)
|
326 |
-
|
327 |
-
all_final_token_strings = []
|
328 |
-
if token_compute_response.tokens_info:
|
329 |
-
for token_info_item in token_compute_response.tokens_info:
|
330 |
-
for api_token_bytes in token_info_item.tokens:
|
331 |
-
intermediate_str = api_token_bytes.decode('utf-8', errors='replace')
|
332 |
-
final_token_text = ""
|
333 |
-
try:
|
334 |
-
b64_decoded_bytes = base64.b64decode(intermediate_str)
|
335 |
-
final_token_text = b64_decoded_bytes.decode('utf-8', errors='replace')
|
336 |
-
except Exception:
|
337 |
-
final_token_text = intermediate_str
|
338 |
-
all_final_token_strings.append(final_token_text)
|
339 |
-
|
340 |
-
if not all_final_token_strings: # Should not happen if text_to_tokenize is not empty
|
341 |
-
return "", text_to_tokenize, []
|
342 |
-
|
343 |
-
if not (0 < num_completion_tokens_from_usage <= len(all_final_token_strings)):
|
344 |
-
print(f"WARNING_TOKEN_SPLIT: num_completion_tokens_from_usage ({num_completion_tokens_from_usage}) is invalid for total client-tokenized tokens ({len(all_final_token_strings)}). Returning full content as 'content'.")
|
345 |
-
return "", "".join(all_final_token_strings), all_final_token_strings
|
346 |
-
|
347 |
-
completion_part_tokens = all_final_token_strings[-num_completion_tokens_from_usage:]
|
348 |
-
reasoning_part_tokens = all_final_token_strings[:-num_completion_tokens_from_usage]
|
349 |
-
|
350 |
-
reasoning_output_str = "".join(reasoning_part_tokens)
|
351 |
-
completion_output_str = "".join(completion_part_tokens)
|
352 |
-
|
353 |
-
return reasoning_output_str, completion_output_str, all_final_token_strings
|
354 |
-
|
355 |
model_id_for_tokenizer = base_model_name
|
356 |
|
357 |
reasoning_text, actual_content, dbg_all_tokens = await asyncio.to_thread(
|
358 |
-
|
359 |
-
rotated_credentials,
|
360 |
-
|
|
|
|
|
|
|
|
|
361 |
)
|
362 |
|
363 |
-
message_dict['content'] = actual_content
|
364 |
if reasoning_text: # Only add reasoning_content if it's not empty
|
365 |
message_dict['reasoning_content'] = reasoning_text
|
366 |
print(f"DEBUG_REASONING_SPLIT_DIRECT_JOIN: Successful. Reasoning len: {len(reasoning_text)}. Content len: {len(actual_content)}")
|
|
|
22 |
from message_processing import (
|
23 |
create_gemini_prompt,
|
24 |
create_encrypted_gemini_prompt,
|
25 |
+
create_encrypted_full_gemini_prompt,
|
26 |
+
split_text_by_completion_tokens # Added
|
27 |
)
|
28 |
from api_helpers import (
|
29 |
create_generation_config,
|
30 |
create_openai_error_response,
|
31 |
+
execute_gemini_call,
|
32 |
+
openai_fake_stream_generator # Added
|
33 |
)
|
34 |
|
35 |
router = APIRouter()
|
|
|
224 |
}
|
225 |
|
226 |
if request.stream:
|
227 |
+
if app_config.FAKE_STREAMING_ENABLED:
|
228 |
+
print(f"INFO: OpenAI Fake Streaming (SSE Simulation) ENABLED for model '{request.model}'.")
|
229 |
+
# openai_params already has "stream": True from initial setup,
|
230 |
+
# but openai_fake_stream_generator will make a stream=False call internally.
|
231 |
+
# Call the now async generator
|
232 |
+
return StreamingResponse(
|
233 |
+
await openai_fake_stream_generator( # Added await
|
234 |
+
openai_client=openai_client,
|
235 |
+
openai_params=openai_params,
|
236 |
+
openai_extra_body=openai_extra_body,
|
237 |
+
request_obj=request,
|
238 |
+
is_auto_attempt=False,
|
239 |
+
# --- New parameters for tokenizer and reasoning split ---
|
240 |
+
gcp_credentials=rotated_credentials,
|
241 |
+
gcp_project_id=PROJECT_ID, # This is rotated_project_id
|
242 |
+
gcp_location=LOCATION, # This is "global"
|
243 |
+
base_model_id_for_tokenizer=base_model_name # Stripped model ID for tokenizer
|
244 |
+
),
|
245 |
+
media_type="text/event-stream"
|
246 |
+
)
|
247 |
+
else: # Regular OpenAI streaming
|
248 |
+
print(f"INFO: OpenAI True Streaming ENABLED for model '{request.model}'.")
|
249 |
+
async def openai_true_stream_generator(): # Renamed to avoid conflict
|
250 |
+
try:
|
251 |
+
# Ensure stream=True is explicitly passed for real streaming
|
252 |
+
openai_params_for_true_stream = {**openai_params, "stream": True}
|
253 |
+
stream_response = await openai_client.chat.completions.create(
|
254 |
+
**openai_params_for_true_stream,
|
255 |
+
extra_body=openai_extra_body
|
256 |
+
)
|
257 |
+
async for chunk in stream_response:
|
258 |
+
try:
|
259 |
+
chunk_as_dict = chunk.model_dump(exclude_unset=True, exclude_none=True)
|
260 |
+
|
261 |
+
choices = chunk_as_dict.get('choices')
|
262 |
+
if choices and isinstance(choices, list) and len(choices) > 0:
|
263 |
+
delta = choices[0].get('delta')
|
264 |
+
if delta and isinstance(delta, dict):
|
265 |
+
extra_content = delta.get('extra_content')
|
266 |
+
if isinstance(extra_content, dict):
|
267 |
+
google_content = extra_content.get('google')
|
268 |
+
if isinstance(google_content, dict) and google_content.get('thought') is True:
|
269 |
+
reasoning_text = delta.get('content')
|
270 |
+
if reasoning_text is not None:
|
271 |
+
delta['reasoning_content'] = reasoning_text
|
272 |
+
if 'content' in delta: del delta['content']
|
273 |
+
if 'extra_content' in delta: del delta['extra_content']
|
274 |
+
|
275 |
+
# print(f"DEBUG OpenAI Stream Chunk: {chunk_as_dict}") # Potential verbose log
|
276 |
+
yield f"data: {json.dumps(chunk_as_dict)}\n\n"
|
277 |
+
|
278 |
+
except Exception as chunk_processing_error:
|
279 |
+
error_msg_chunk = f"Error processing/serializing OpenAI chunk for {request.model}: {str(chunk_processing_error)}. Chunk: {str(chunk)[:200]}"
|
280 |
+
print(f"ERROR: {error_msg_chunk}")
|
281 |
+
if len(error_msg_chunk) > 1024: error_msg_chunk = error_msg_chunk[:1024] + "..."
|
282 |
+
error_response_chunk = create_openai_error_response(500, error_msg_chunk, "server_error")
|
283 |
+
json_payload_for_chunk_error = json.dumps(error_response_chunk)
|
284 |
+
yield f"data: {json_payload_for_chunk_error}\n\n"
|
285 |
+
yield "data: [DONE]\n\n"
|
286 |
+
return
|
287 |
+
yield "data: [DONE]\n\n"
|
288 |
+
except Exception as stream_error:
|
289 |
+
original_error_message = str(stream_error)
|
290 |
+
if len(original_error_message) > 1024: original_error_message = original_error_message[:1024] + "..."
|
291 |
+
error_msg_stream = f"Error during OpenAI client true streaming for {request.model}: {original_error_message}"
|
292 |
+
print(f"ERROR: {error_msg_stream}")
|
293 |
+
error_response_content = create_openai_error_response(500, error_msg_stream, "server_error")
|
294 |
+
json_payload_for_stream_error = json.dumps(error_response_content)
|
295 |
+
yield f"data: {json_payload_for_stream_error}\n\n"
|
296 |
+
yield "data: [DONE]\n\n"
|
297 |
+
return StreamingResponse(openai_true_stream_generator(), media_type="text/event-stream")
|
298 |
+
else: # Not streaming (is_openai_direct_model and not request.stream)
|
299 |
try:
|
300 |
+
# Ensure stream=False is explicitly passed for non-streaming
|
301 |
+
openai_params_for_non_stream = {**openai_params, "stream": False}
|
302 |
response = await openai_client.chat.completions.create(
|
303 |
+
**openai_params_for_non_stream,
|
305 |
extra_body=openai_extra_body
|
306 |
)
|
|
|
325 |
if isinstance(vertex_completion_tokens, int) and vertex_completion_tokens > 0:
|
326 |
full_content = message_dict.get('content')
|
327 |
if isinstance(full_content, str) and full_content:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
model_id_for_tokenizer = base_model_name
|
329 |
|
330 |
reasoning_text, actual_content, dbg_all_tokens = await asyncio.to_thread(
|
331 |
+
split_text_by_completion_tokens, # Use imported function
|
332 |
+
rotated_credentials,
|
333 |
+
PROJECT_ID,
|
334 |
+
LOCATION,
|
335 |
+
model_id_for_tokenizer,
|
336 |
+
full_content,
|
337 |
+
vertex_completion_tokens
|
338 |
)
|
339 |
|
340 |
+
message_dict['content'] = actual_content
|
341 |
if reasoning_text: # Only add reasoning_content if it's not empty
|
342 |
message_dict['reasoning_content'] = reasoning_text
|
343 |
print(f"DEBUG_REASONING_SPLIT_DIRECT_JOIN: Successful. Reasoning len: {len(reasoning_text)}. Content len: {len(actual_content)}")
|