bibibi12345 committed on
Commit
bee94f0
·
1 Parent(s): fb7432b

testing openai fake streaming and reasoning

Browse files
Files changed (3) hide show
  1. app/api_helpers.py +277 -154
  2. app/message_processing.py +95 -3
  3. app/routes/chat_api.py +87 -110
app/api_helpers.py CHANGED
@@ -2,17 +2,25 @@ import json
2
  import time
3
  import math
4
  import asyncio
5
- from typing import List, Dict, Any, Callable, Union
6
- from fastapi.responses import JSONResponse, StreamingResponse
7
 
 
8
  from google.auth.transport.requests import Request as AuthRequest
9
- from google.genai import types
10
- from google import genai # Needed if _execute_gemini_call uses genai.Client directly
 
 
11
 
12
- # Local module imports
13
- from models import OpenAIRequest, OpenAIMessage # Changed from relative
14
- from message_processing import deobfuscate_text, convert_to_openai_format, convert_chunk_to_openai, create_final_chunk # Changed from relative
15
- import config as app_config # Changed from relative
 
 
 
 
 
16
 
17
  def create_openai_error_response(status_code: int, message: str, error_type: str) -> Dict[str, Any]:
18
  return {
@@ -44,171 +52,286 @@ def create_generation_config(request: OpenAIRequest) -> Dict[str, Any]:
44
  ]
45
  return config
46
 
47
- def is_response_valid(response):
48
- if response is None:
49
- print("DEBUG: Response is None, therefore invalid.")
50
- return False
51
-
52
- # Check for direct text attribute
53
- if hasattr(response, 'text') and isinstance(response.text, str) and response.text.strip():
54
- # print("DEBUG: Response valid due to response.text")
55
- return True
56
-
57
- # Check candidates for text content
58
  if hasattr(response, 'candidates') and response.candidates:
59
- for candidate in response.candidates: # Iterate through all candidates
60
- if hasattr(candidate, 'text') and isinstance(candidate.text, str) and candidate.text.strip():
61
- # print(f"DEBUG: Response valid due to candidate.text in candidate")
62
- return True
63
  if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
64
  for part in candidate.content.parts:
65
- if hasattr(part, 'text') and isinstance(part.text, str) and part.text.strip():
66
- # print(f"DEBUG: Response valid due to part.text in candidate's content part")
67
- return True
68
-
69
- # Removed prompt_feedback as a sole criterion for validity.
70
- # It should only be valid if actual text content is found.
71
- # Block reasons will be checked explicitly by callers if they need to treat it as an error for retries.
72
- print("DEBUG: Response is invalid, no usable text content found by is_response_valid.")
73
  return False
74
 
75
- async def fake_stream_generator(client_instance, model_name: str, prompt: Union[types.Content, List[types.Content]], current_gen_config: Dict[str, Any], request_obj: OpenAIRequest, is_auto_attempt: bool):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  response_id = f"chatcmpl-{int(time.time())}"
77
- async def fake_stream_inner():
78
- print(f"FAKE STREAMING: Making non-streaming request to Gemini API (Model: {model_name})")
79
- api_call_task = asyncio.create_task(
80
- client_instance.aio.models.generate_content(
81
- model=model_name, contents=prompt, config=current_gen_config
 
 
 
 
 
 
82
  )
83
  )
84
- while not api_call_task.done():
85
- keep_alive_data = {
86
- "id": "chatcmpl-keepalive", "object": "chat.completion.chunk", "created": int(time.time()),
87
- "model": request_obj.model, "choices": [{"delta": {"content": ""}, "index": 0, "finish_reason": None}]
88
- }
89
- yield f"data: {json.dumps(keep_alive_data)}\n\n"
90
- await asyncio.sleep(app_config.FAKE_STREAMING_INTERVAL_SECONDS)
91
- try:
92
- response = api_call_task.result()
93
-
94
- # Check for safety blocks first, as this should trigger a retry in auto-mode
95
- if hasattr(response, 'prompt_feedback') and \
96
- hasattr(response.prompt_feedback, 'block_reason') and \
97
- response.prompt_feedback.block_reason:
98
- block_message = f"Response blocked by safety filter: {response.prompt_feedback.block_reason}"
99
- if hasattr(response.prompt_feedback, 'block_reason_message') and response.prompt_feedback.block_reason_message:
100
- block_message = f"Response blocked by safety filter: {response.prompt_feedback.block_reason_message} (Reason: {response.prompt_feedback.block_reason})"
101
- print(f"DEBUG: {block_message} (in fake_stream_generator)") # Log this specific condition
102
- raise ValueError(block_message) # This will be caught by the except Exception as e below it
103
-
104
- if not is_response_valid(response): # is_response_valid now only checks for actual text
105
- raise ValueError(f"Invalid/empty response in fake stream (no text content): {str(response)[:200]}")
106
-
107
- full_text = ""
108
- if hasattr(response, 'text'):
109
- full_text = response.text or "" # Coalesce None to empty string
110
- elif hasattr(response, 'candidates') and response.candidates:
111
- # Typically, we focus on the first candidate for non-streaming synthesis
112
- candidate = response.candidates[0]
113
- if hasattr(candidate, 'text'):
114
- full_text = candidate.text or "" # Coalesce None to empty string
115
- elif hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
116
- # Ensure parts are iterated and text is joined correctly even if some parts have no text or part.text is None
117
- texts = []
118
- for part in candidate.content.parts:
119
- if hasattr(part, 'text') and part.text is not None: # Check part.text exists and is not None
120
- texts.append(part.text)
121
- full_text = "".join(texts)
122
- if request_obj.model.endswith("-encrypt-full"):
123
- full_text = deobfuscate_text(full_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- chunk_size = max(20, math.ceil(len(full_text) / 10))
126
- for i in range(0, len(full_text), chunk_size):
127
- chunk_text = full_text[i:i+chunk_size]
128
- delta_data = {
129
- "id": response_id, "object": "chat.completion.chunk", "created": int(time.time()),
130
- "model": request_obj.model, "choices": [{"index": 0, "delta": {"content": chunk_text}, "finish_reason": None}]
131
- }
132
- yield f"data: {json.dumps(delta_data)}\n\n"
133
- await asyncio.sleep(0.05)
134
- yield create_final_chunk(request_obj.model, response_id)
135
  yield "data: [DONE]\n\n"
136
- except Exception as e:
137
- err_msg = f"Error in fake_stream_generator: {str(e)}"
138
- print(err_msg)
139
- err_resp = create_openai_error_response(500, err_msg, "server_error")
140
- # It's good practice to log the JSON payload here too for consistency,
141
- # though the main concern was the true streaming path.
142
- json_payload_for_fake_stream_error = json.dumps(err_resp)
143
- # Log the error JSON that WOULD have been sent if not in auto-mode or if this was the final error handler.
144
- print(f"DEBUG: Internal error in fake_stream_generator. JSON error for handler: {json_payload_for_fake_stream_error}")
145
- if not is_auto_attempt:
146
- yield f"data: {json_payload_for_fake_stream_error}\n\n"
147
- yield "data: [DONE]\n\n"
148
- raise e # Re-raise the original exception e
149
- return fake_stream_inner()
150
 
151
  async def execute_gemini_call(
152
- current_client: Any, # Should be genai.Client or similar AsyncClient
153
- model_to_call: str,
154
  prompt_func: Callable[[List[OpenAIMessage]], Union[types.Content, List[types.Content]]],
155
- gen_config_for_call: Dict[str, Any],
156
- request_obj: OpenAIRequest, # Pass the whole request object
157
  is_auto_attempt: bool = False
158
  ):
159
  actual_prompt_for_call = prompt_func(request_obj.messages)
160
-
 
 
 
 
 
 
 
 
 
161
  if request_obj.stream:
162
  if app_config.FAKE_STREAMING_ENABLED:
163
- return StreamingResponse(
164
- await fake_stream_generator(current_client, model_to_call, actual_prompt_for_call, gen_config_for_call, request_obj, is_auto_attempt=is_auto_attempt),
165
- media_type="text/event-stream"
166
- )
167
-
168
- response_id_for_stream = f"chatcmpl-{int(time.time())}"
169
- cand_count_stream = request_obj.n or 1
170
-
171
- async def _stream_generator_inner_for_execute(): # Renamed to avoid potential clashes
172
  try:
173
- for c_idx_call in range(cand_count_stream):
174
- async for chunk_item_call in await current_client.aio.models.generate_content_stream(
175
- model=model_to_call, contents=actual_prompt_for_call, config=gen_config_for_call
176
- ):
177
- yield convert_chunk_to_openai(chunk_item_call, request_obj.model, response_id_for_stream, c_idx_call)
178
  yield create_final_chunk(request_obj.model, response_id_for_stream, cand_count_stream)
179
  yield "data: [DONE]\n\n"
180
- except Exception as e_stream_call:
181
- print(f"Streaming Error in _execute_gemini_call: {e_stream_call}")
182
-
183
- error_message_str = str(e_stream_call)
184
- # Truncate very long error messages to prevent excessively large JSON payloads.
185
- if len(error_message_str) > 1024: # Max length for the error string
186
- error_message_str = error_message_str[:1024] + "..."
187
-
188
- err_resp_content_call = create_openai_error_response(500, error_message_str, "server_error")
189
- json_payload_for_error = json.dumps(err_resp_content_call)
190
- # Log the error JSON that WOULD have been sent if not in auto-mode or if this was the final error handler.
191
- print(f"DEBUG: Internal error in _stream_generator_inner_for_execute. JSON error for handler: {json_payload_for_error}")
192
- if not is_auto_attempt: # is_auto_attempt is from execute_gemini_call's scope
193
- yield f"data: {json_payload_for_error}\n\n"
194
- yield "data: [DONE]\n\n"
195
- raise e_stream_call # Re-raise the original exception
196
- return StreamingResponse(_stream_generator_inner_for_execute(), media_type="text/event-stream")
197
- else:
198
- response_obj_call = await current_client.aio.models.generate_content(
199
- model=model_to_call, contents=actual_prompt_for_call, config=gen_config_for_call
200
- )
201
-
202
- # Check for safety blocks first for non-streaming calls
203
- if hasattr(response_obj_call, 'prompt_feedback') and \
204
- hasattr(response_obj_call.prompt_feedback, 'block_reason') and \
205
- response_obj_call.prompt_feedback.block_reason:
206
- block_message = f"Response blocked by safety filter: {response_obj_call.prompt_feedback.block_reason}"
207
- if hasattr(response_obj_call.prompt_feedback, 'block_reason_message') and response_obj_call.prompt_feedback.block_reason_message:
208
- block_message = f"Response blocked by safety filter: {response_obj_call.prompt_feedback.block_reason_message} (Reason: {response_obj_call.prompt_feedback.block_reason})"
209
- print(f"DEBUG: {block_message} (in execute_gemini_call non-streaming)") # Log this specific condition
210
- raise ValueError(block_message)
211
-
212
- if not is_response_valid(response_obj_call): # is_response_valid now only checks for actual text
213
- raise ValueError("Invalid/empty response from non-streaming Gemini call (no text content).")
214
  return JSONResponse(content=convert_to_openai_format(response_obj_call, request_obj.model))
 
2
  import time
3
  import math
4
  import asyncio
5
+ import base64 # Added for tokenizer logic
6
+ from typing import List, Dict, Any, Callable, Union, Optional
7
 
8
+ from fastapi.responses import JSONResponse, StreamingResponse
9
  from google.auth.transport.requests import Request as AuthRequest
10
+ from google.genai import types
11
+ from google.genai.types import HttpOptions # Added for tokenizer logic
12
+ from google import genai
13
+ from openai import AsyncOpenAI
14
 
15
+ from models import OpenAIRequest, OpenAIMessage
16
+ from message_processing import (
17
+ deobfuscate_text,
18
+ convert_to_openai_format,
19
+ convert_chunk_to_openai,
20
+ create_final_chunk,
21
+ split_text_by_completion_tokens # Added
22
+ )
23
+ import config as app_config
24
 
25
  def create_openai_error_response(status_code: int, message: str, error_type: str) -> Dict[str, Any]:
26
  return {
 
52
  ]
53
  return config
54
 
55
+ def is_gemini_response_valid(response: Any) -> bool:
56
+ if response is None: return False
57
+ if hasattr(response, 'text') and isinstance(response.text, str) and response.text.strip(): return True
 
 
 
 
 
 
 
 
58
  if hasattr(response, 'candidates') and response.candidates:
59
+ for candidate in response.candidates:
60
+ if hasattr(candidate, 'text') and isinstance(candidate.text, str) and candidate.text.strip(): return True
 
 
61
  if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
62
  for part in candidate.content.parts:
63
+ if hasattr(part, 'text') and isinstance(part.text, str) and part.text.strip(): return True
 
 
 
 
 
 
 
64
  return False
65
 
66
+ async def _base_fake_stream_engine(
67
+ api_call_task_creator: Callable[[], asyncio.Task],
68
+ extract_text_from_response_func: Callable[[Any], str], # To get the *full* text before splitting
69
+ response_id: str,
70
+ sse_model_name: str,
71
+ is_auto_attempt: bool,
72
+ is_valid_response_func: Callable[[Any], bool],
73
+ process_text_func: Optional[Callable[[str, str], str]] = None,
74
+ check_block_reason_func: Optional[Callable[[Any], None]] = None,
75
+ # New parameters for pre-split content
76
+ reasoning_text_to_yield: Optional[str] = None,
77
+ actual_content_text_to_yield: Optional[str] = None
78
+ ):
79
+ api_call_task = api_call_task_creator()
80
+
81
+ while not api_call_task.done():
82
+ keep_alive_data = {"id": "chatcmpl-keepalive", "object": "chat.completion.chunk", "created": int(time.time()), "model": sse_model_name, "choices": [{"delta": {"reasoning_content": ""}, "index": 0, "finish_reason": None}]}
83
+ yield f"data: {json.dumps(keep_alive_data)}\n\n"
84
+ await asyncio.sleep(app_config.FAKE_STREAMING_INTERVAL_SECONDS)
85
+
86
+ try:
87
+ full_api_response = await api_call_task
88
+
89
+ if check_block_reason_func:
90
+ check_block_reason_func(full_api_response)
91
+
92
+ if not is_valid_response_func(full_api_response):
93
+ raise ValueError(f"Invalid/empty response in fake stream for model {sse_model_name} (validation failed): {str(full_api_response)[:200]}")
94
+
95
+ # Determine content to chunk
96
+ content_to_chunk = ""
97
+ if actual_content_text_to_yield is not None:
98
+ content_to_chunk = actual_content_text_to_yield
99
+ if process_text_func: # Process only the actual content part if pre-split
100
+ content_to_chunk = process_text_func(content_to_chunk, sse_model_name)
101
+ else: # Fallback to old method if no pre-split content provided
102
+ content_to_chunk = extract_text_from_response_func(full_api_response)
103
+ if process_text_func:
104
+ content_to_chunk = process_text_func(content_to_chunk, sse_model_name)
105
+
106
+ # Yield reasoning chunk first if available
107
+ if reasoning_text_to_yield:
108
+ reasoning_delta_data = {
109
+ "id": response_id, "object": "chat.completion.chunk", "created": int(time.time()),
110
+ "model": sse_model_name, "choices": [{"index": 0, "delta": {"reasoning_content": reasoning_text_to_yield}, "finish_reason": None}]
111
+ }
112
+ yield f"data: {json.dumps(reasoning_delta_data)}\n\n"
113
+ await asyncio.sleep(0.05) # Small delay after reasoning
114
+
115
+ # Chunk and yield the main content
116
+ chunk_size = max(20, math.ceil(len(content_to_chunk) / 10)) if content_to_chunk else 0
117
+
118
+ if not content_to_chunk and content_to_chunk != "":
119
+ empty_delta_data = {"id": response_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": sse_model_name, "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": None}]}
120
+ yield f"data: {json.dumps(empty_delta_data)}\n\n"
121
+ else:
122
+ for i in range(0, len(content_to_chunk), chunk_size):
123
+ chunk_text = content_to_chunk[i:i+chunk_size]
124
+ content_delta_data = {"id": response_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": sse_model_name, "choices": [{"index": 0, "delta": {"content": chunk_text}, "finish_reason": None}]}
125
+ yield f"data: {json.dumps(content_delta_data)}\n\n"
126
+ if len(content_to_chunk) > chunk_size: await asyncio.sleep(0.05)
127
+
128
+ yield create_final_chunk(sse_model_name, response_id)
129
+ yield "data: [DONE]\n\n"
130
+
131
+ except Exception as e:
132
+ err_msg_detail = f"Error in _base_fake_stream_engine (model: '{sse_model_name}'): {type(e).__name__} - {str(e)}"
133
+ print(f"ERROR: {err_msg_detail}")
134
+ sse_err_msg_display = str(e)
135
+ if len(sse_err_msg_display) > 512: sse_err_msg_display = sse_err_msg_display[:512] + "..."
136
+ err_resp_for_sse = create_openai_error_response(500, sse_err_msg_display, "server_error")
137
+ json_payload_for_fake_stream_error = json.dumps(err_resp_for_sse)
138
+ if not is_auto_attempt:
139
+ yield f"data: {json_payload_for_fake_stream_error}\n\n"
140
+ yield "data: [DONE]\n\n"
141
+ raise
142
+
143
+ def gemini_fake_stream_generator(
144
+ gemini_model_instance: genai.GenerativeModel,
145
+ prompt_for_api_call: Union[types.Content, List[types.Content]],
146
+ gen_config_for_api_call: Dict[str, Any],
147
+ request_obj: OpenAIRequest,
148
+ is_auto_attempt: bool
149
+ ):
150
+ print(f"FAKE STREAMING (Gemini): Prep for '{request_obj.model}' (API model: '{gemini_model_instance.model_name}')")
151
+ def _create_gemini_api_task() -> asyncio.Task:
152
+ return asyncio.create_task(gemini_model_instance.generate_content_async(contents=prompt_for_api_call, generation_config=gen_config_for_api_call))
153
+ def _extract_gemini_text(response: Any) -> str:
154
+ # ... (extraction logic as before) ...
155
+ full_text = ""
156
+ if hasattr(response, 'text') and response.text is not None: full_text = response.text
157
+ elif hasattr(response, 'candidates') and response.candidates:
158
+ candidate = response.candidates[0]
159
+ if hasattr(candidate, 'text') and candidate.text is not None: full_text = candidate.text
160
+ elif hasattr(candidate, 'content') and hasattr(candidate.content, 'parts') and candidate.content.parts:
161
+ texts = [part.text for part in candidate.content.parts if hasattr(part, 'text') and part.text is not None]
162
+ full_text = "".join(texts)
163
+ return full_text
164
+ def _process_gemini_text(text: str, sse_model_name: str) -> str:
165
+ if sse_model_name.endswith("-encrypt-full"): return deobfuscate_text(text)
166
+ return text
167
+ def _check_gemini_block(response: Any):
168
+ if hasattr(response, 'prompt_feedback') and hasattr(response.prompt_feedback, 'block_reason') and response.prompt_feedback.block_reason:
169
+ block_message = f"Response blocked by Gemini safety filter: {response.prompt_feedback.block_reason}"
170
+ if hasattr(response.prompt_feedback, 'block_reason_message') and response.prompt_feedback.block_reason_message: block_message += f" (Message: {response.prompt_feedback.block_reason_message})"
171
+ raise ValueError(block_message)
172
+ response_id = f"chatcmpl-{int(time.time())}"
173
+ return _base_fake_stream_engine(
174
+ api_call_task_creator=_create_gemini_api_task,
175
+ extract_text_from_response_func=_extract_gemini_text,
176
+ process_text_func=_process_gemini_text,
177
+ check_block_reason_func=_check_gemini_block,
178
+ is_valid_response_func=is_gemini_response_valid,
179
+ response_id=response_id, sse_model_name=request_obj.model,
180
+ keep_alive_interval_seconds=app_config.FAKE_STREAMING_INTERVAL_SECONDS,
181
+ is_auto_attempt=is_auto_attempt
182
+ # reasoning_text_to_yield and actual_content_text_to_yield are not used for Gemini
183
+ )
184
+
185
+ async def openai_fake_stream_generator( # Changed to async to await the tokenizer
186
+ openai_client: AsyncOpenAI,
187
+ openai_params: Dict[str, Any],
188
+ openai_extra_body: Dict[str, Any],
189
+ request_obj: OpenAIRequest,
190
+ is_auto_attempt: bool,
191
+ # New params for tokenizer
192
+ gcp_credentials: Any,
193
+ gcp_project_id: str,
194
+ gcp_location: str,
195
+ base_model_id_for_tokenizer: str
196
+ ):
197
+ api_model_name = openai_params.get("model", "unknown-openai-model")
198
+ print(f"FAKE STREAMING (OpenAI): Prep for '{request_obj.model}' (API model: '{api_model_name}') with reasoning split.")
199
+
200
  response_id = f"chatcmpl-{int(time.time())}"
201
+
202
+ # This task creator now involves the full API call and subsequent token splitting.
203
+ # The _base_fake_stream_engine will then use the pre-split text.
204
+ async def _openai_api_call_and_split_task_creator_wrapper():
205
+ # This inner async function will be what the asyncio.Task runs.
206
+ # It first makes the API call, then does the sync tokenization in a thread.
207
+
208
+ # 1. Make the non-streaming API call
209
+ _api_call_task = asyncio.create_task(
210
+ openai_client.chat.completions.create(
211
+ **openai_params, extra_body=openai_extra_body, stream=False
212
  )
213
  )
214
+ raw_response = await _api_call_task # This is the openai.types.chat.ChatCompletion object
215
+
216
+ # 2. Extract full content and usage for splitting
217
+ full_content_from_api = ""
218
+ if raw_response.choices and raw_response.choices[0].message and raw_response.choices[0].message.content is not None:
219
+ full_content_from_api = raw_response.choices[0].message.content
220
+
221
+ vertex_completion_tokens = 0
222
+ if raw_response.usage and raw_response.usage.completion_tokens is not None:
223
+ vertex_completion_tokens = raw_response.usage.completion_tokens
224
+
225
+ reasoning_text = ""
226
+ actual_content_text = full_content_from_api # Default if split fails or not applicable
227
+
228
+ if full_content_from_api and vertex_completion_tokens > 0:
229
+ # 3. Perform synchronous tokenization and splitting in a separate thread
230
+ reasoning_text, actual_content_text, _ = await asyncio.to_thread(
231
+ split_text_by_completion_tokens, # Use imported function
232
+ gcp_credentials, gcp_project_id, gcp_location,
233
+ base_model_id_for_tokenizer, # The base model for the tokenizer
234
+ full_content_from_api,
235
+ vertex_completion_tokens
236
+ )
237
+ if reasoning_text:
238
+ print(f"DEBUG_FAKE_REASONING_SPLIT: Success. Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content_text)}")
239
+
240
+ # We pass the raw_response and the split text to the base engine.
241
+ # The base engine still needs the raw_response for initial validation,
242
+ # but will use the pre-split text for yielding chunks.
243
+ return raw_response, reasoning_text, actual_content_text
244
+
245
+ # The main generator logic starts here:
246
+ # Initial keep-alive loop
247
+ temp_task_for_keepalive_check = asyncio.create_task(_openai_api_call_and_split_task_creator_wrapper())
248
+ while not temp_task_for_keepalive_check.done():
249
+ keep_alive_data = {"id": "chatcmpl-keepalive", "object": "chat.completion.chunk", "created": int(time.time()), "model": request_obj.model, "choices": [{"delta": {"content": ""}, "index": 0, "finish_reason": None}]}
250
+ yield f"data: {json.dumps(keep_alive_data)}\n\n"
251
+ await asyncio.sleep(app_config.FAKE_STREAMING_INTERVAL_SECONDS)
252
+
253
+ try:
254
+ # Get the results from our wrapper task
255
+ full_api_response, separated_reasoning_text, separated_actual_content_text = await temp_task_for_keepalive_check
256
+
257
+ # Define OpenAI specific helpers for _base_fake_stream_engine
258
+ def _extract_openai_full_text(response: Any) -> str: # Still needed for initial validation if used
259
+ if response.choices and response.choices[0].message and response.choices[0].message.content is not None:
260
+ return response.choices[0].message.content
261
+ return ""
262
+ def _is_openai_response_valid(response: Any) -> bool:
263
+ return bool(response.choices and response.choices[0].message is not None)
264
+
265
+ # Now, iterate through the base engine using the results
266
+ async for chunk in _base_fake_stream_engine(
267
+ api_call_task_creator=lambda: asyncio.create_task(asyncio.sleep(0, result=full_api_response)), # Dummy task, result already known
268
+ extract_text_from_response_func=_extract_openai_full_text, # For potential use by is_valid_response_func
269
+ is_valid_response_func=_is_openai_response_valid,
270
+ response_id=response_id,
271
+ sse_model_name=request_obj.model,
272
+ keep_alive_interval_seconds=0, # Keep-alive handled above for the combined op
273
+ is_auto_attempt=is_auto_attempt,
274
+ reasoning_text_to_yield=separated_reasoning_text,
275
+ actual_content_text_to_yield=separated_actual_content_text
276
+ ):
277
+ yield chunk
278
 
279
+ except Exception as e_outer: # Catch errors from the _openai_api_call_and_split_task_creator_wrapper or subsequent base engine
280
+ err_msg_detail = f"Error in openai_fake_stream_generator outer (model: '{request_obj.model}'): {type(e_outer).__name__} - {str(e_outer)}"
281
+ print(f"ERROR: {err_msg_detail}")
282
+ sse_err_msg_display = str(e_outer)
283
+ if len(sse_err_msg_display) > 512: sse_err_msg_display = sse_err_msg_display[:512] + "..."
284
+ err_resp_sse = create_openai_error_response(500, sse_err_msg_display, "server_error")
285
+ json_payload_error = json.dumps(err_resp_sse)
286
+ if not is_auto_attempt:
287
+ yield f"data: {json_payload_error}\n\n"
 
288
  yield "data: [DONE]\n\n"
289
+ # No re-raise here as we've handled sending the error via SSE.
290
+ # If auto-mode needs to retry, the exception from the inner task would have been raised before this point.
291
+
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  async def execute_gemini_call(
294
+ current_client: Any, model_to_call: str,
 
295
  prompt_func: Callable[[List[OpenAIMessage]], Union[types.Content, List[types.Content]]],
296
+ gen_config_for_call: Dict[str, Any], request_obj: OpenAIRequest,
 
297
  is_auto_attempt: bool = False
298
  ):
299
  actual_prompt_for_call = prompt_func(request_obj.messages)
300
+ gemini_model_instance: Optional[genai.GenerativeModel] = None
301
+ if hasattr(current_client, 'get_model') and callable(getattr(current_client, 'get_model')):
302
+ try: gemini_model_instance = current_client.get_model(model_name=model_to_call)
303
+ except Exception as e: raise ValueError(f"Could not get Gemini model '{model_to_call}' Express: {e}") from e
304
+ elif isinstance(current_client, genai.GenerativeModel):
305
+ if model_to_call not in current_client.model_name: print(f"WARNING: Mismatch! model_to_call='{model_to_call}', client.model_name='{current_client.model_name}'")
306
+ gemini_model_instance = current_client
307
+ else: raise ValueError(f"Unsupported current_client for Gemini: {type(current_client)}")
308
+ if not gemini_model_instance: raise ValueError(f"Failed to get GeminiModel for '{model_to_call}'.")
309
+
310
  if request_obj.stream:
311
  if app_config.FAKE_STREAMING_ENABLED:
312
+ return StreamingResponse(gemini_fake_stream_generator(gemini_model_instance, actual_prompt_for_call, gen_config_for_call, request_obj, is_auto_attempt), media_type="text/event-stream")
313
+ response_id_for_stream, cand_count_stream = f"chatcmpl-{int(time.time())}", request_obj.n or 1
314
+ async def _gemini_real_stream_generator_inner():
 
 
 
 
 
 
315
  try:
316
+ async for chunk_item_call in gemini_model_instance.generate_content_async(contents=actual_prompt_for_call, generation_config=gen_config_for_call, stream=True):
317
+ yield convert_chunk_to_openai(chunk_item_call, request_obj.model, response_id_for_stream, 0)
 
 
 
318
  yield create_final_chunk(request_obj.model, response_id_for_stream, cand_count_stream)
319
  yield "data: [DONE]\n\n"
320
+ except Exception as e:
321
+ # ... (error handling as before) ...
322
+ err_msg_detail_stream = f"Streaming Error (Gemini model: '{gemini_model_instance.model_name}'): {type(e).__name__} - {str(e)}"
323
+ print(f"ERROR: {err_msg_detail_stream}")
324
+ s_err = str(e); s_err = s_err[:1024]+"..." if len(s_err)>1024 else s_err
325
+ err_resp = create_openai_error_response(500,s_err,"server_error")
326
+ j_err = json.dumps(err_resp)
327
+ if not is_auto_attempt: yield f"data: {j_err}\n\n"; yield "data: [DONE]\n\n"
328
+ raise e
329
+ return StreamingResponse(_gemini_real_stream_generator_inner(), media_type="text/event-stream")
330
+ else:
331
+ response_obj_call = await gemini_model_instance.generate_content_async(contents=actual_prompt_for_call, generation_config=gen_config_for_call)
332
+ if hasattr(response_obj_call, 'prompt_feedback') and hasattr(response_obj_call.prompt_feedback, 'block_reason') and response_obj_call.prompt_feedback.block_reason:
333
+ block_msg = f"Blocked (Gemini): {response_obj_call.prompt_feedback.block_reason}"
334
+ if hasattr(response_obj_call.prompt_feedback,'block_reason_message') and response_obj_call.prompt_feedback.block_reason_message: block_msg+=f" ({response_obj_call.prompt_feedback.block_reason_message})"
335
+ raise ValueError(block_msg)
336
+ if not is_gemini_response_valid(response_obj_call): raise ValueError(f"Invalid non-streaming Gemini response for '{gemini_model_instance.model_name}'. Resp: {str(response_obj_call)[:200]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  return JSONResponse(content=convert_to_openai_format(response_obj_call, request_obj.model))
app/message_processing.py CHANGED
@@ -3,10 +3,12 @@ import re
3
  import json
4
  import time
5
  import urllib.parse
6
- from typing import List, Dict, Any, Union, Literal # Optional removed
7
 
8
  from google.genai import types
9
- from models import OpenAIMessage, ContentPartText, ContentPartImage # Changed from relative
 
 
10
 
11
  # Define supported roles for Gemini API
12
  SUPPORTED_ROLES = ["user", "model"]
@@ -519,4 +521,94 @@ def create_final_chunk(model: str, response_id: str, candidate_count: int = 1) -
519
  "model": model,
520
  "choices": choices
521
  }
522
- return f"data: {json.dumps(final_chunk)}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import json
4
  import time
5
  import urllib.parse
6
+ from typing import List, Dict, Any, Union, Literal
7
 
8
  from google.genai import types
9
+ from google.genai.types import HttpOptions as GenAIHttpOptions # Renamed to avoid conflict if HttpOptions is used elsewhere
10
+ from google import genai as google_genai_client # For instantiating client in tokenizer
11
+ from models import OpenAIMessage, ContentPartText, ContentPartImage
12
 
13
  # Define supported roles for Gemini API
14
  SUPPORTED_ROLES = ["user", "model"]
 
521
  "model": model,
522
  "choices": choices
523
  }
524
+ return f"data: {json.dumps(final_chunk)}\n\n"
525
+
526
+ def split_text_by_completion_tokens(
527
+ gcp_creds: Any,
528
+ gcp_proj_id: str,
529
+ gcp_loc: str,
530
+ model_id_for_tokenizer: str,
531
+ full_text_to_tokenize: str,
532
+ num_completion_tokens_from_usage: int
533
+ ) -> tuple[str, str, List[str]]:
534
+ """
535
+ Splits a given text into reasoning and actual content based on a number of completion tokens.
536
+ Uses Google's tokenizer. This is a synchronous function.
537
+ Args:
538
+ gcp_creds: GCP credentials.
539
+ gcp_proj_id: GCP project ID.
540
+ gcp_loc: GCP location.
541
+ model_id_for_tokenizer: The base model ID (e.g., "gemini-1.5-pro") for the tokenizer.
542
+ full_text_to_tokenize: The full text string from the LLM.
543
+ num_completion_tokens_from_usage: The number of tokens designated as 'completion' by the LLM's usage stats.
544
+ Returns:
545
+ A tuple: (reasoning_text_str, actual_content_text_str, all_decoded_token_strings_list)
546
+ """
547
+ if not full_text_to_tokenize: # Handle empty input early
548
+ return "", "", []
549
+
550
+ try:
551
+ # This client is specifically for tokenization. Uses GenAIHttpOptions for api_version.
552
+ sync_tokenizer_client = google_genai_client.Client(
553
+ vertexai=True, credentials=gcp_creds, project=gcp_proj_id, location=gcp_loc,
554
+ http_options=GenAIHttpOptions(api_version="v1") # v1 is generally for compute_tokens
555
+ )
556
+
557
+ token_compute_response = sync_tokenizer_client.models.compute_tokens(
558
+ model=model_id_for_tokenizer, contents=full_text_to_tokenize
559
+ )
560
+
561
+ all_final_token_strings = []
562
+ if token_compute_response.tokens_info:
563
+ for token_info_item in token_compute_response.tokens_info:
564
+ for api_token_bytes in token_info_item.tokens:
565
+ # Attempt to decode from base64 first, as Vertex sometimes returns b64 encoded tokens.
566
+ # Fallback to direct UTF-8 decoding if b64 fails.
567
+ intermediate_str = ""
568
+ try:
569
+ # Vertex's tokens via compute_tokens for some models are plain UTF-8 strings,
570
+ # but sometimes they might be base64 encoded representations of bytes.
571
+ # The provided code in chat_api.py does a b64decode on a utf-8 string.
572
+ # Let's assume api_token_bytes is indeed bytes that represent a b64 string of the *actual* token bytes.
573
+ # This seems overly complex based on typical SDKs, but following existing pattern.
574
+ # More commonly, api_token_bytes would *be* the token bytes directly.
575
+ # If api_token_bytes is already text:
576
+ if isinstance(api_token_bytes, str):
577
+ intermediate_str = api_token_bytes
578
+ else: # Assuming it's bytes
579
+ intermediate_str = api_token_bytes.decode('utf-8', errors='replace')
580
+
581
+ final_token_text = ""
582
+ # Attempt to decode what we think is a base64 string
583
+ b64_decoded_bytes = base64.b64decode(intermediate_str)
584
+ final_token_text = b64_decoded_bytes.decode('utf-8', errors='replace')
585
+ except Exception:
586
+ # If b64decode fails, assume intermediate_str was the actual token text
587
+ final_token_text = intermediate_str
588
+ all_final_token_strings.append(final_token_text)
589
+
590
+ if not all_final_token_strings: # Should not happen if full_text_to_tokenize was not empty
591
+ # print(f"DEBUG_TOKEN_SPLIT: No tokens found for: '{full_text_to_tokenize[:50]}...'")
592
+ return "", full_text_to_tokenize, []
593
+
594
+ # Validate num_completion_tokens_from_usage
595
+ if not (0 < num_completion_tokens_from_usage <= len(all_final_token_strings)):
596
+ # print(f"WARNING_TOKEN_SPLIT: num_completion_tokens_from_usage ({num_completion_tokens_from_usage}) is invalid or out of bounds for total client-tokenized tokens ({len(all_final_token_strings)}). Full text returned as 'content'.")
597
+ # Return the text as re-joined by our tokenizer, not the original full_text_to_tokenize,
598
+ # as the tokenization process itself might subtly alter it (e.g. space handling, special chars).
599
+ return "", "".join(all_final_token_strings), all_final_token_strings
600
+
601
+ # Split tokens
602
+ completion_part_tokens = all_final_token_strings[-num_completion_tokens_from_usage:]
603
+ reasoning_part_tokens = all_final_token_strings[:-num_completion_tokens_from_usage]
604
+
605
+ reasoning_output_str = "".join(reasoning_part_tokens)
606
+ completion_output_str = "".join(completion_part_tokens)
607
+
608
+ # print(f"DEBUG_TOKEN_SPLIT: Reasoning: '{reasoning_output_str[:50]}...', Content: '{completion_output_str[:50]}...'")
609
+ return reasoning_output_str, completion_output_str, all_final_token_strings
610
+
611
+ except Exception as e_tok:
612
+ print(f"ERROR: Tokenizer failed in split_text_by_completion_tokens: {e_tok}")
613
+ # Fallback: no reasoning, original full text as content, empty token list
614
+ return "", full_text_to_tokenize, []
app/routes/chat_api.py CHANGED
@@ -22,12 +22,14 @@ from model_loader import get_vertex_models, get_vertex_express_models # Import f
22
  from message_processing import (
23
  create_gemini_prompt,
24
  create_encrypted_gemini_prompt,
25
- create_encrypted_full_gemini_prompt
 
26
  )
27
  from api_helpers import (
28
  create_generation_config,
29
  create_openai_error_response,
30
- execute_gemini_call
 
31
  )
32
 
33
  router = APIRouter()
@@ -222,72 +224,83 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
222
  }
223
 
224
  if request.stream:
225
- async def openai_stream_generator():
226
- try:
227
- stream_response = await openai_client.chat.completions.create(
228
- **openai_params,
229
- extra_body=openai_extra_body
230
- )
231
- async for chunk in stream_response:
232
- try:
233
- chunk_as_dict = chunk.model_dump(exclude_unset=True, exclude_none=True)
234
-
235
- # Safely navigate and check for thought flag
236
- choices = chunk_as_dict.get('choices')
237
- if choices and isinstance(choices, list) and len(choices) > 0:
238
- delta = choices[0].get('delta')
239
- if delta and isinstance(delta, dict):
240
- extra_content = delta.get('extra_content')
241
- if isinstance(extra_content, dict):
242
- google_content = extra_content.get('google')
243
- if isinstance(google_content, dict) and google_content.get('thought') is True:
244
- # This is a thought chunk, modify chunk_as_dict's delta in place
245
- reasoning_text = delta.get('content')
246
- if reasoning_text is not None:
247
- delta['reasoning_content'] = reasoning_text
248
-
249
- if 'content' in delta:
250
- del delta['content']
251
-
252
- # Always delete extra_content for thought chunks
253
- if 'extra_content' in delta:
254
- del delta['extra_content']
255
-
256
- # Yield the (potentially modified) dictionary as JSON
257
- print(chunk_as_dict)
258
- yield f"data: {json.dumps(chunk_as_dict)}\n\n"
259
-
260
- except Exception as chunk_processing_error: # Catch errors from dict manipulation or json.dumps
261
- error_msg_chunk = f"Error processing or serializing OpenAI chunk for {request.model}: {str(chunk_processing_error)}. Chunk: {str(chunk)[:200]}"
262
- print(f"ERROR: {error_msg_chunk}")
263
- # Truncate
264
- if len(error_msg_chunk) > 1024:
265
- error_msg_chunk = error_msg_chunk[:1024] + "..."
266
- error_response_chunk = create_openai_error_response(500, error_msg_chunk, "server_error")
267
- json_payload_for_chunk_error = json.dumps(error_response_chunk) # Ensure json is imported
268
- print(f"DEBUG: Yielding chunk processing error JSON payload (OpenAI path): {json_payload_for_chunk_error}")
269
- yield f"data: {json_payload_for_chunk_error}\n\n"
270
- yield "data: [DONE]\n\n"
271
- return # Stop further processing for this request
272
- yield "data: [DONE]\n\n"
273
- except Exception as stream_error:
274
- original_error_message = str(stream_error)
275
- # Truncate very long error messages
276
- if len(original_error_message) > 1024:
277
- original_error_message = original_error_message[:1024] + "..."
278
-
279
- error_msg_stream = f"Error during OpenAI client streaming for {request.model}: {original_error_message}"
280
- print(f"ERROR: {error_msg_stream}")
281
-
282
- error_response_content = create_openai_error_response(500, error_msg_stream, "server_error")
283
- json_payload_for_stream_error = json.dumps(error_response_content)
284
- print(f"DEBUG: Yielding stream error JSON payload (OpenAI path): {json_payload_for_stream_error}")
285
- yield f"data: {json_payload_for_stream_error}\n\n"
286
- yield "data: [DONE]\n\n"
287
- return StreamingResponse(openai_stream_generator(), media_type="text/event-stream")
288
- else: # Not streaming
 
 
 
 
 
 
 
 
289
  try:
 
 
290
  response = await openai_client.chat.completions.create(
 
291
  **openai_params,
292
  extra_body=openai_extra_body
293
  )
@@ -312,55 +325,19 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
312
  if isinstance(vertex_completion_tokens, int) and vertex_completion_tokens > 0:
313
  full_content = message_dict.get('content')
314
  if isinstance(full_content, str) and full_content:
315
-
316
- def _get_token_strings_and_split_texts_sync(creds, proj_id, loc, model_id_for_tokenizer, text_to_tokenize, num_completion_tokens_from_usage):
317
- sync_tokenizer_client = genai.Client(
318
- vertexai=True, credentials=creds, project=proj_id, location=loc,
319
- http_options=HttpOptions(api_version="v1")
320
- )
321
- if not text_to_tokenize: return "", text_to_tokenize, [] # No reasoning, original content, empty token list
322
-
323
- token_compute_response = sync_tokenizer_client.models.compute_tokens(
324
- model=model_id_for_tokenizer, contents=text_to_tokenize
325
- )
326
-
327
- all_final_token_strings = []
328
- if token_compute_response.tokens_info:
329
- for token_info_item in token_compute_response.tokens_info:
330
- for api_token_bytes in token_info_item.tokens:
331
- intermediate_str = api_token_bytes.decode('utf-8', errors='replace')
332
- final_token_text = ""
333
- try:
334
- b64_decoded_bytes = base64.b64decode(intermediate_str)
335
- final_token_text = b64_decoded_bytes.decode('utf-8', errors='replace')
336
- except Exception:
337
- final_token_text = intermediate_str
338
- all_final_token_strings.append(final_token_text)
339
-
340
- if not all_final_token_strings: # Should not happen if text_to_tokenize is not empty
341
- return "", text_to_tokenize, []
342
-
343
- if not (0 < num_completion_tokens_from_usage <= len(all_final_token_strings)):
344
- print(f"WARNING_TOKEN_SPLIT: num_completion_tokens_from_usage ({num_completion_tokens_from_usage}) is invalid for total client-tokenized tokens ({len(all_final_token_strings)}). Returning full content as 'content'.")
345
- return "", "".join(all_final_token_strings), all_final_token_strings
346
-
347
- completion_part_tokens = all_final_token_strings[-num_completion_tokens_from_usage:]
348
- reasoning_part_tokens = all_final_token_strings[:-num_completion_tokens_from_usage]
349
-
350
- reasoning_output_str = "".join(reasoning_part_tokens)
351
- completion_output_str = "".join(completion_part_tokens)
352
-
353
- return reasoning_output_str, completion_output_str, all_final_token_strings
354
-
355
  model_id_for_tokenizer = base_model_name
356
 
357
  reasoning_text, actual_content, dbg_all_tokens = await asyncio.to_thread(
358
- _get_token_strings_and_split_texts_sync,
359
- rotated_credentials, PROJECT_ID, LOCATION,
360
- model_id_for_tokenizer, full_content, vertex_completion_tokens
 
 
 
 
361
  )
362
 
363
- message_dict['content'] = actual_content # Set the new content (potentially from joined tokens)
364
  if reasoning_text: # Only add reasoning_content if it's not empty
365
  message_dict['reasoning_content'] = reasoning_text
366
  print(f"DEBUG_REASONING_SPLIT_DIRECT_JOIN: Successful. Reasoning len: {len(reasoning_text)}. Content len: {len(actual_content)}")
 
22
  from message_processing import (
23
  create_gemini_prompt,
24
  create_encrypted_gemini_prompt,
25
+ create_encrypted_full_gemini_prompt,
26
+ split_text_by_completion_tokens # Added
27
  )
28
  from api_helpers import (
29
  create_generation_config,
30
  create_openai_error_response,
31
+ execute_gemini_call,
32
+ openai_fake_stream_generator # Added
33
  )
34
 
35
  router = APIRouter()
 
224
  }
225
 
226
  if request.stream:
227
+ if app_config.FAKE_STREAMING_ENABLED:
228
+ print(f"INFO: OpenAI Fake Streaming (SSE Simulation) ENABLED for model '{request.model}'.")
229
+ # openai_params already has "stream": True from initial setup,
230
+ # but openai_fake_stream_generator will make a stream=False call internally.
231
+ # Call the now async generator
232
+ return StreamingResponse(
233
+ await openai_fake_stream_generator( # Added await
234
+ openai_client=openai_client,
235
+ openai_params=openai_params,
236
+ openai_extra_body=openai_extra_body,
237
+ request_obj=request,
238
+ is_auto_attempt=False,
239
+ # --- New parameters for tokenizer and reasoning split ---
240
+ gcp_credentials=rotated_credentials,
241
+ gcp_project_id=PROJECT_ID, # This is rotated_project_id
242
+ gcp_location=LOCATION, # This is "global"
243
+ base_model_id_for_tokenizer=base_model_name # Stripped model ID for tokenizer
244
+ ),
245
+ media_type="text/event-stream"
246
+ )
247
+ else: # Regular OpenAI streaming
248
+ print(f"INFO: OpenAI True Streaming ENABLED for model '{request.model}'.")
249
+ async def openai_true_stream_generator(): # Renamed to avoid conflict
250
+ try:
251
+ # Ensure stream=True is explicitly passed for real streaming
252
+ openai_params_for_true_stream = {**openai_params, "stream": True}
253
+ stream_response = await openai_client.chat.completions.create(
254
+ **openai_params_for_true_stream,
255
+ extra_body=openai_extra_body
256
+ )
257
+ async for chunk in stream_response:
258
+ try:
259
+ chunk_as_dict = chunk.model_dump(exclude_unset=True, exclude_none=True)
260
+
261
+ choices = chunk_as_dict.get('choices')
262
+ if choices and isinstance(choices, list) and len(choices) > 0:
263
+ delta = choices[0].get('delta')
264
+ if delta and isinstance(delta, dict):
265
+ extra_content = delta.get('extra_content')
266
+ if isinstance(extra_content, dict):
267
+ google_content = extra_content.get('google')
268
+ if isinstance(google_content, dict) and google_content.get('thought') is True:
269
+ reasoning_text = delta.get('content')
270
+ if reasoning_text is not None:
271
+ delta['reasoning_content'] = reasoning_text
272
+ if 'content' in delta: del delta['content']
273
+ if 'extra_content' in delta: del delta['extra_content']
274
+
275
+ # print(f"DEBUG OpenAI Stream Chunk: {chunk_as_dict}") # Potential verbose log
276
+ yield f"data: {json.dumps(chunk_as_dict)}\n\n"
277
+
278
+ except Exception as chunk_processing_error:
279
+ error_msg_chunk = f"Error processing/serializing OpenAI chunk for {request.model}: {str(chunk_processing_error)}. Chunk: {str(chunk)[:200]}"
280
+ print(f"ERROR: {error_msg_chunk}")
281
+ if len(error_msg_chunk) > 1024: error_msg_chunk = error_msg_chunk[:1024] + "..."
282
+ error_response_chunk = create_openai_error_response(500, error_msg_chunk, "server_error")
283
+ json_payload_for_chunk_error = json.dumps(error_response_chunk)
284
+ yield f"data: {json_payload_for_chunk_error}\n\n"
285
+ yield "data: [DONE]\n\n"
286
+ return
287
+ yield "data: [DONE]\n\n"
288
+ except Exception as stream_error:
289
+ original_error_message = str(stream_error)
290
+ if len(original_error_message) > 1024: original_error_message = original_error_message[:1024] + "..."
291
+ error_msg_stream = f"Error during OpenAI client true streaming for {request.model}: {original_error_message}"
292
+ print(f"ERROR: {error_msg_stream}")
293
+ error_response_content = create_openai_error_response(500, error_msg_stream, "server_error")
294
+ json_payload_for_stream_error = json.dumps(error_response_content)
295
+ yield f"data: {json_payload_for_stream_error}\n\n"
296
+ yield "data: [DONE]\n\n"
297
+ return StreamingResponse(openai_true_stream_generator(), media_type="text/event-stream")
298
+ else: # Not streaming (is_openai_direct_model and not request.stream)
299
  try:
300
+ # Ensure stream=False is explicitly passed for non-streaming
301
+ openai_params_for_non_stream = {**openai_params, "stream": False}
302
  response = await openai_client.chat.completions.create(
303
+ **openai_params_for_non_stream,
304
  **openai_params,
305
  extra_body=openai_extra_body
306
  )
 
325
  if isinstance(vertex_completion_tokens, int) and vertex_completion_tokens > 0:
326
  full_content = message_dict.get('content')
327
  if isinstance(full_content, str) and full_content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  model_id_for_tokenizer = base_model_name
329
 
330
  reasoning_text, actual_content, dbg_all_tokens = await asyncio.to_thread(
331
+ split_text_by_completion_tokens, # Use imported function
332
+ rotated_credentials,
333
+ PROJECT_ID,
334
+ LOCATION,
335
+ model_id_for_tokenizer,
336
+ full_content,
337
+ vertex_completion_tokens
338
  )
339
 
340
+ message_dict['content'] = actual_content
341
  if reasoning_text: # Only add reasoning_content if it's not empty
342
  message_dict['reasoning_content'] = reasoning_text
343
  print(f"DEBUG_REASONING_SPLIT_DIRECT_JOIN: Successful. Reasoning len: {len(reasoning_text)}. Content len: {len(actual_content)}")