Commit a8e6344 (parent: d67404e): dynamic model list

Files changed:
- app/config.py +3 -0
- app/main.py +2 -2
- app/requirements.txt +2 -1
- app/routes/chat_api.py +69 -23
- app/routes/models_api.py +48 -42
- app/vertex_ai_init.py +18 -11
app/config.py (CHANGED)

```diff
@@ -19,4 +19,7 @@ VERTEX_EXPRESS_API_KEY_VAL = os.environ.get("VERTEX_EXPRESS_API_KEY")
 FAKE_STREAMING_ENABLED = os.environ.get("FAKE_STREAMING", "false").lower() == "true"
 FAKE_STREAMING_INTERVAL_SECONDS = float(os.environ.get("FAKE_STREAMING_INTERVAL", "1.0"))
 
+# URL for the remote JSON file containing model lists
+MODELS_CONFIG_URL = os.environ.get("MODELS_CONFIG_URL", "https://gist.githubusercontent.com/gzzhongqi/e0b684f319437a859bcf5bd6203fd1f6/raw")
+
 # Validation logic moved to app/auth.py
```
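The gist behind MODELS_CONFIG_URL is not part of this diff, so its exact schema is not visible here. Judging from the loader functions that consume it (get_vertex_models and get_vertex_express_models, used in the files below), a plausible shape would be the following; the two key names and the model IDs are assumptions for illustration:

```json
{
  "vertex_models": [
    "gemini-2.5-pro-preview-05-06",
    "gemini-2.5-flash-preview-04-17"
  ],
  "vertex_express_models": [
    "gemini-2.0-flash-lite-001",
    "gemini-2.5-pro-preview-03-25",
    "gemini-2.5-flash-preview-04-17",
    "gemini-2.5-pro-preview-05-06"
  ]
}
```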
app/main.py (CHANGED)

```diff
@@ -35,8 +35,8 @@ app.include_router(chat_api.router)
 
 @app.on_event("startup")
 async def startup_event():
-    if init_vertex_ai(credential_manager):
-        print("INFO:
+    if await init_vertex_ai(credential_manager): # Added await
+        print("INFO: Vertex AI credential and model config initialization check completed successfully.")
     else:
         print("ERROR: Failed to initialize a fallback Vertex AI client. API will likely fail.")
 
```
app/requirements.txt (CHANGED)

```diff
@@ -3,4 +3,5 @@ uvicorn==0.27.1
 google-auth==2.38.0
 google-cloud-aiplatform==1.86.0
 pydantic==2.6.1
-google-genai==1.13.0
+google-genai==1.13.0
+httpx>=0.25.0
```
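httpx is presumably what the new model_loader module (imported in the files below but absent from this diff view) uses to fetch MODELS_CONFIG_URL asynchronously; a hedged sketch of that module appears at the end of this page.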
app/routes/chat_api.py (CHANGED)

```diff
@@ -1,6 +1,6 @@
 import asyncio
 import json # Needed for error streaming
-from fastapi import APIRouter, Depends, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing import List, Dict, Any
 
@@ -8,12 +8,12 @@ from typing import List, Dict, Any
 from google.genai import types
 from google import genai
 
-# Local module imports
+# Local module imports
 from models import OpenAIRequest, OpenAIMessage
 from auth import get_api_key
-
+from main import credential_manager # Accessing the instance from main.py
 import config as app_config
-from
+from model_loader import get_vertex_models, get_vertex_express_models # Import from model_loader
 from message_processing import (
     create_gemini_prompt,
     create_encrypted_gemini_prompt,
@@ -27,12 +27,38 @@ from api_helpers import (
 
 router = APIRouter()
 
-
 @router.post("/v1/chat/completions")
 async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api_key: str = Depends(get_api_key)):
     try:
-        # Access credential_manager from app state
         credential_manager_instance = fastapi_request.app.state.credential_manager
+
+        # Dynamically fetch allowed models for validation
+        vertex_model_ids = await get_vertex_models()
+        # Suffixes that can be appended to base models.
+        # The remote model config should ideally be the source of truth for all valid permutations.
+        standard_suffixes = ["-search", "-encrypt", "-encrypt-full", "-auto"]
+        special_suffix_map = { # For models with unique suffixes not covered by standard ones
+            "gemini-2.5-flash-preview-04-17": ["-nothinking", "-max"]
+        }
+
+        all_allowed_model_ids = set(vertex_model_ids) # Start with base models from config
+        for base_id in vertex_model_ids: # Iterate over base models to add suffixed versions
+            for suffix in standard_suffixes:
+                all_allowed_model_ids.add(f"{base_id}{suffix}")
+            if base_id in special_suffix_map:
+                for special_suffix in special_suffix_map[base_id]:
+                    all_allowed_model_ids.add(f"{base_id}{special_suffix}")
+
+        # Add express models to the allowed list as well, as they are distinct
+        # and might not be covered by the base vertex_models list from remote config.
+        # Alternatively, the remote config's vertex_models should include express models if they are also usable as base.
+        vertex_express_model_ids = await get_vertex_express_models()
+        all_allowed_model_ids.update(vertex_express_model_ids)
+
+
+        if not request.model or request.model not in all_allowed_model_ids:
+            return JSONResponse(status_code=400, content=create_openai_error_response(400, f"Model '{request.model}' not found or not supported by this adapter. Valid models are: {sorted(list(all_allowed_model_ids))}", "invalid_request_error"))
+
         is_auto_model = request.model.endswith("-auto")
         is_grounded_search = request.model.endswith("-search")
         is_encrypted_model = request.model.endswith("-encrypt")
@@ -41,18 +67,28 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
         is_max_thinking_model = request.model.endswith("-max")
         base_model_name = request.model
 
-
-
-
-        elif
-        elif
-        elif
+        # Determine base_model_name by stripping known suffixes
+        # This order matters if a model could have multiple (e.g. -encrypt-auto, though not currently a pattern)
+        if is_auto_model: base_model_name = request.model[:-len("-auto")]
+        elif is_grounded_search: base_model_name = request.model[:-len("-search")]
+        elif is_encrypted_full_model: base_model_name = request.model[:-len("-encrypt-full")] # Must be before -encrypt
+        elif is_encrypted_model: base_model_name = request.model[:-len("-encrypt")]
+        elif is_nothinking_model: base_model_name = request.model[:-len("-nothinking")]
+        elif is_max_thinking_model: base_model_name = request.model[:-len("-max")]
+
+        # Specific model variant checks (if any remain exclusive and not covered dynamically)
+        if is_nothinking_model and base_model_name != "gemini-2.5-flash-preview-04-17":
+            return JSONResponse(status_code=400, content=create_openai_error_response(400, f"Model '{request.model}' (-nothinking) is only supported for 'gemini-2.5-flash-preview-04-17'.", "invalid_request_error"))
+        if is_max_thinking_model and base_model_name != "gemini-2.5-flash-preview-04-17":
+            return JSONResponse(status_code=400, content=create_openai_error_response(400, f"Model '{request.model}' (-max) is only supported for 'gemini-2.5-flash-preview-04-17'.", "invalid_request_error"))
+
         generation_config = create_generation_config(request)
 
         client_to_use = None
         express_api_key_val = app_config.VERTEX_EXPRESS_API_KEY_VAL
-
-
+
+        # Use dynamically fetched express models list for this check
+        if express_api_key_val and base_model_name in vertex_express_model_ids: # Check against base_model_name
             try:
                 client_to_use = genai.Client(vertexai=True, api_key=express_api_key_val)
                 print(f"INFO: Using Vertex Express Mode for model {base_model_name}.")
@@ -74,28 +110,28 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
             print("ERROR: No Vertex AI client could be initialized via Express Mode or Rotated Credentials.")
             return JSONResponse(status_code=500, content=create_openai_error_response(500, "Vertex AI client not available. Ensure credentials are set up correctly (env var or files).", "server_error"))
 
-
+        encryption_instructions_placeholder = ["// Protocol Instructions Placeholder //"] # Actual instructions are in message_processing
 
         if is_auto_model:
             print(f"Processing auto model: {request.model}")
             attempts = [
                 {"name": "base", "model": base_model_name, "prompt_func": create_gemini_prompt, "config_modifier": lambda c: c},
-                {"name": "encrypt", "model": base_model_name, "prompt_func": create_encrypted_gemini_prompt, "config_modifier": lambda c: {**c, "system_instruction":
+                {"name": "encrypt", "model": base_model_name, "prompt_func": create_encrypted_gemini_prompt, "config_modifier": lambda c: {**c, "system_instruction": encryption_instructions_placeholder}},
                 {"name": "old_format", "model": base_model_name, "prompt_func": create_encrypted_full_gemini_prompt, "config_modifier": lambda c: c}
             ]
             last_err = None
             for attempt in attempts:
-                print(f"Auto-mode attempting: '{attempt['name']}'")
+                print(f"Auto-mode attempting: '{attempt['name']}' for model {attempt['model']}")
                 current_gen_config = attempt["config_modifier"](generation_config.copy())
                 try:
                     return await execute_gemini_call(client_to_use, attempt["model"], attempt["prompt_func"], current_gen_config, request)
                 except Exception as e_auto:
                     last_err = e_auto
-                    print(f"Auto-attempt '{attempt['name']}' failed: {e_auto}")
+                    print(f"Auto-attempt '{attempt['name']}' for model {attempt['model']} failed: {e_auto}")
                     await asyncio.sleep(1)
 
             print(f"All auto attempts failed. Last error: {last_err}")
-            err_msg = f"All auto-mode attempts failed for {request.model}. Last error: {str(last_err)}"
+            err_msg = f"All auto-mode attempts failed for model {request.model}. Last error: {str(last_err)}"
             if not request.stream and last_err:
                 return JSONResponse(status_code=500, content=create_openai_error_response(500, err_msg, "server_error"))
             elif request.stream:
@@ -106,23 +142,33 @@ async def chat_completions(fastapi_request: Request, request: OpenAIRequest, api
                 return StreamingResponse(final_error_stream(), media_type="text/event-stream")
             return JSONResponse(status_code=500, content=create_openai_error_response(500, "All auto-mode attempts failed without specific error.", "server_error"))
 
-        else:
+        else: # Not an auto model
            current_prompt_func = create_gemini_prompt
+            # Determine the actual model string to call the API with (e.g., "gemini-1.5-pro-search")
+            api_model_string = request.model
+
            if is_grounded_search:
                search_tool = types.Tool(google_search=types.GoogleSearch())
                generation_config["tools"] = [search_tool]
            elif is_encrypted_model:
-                generation_config["system_instruction"] =
+                generation_config["system_instruction"] = encryption_instructions_placeholder
                current_prompt_func = create_encrypted_gemini_prompt
            elif is_encrypted_full_model:
-                generation_config["system_instruction"] =
+                generation_config["system_instruction"] = encryption_instructions_placeholder
                current_prompt_func = create_encrypted_full_gemini_prompt
            elif is_nothinking_model:
                generation_config["thinking_config"] = {"thinking_budget": 0}
            elif is_max_thinking_model:
                generation_config["thinking_config"] = {"thinking_budget": 24576}
 
-
+            # For non-auto models, the 'base_model_name' might have suffix stripped.
+            # We should use the original 'request.model' for API call if it's a suffixed one,
+            # or 'base_model_name' if it's truly a base model without suffixes.
+            # The current logic uses 'base_model_name' for the API call in the 'else' block.
+            # This means if `request.model` was "gemini-1.5-pro-search", `base_model_name` becomes "gemini-1.5-pro"
+            # but the API call might need the full "gemini-1.5-pro-search".
+            # Let's use `request.model` for the API call here, and `base_model_name` for checks like Express eligibility.
+            return await execute_gemini_call(client_to_use, api_model_string, current_prompt_func, generation_config, request)
 
     except Exception as e:
         error_msg = f"Unexpected error in chat_completions endpoint: {str(e)}"
```
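To see what the new validation accepts, the suffix-expansion logic from chat_completions can be exercised in isolation. A minimal standalone sketch, with illustrative base model IDs standing in for the remotely fetched list:

```python
# Sketch of the allowed-model expansion performed in chat_completions.
# Base IDs are illustrative; in the app they come from the remote config.
vertex_model_ids = ["gemini-2.5-pro-preview-05-06", "gemini-2.5-flash-preview-04-17"]

standard_suffixes = ["-search", "-encrypt", "-encrypt-full", "-auto"]
special_suffix_map = {"gemini-2.5-flash-preview-04-17": ["-nothinking", "-max"]}

all_allowed_model_ids = set(vertex_model_ids)
for base_id in vertex_model_ids:
    for suffix in standard_suffixes:
        all_allowed_model_ids.add(f"{base_id}{suffix}")
    for special_suffix in special_suffix_map.get(base_id, []):
        all_allowed_model_ids.add(f"{base_id}{special_suffix}")

# Note the stripping order in the handler: "-encrypt-full" must be checked
# before "-encrypt", since str.endswith("-encrypt") is also true for
# "...-encrypt-full" and would leave "-full" dangling on the base name.
assert "gemini-2.5-flash-preview-04-17-max" in all_allowed_model_ids
assert "gemini-2.5-pro-preview-05-06-max" not in all_allowed_model_ids
```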
app/routes/models_api.py (CHANGED)

```diff
@@ -1,49 +1,55 @@
 import time
 from fastapi import APIRouter, Depends
-
-
-from
+from typing import List, Dict, Any # Will be needed for constructing model dicts
+from auth import get_api_key
+from ..model_loader import get_vertex_models, get_vertex_express_models, refresh_models_config_cache
 
 router = APIRouter()
 
 @router.get("/v1/models")
 async def list_models(api_key: str = Depends(get_api_key)):
-    # [previous hard-coded model list implementation, 39 lines, removed]
+    # Attempt to refresh the cache. If it fails, getters will use the old cache.
+    await refresh_models_config_cache()
+
+    vertex_model_ids = await get_vertex_models()
+    vertex_express_model_ids = await get_vertex_express_models()
+
+    # Combine and unique model IDs.
+    # We should also consider creating the OpenAI model suffixes (-search, -encrypt, -auto)
+    # based on the base models available, similar to how chat_api.py currently does.
+    # For simplicity here, we'll list all unique base models from the config
+    # and then also list the specific variations.
+
+    all_model_ids = set(vertex_model_ids + vertex_express_model_ids)
+
+    # Create extended model list with variations (search, encrypt, auto etc.)
+    # This logic might need to be more sophisticated based on actual supported features per base model.
+    # For now, let's assume for each base model, we might have these variations.
+    # A better approach would be if the remote config specified these variations.
+
+    dynamic_models_data: List[Dict[str, Any]] = []
+    current_time = int(time.time())
+
+    # Add base models
+    for model_id in sorted(list(all_model_ids)):
+        dynamic_models_data.append({
+            "id": model_id, "object": "model", "created": current_time, "owned_by": "google",
+            "permission": [], "root": model_id, "parent": None
+        })
+        # Add common variations if not already present directly in fetched list (more robust if config provides these)
+        # This part is a simplification and might create non-existent model permutations
+        # if not all base models support all suffixes.
+        suffixes = ["-search", "-encrypt", "-encrypt-full", "-auto"]
+        # Special suffixes like -nothinking, -max are very model specific, harder to generalize here
+        for suffix in suffixes:
+            suffixed_id = f"{model_id}{suffix}"
+            if suffixed_id not in all_model_ids: # Avoid duplicates if config already lists them
+                dynamic_models_data.append({
+                    "id": suffixed_id, "object": "model", "created": current_time, "owned_by": "google",
+                    "permission": [], "root": model_id, "parent": None
+                })
+
+    # Ensure uniqueness again after adding suffixes (in case some suffixed models were also in base lists)
+    final_models_data_map = {m["id"]: m for m in dynamic_models_data}
+
+    return {"object": "list", "data": list(final_models_data_map.values())}
```
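For reference, the response produced by this handler follows the OpenAI model-object layout built above: each base model appears once, and each generated suffix variant carries the base model in its "root" field. An illustrative excerpt (the created timestamp is whatever int(time.time()) returned):

```json
{
  "object": "list",
  "data": [
    {
      "id": "gemini-2.5-pro-preview-05-06",
      "object": "model",
      "created": 1715000000,
      "owned_by": "google",
      "permission": [],
      "root": "gemini-2.5-pro-preview-05-06",
      "parent": null
    },
    {
      "id": "gemini-2.5-pro-preview-05-06-search",
      "object": "model",
      "created": 1715000000,
      "owned_by": "google",
      "permission": [],
      "root": "gemini-2.5-pro-preview-05-06",
      "parent": null
    }
  ]
}
```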
app/vertex_ai_init.py (CHANGED)

```diff
@@ -1,20 +1,17 @@
 import json
+import asyncio # Added for await
 from google import genai
-from credentials_manager import CredentialManager, parse_multiple_json_credentials
-import config as app_config
+from credentials_manager import CredentialManager, parse_multiple_json_credentials
+import config as app_config
+from model_loader import refresh_models_config_cache # Import new model loader function
 
-#
-VERTEX_EXPRESS_MODELS
-
-    "gemini-2.0-flash-lite-001",
-    "gemini-2.5-pro-preview-03-25",
-    "gemini-2.5-flash-preview-04-17",
-    "gemini-2.5-pro-preview-05-06",
-]
+# VERTEX_EXPRESS_MODELS list is now dynamically loaded via model_loader
+# The constant VERTEX_EXPRESS_MODELS previously defined here is removed.
+# Consumers should use get_vertex_express_models() from model_loader.
 
 # Global 'client' and 'get_vertex_client()' are removed.
 
-def init_vertex_ai(credential_manager_instance: CredentialManager) -> bool:
+async def init_vertex_ai(credential_manager_instance: CredentialManager) -> bool: # Made async
     """
     Initializes the credential manager with credentials from GOOGLE_CREDENTIALS_JSON (if provided)
     and verifies if any credentials (environment or file-based through the manager) are available.
@@ -65,6 +62,16 @@ def init_vertex_ai(credential_manager_instance: CredentialManager) -> bool:
     else:
         print("INFO: GOOGLE_CREDENTIALS_JSON environment variable not found.")
 
+    # Attempt to pre-warm the model configuration cache
+    print("INFO: Attempting to pre-warm model configuration cache during startup...")
+    models_loaded_successfully = await refresh_models_config_cache()
+    if models_loaded_successfully:
+        print("INFO: Model configuration cache pre-warmed successfully.")
+    else:
+        print("WARNING: Failed to pre-warm model configuration cache during startup. It will be loaded lazily on first request.")
+    # We don't necessarily fail the entire init_vertex_ai if model list fetching fails,
+    # as credential validation might still be important, and model list can be fetched later.
+
     # CredentialManager's __init__ calls load_credentials_list() for files.
     # refresh_credentials_list() re-scans files and combines with in-memory (already includes env creds if loaded above).
     # The return value of refresh_credentials_list indicates if total > 0
```
|