""" LLM Inference Server main application using LitServe framework. """ import litserve as ls import yaml import logging import os from pathlib import Path from fastapi.middleware.cors import CORSMiddleware from huggingface_hub import login from .routes import router, init_router from .api import InferenceApi # Store process list globally so it doesn't get garbage collected _WORKER_PROCESSES = [] _MANAGER = None def setup_logging(): """Set up basic logging configuration""" logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) return logging.getLogger(__name__) def load_config(): """Load configuration from config.yaml""" config_path = Path(__file__).parent / "config.yaml" with open(config_path) as f: return yaml.safe_load(f) def create_app(): """Create and configure the application instance.""" global _WORKER_PROCESSES, _MANAGER logger = setup_logging() # Log into Hugging Face Hub access_token = os.environ.get("InferenceAPI") if access_token: try: login(token=access_token) logger.info("Successfully logged into Hugging Face Hub") except Exception as e: logger.error(f"Failed to login to Hugging Face Hub: {str(e)}") else: logger.warning("No Hugging Face access token found") config = load_config() server_config = config.get('server', {}) # Initialize API with config api = InferenceApi(config) # Initialize router with API instance init_router(api) # Create LitServer instance server = ls.LitServer( api, timeout=server_config.get('timeout', 60), max_batch_size=server_config.get('max_batch_size', 1), track_requests=True ) # Launch inference workers (assuming single uvicorn worker for now) _MANAGER, _WORKER_PROCESSES = server.launch_inference_worker(num_uvicorn_servers=1) # Get the FastAPI app app = server.app # Add CORS middleware app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Add routes with configured prefix api_prefix = config.get('llm_server', {}).get('api_prefix', '/api/v1') app.include_router(router, prefix=api_prefix) # Set the response queue ID for the app app.response_queue_id = 0 # Since we're using a single worker return app # Create the app instance for uvicorn app = create_app()