""" LLM Inference Server main application using LitServe framework. """ import litserve as ls import yaml import logging import multiprocessing as mp import os from pathlib import Path from fastapi.middleware.cors import CORSMiddleware from huggingface_hub import login from .routes import router, init_router from .api import InferenceApi # Store process list globally so it doesn't get garbage collected _WORKER_PROCESSES = [] _MANAGER = None def setup_logging(): """Set up basic logging configuration""" logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) return logging.getLogger(__name__) def load_config(): """Load configuration from config.yaml""" config_path = Path(__file__).parent / "config.yaml" with open(config_path) as f: return yaml.safe_load(f) def create_app(): """Create and configure the application instance.""" global _WORKER_PROCESSES, _MANAGER logger = setup_logging() # Log into Hugging Face Hub access_token = os.environ.get("InferenceAPI") if access_token: try: login(token=access_token) logger.info("Successfully logged into Hugging Face Hub") except Exception as e: logger.error(f"Failed to login to Hugging Face Hub: {str(e)}") else: logger.warning("No Hugging Face access token found") config = load_config() server_config = config.get('server', {}) # Initialize API with config api = InferenceApi(config) # Initialize router with API instance init_router(api) # Create LitServer instance server = ls.LitServer( api, timeout=server_config.get('timeout', 60), max_batch_size=server_config.get('max_batch_size', 1), track_requests=True ) # Launch inference workers (assuming single uvicorn worker for now) _MANAGER, _WORKER_PROCESSES = server.launch_inference_worker(num_uvicorn_servers=1) # Get the FastAPI app app = server.app # Add CORS middleware app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Add routes with configured prefix api_prefix = config.get('llm_server', {}).get('api_prefix', '/api/v1') app.include_router(router, prefix=api_prefix) # Set the response queue ID for the app app.response_queue_id = 0 # Since we're using a single worker return app # Create the app instance for uvicorn app = create_app()