"""
LLM Inference Server main application using LitServe framework.
"""
from sys import platform
import litserve as ls
import logging
import os
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import login
from .routes import router, init_router
from .api import InferenceApi
from .utils import load_config
# Store process list globally so it doesn't get garbage collected
_WORKER_PROCESSES = []
_MANAGER = None
# Load configuration
config = load_config()
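# The config shape assumed in this module is inferred from the keys accessed
# below (server.host/port/timeout/max_batch_size, logging.level,
# llm_server.api_prefix); the real config file may contain more. A minimal
# sketch:
#
#   server:
#     host: 0.0.0.0
#     port: 8000
#     timeout: 60
#     max_batch_size: 1
#   llm_server:
#     api_prefix: /api/v1
#   logging:
#     level: INFO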


def setup_logging():
    """Set up basic logging configuration."""
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    return logging.getLogger(__name__)


def create_app():
    """Create and configure the application instance."""
    global _WORKER_PROCESSES, _MANAGER, config

    logger = setup_logging()

    # Log into Hugging Face Hub
    access_token = os.environ.get("InfAPITokenWrite")
    if access_token:
        try:
            login(token=access_token)
            logger.info("Successfully logged into Hugging Face Hub")
        except Exception as e:
            logger.error(f"Failed to login to Hugging Face Hub: {str(e)}")
    else:
        logger.warning("No Hugging Face access token found")

    server_config = config.get('server', {})

    # Initialize API with config
    api = InferenceApi(config)

    # Initialize router with API instance
    init_router(api, config)

    if platform == "darwin":  # Darwin is macOS
        server = ls.LitServer(
            api,
            timeout=server_config.get('timeout', 60),
            max_batch_size=server_config.get('max_batch_size', 1),
            track_requests=True,
            accelerator="cpu"  # Force CPU on Mac
        )
    else:
        server = ls.LitServer(
            api,
            timeout=server_config.get('timeout', 60),
            max_batch_size=server_config.get('max_batch_size', 1),
            track_requests=True
        )
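    # Rather than calling server.run(), the inference workers are launched
    # manually so the underlying FastAPI app can be served by an external
    # uvicorn process. Note: launch_inference_worker() and response_queue_id
    # (set below) are LitServe internals, so this wiring may break across
    # LitServe versions.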
    # Launch inference workers (assuming a single uvicorn server for now)
    _MANAGER, _WORKER_PROCESSES = server.launch_inference_worker(num_uvicorn_servers=1)

    # Get the FastAPI app
    app = server.app
    # Add CORS middleware
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Add routes with configured prefix
    api_prefix = config.get('llm_server', {}).get('api_prefix', '/api/v1')
    app.include_router(router, prefix=api_prefix)

    # Set the response queue ID for the app
    app.response_queue_id = 0  # Since we're using a single worker

    return app

# Create the app instance for uvicorn
app = create_app()
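# With `app` exposed at module level, the server can also be started by an
# external runner; the module path here is an assumption (this file's package
# name isn't shown), e.g.:
#
#   uvicorn app.main:app --host 0.0.0.0 --port 8000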

if __name__ == "__main__":
    # Run the app with uvicorn
    import uvicorn

    host = config["server"]["host"]
    port = config["server"]["port"]
    uvicorn.run(
        app,
        host=host,
        port=port,
        log_level=config["logging"]["level"].lower()
    )