Spaces:

TeamGenKI
/

Inference-API

Runtime error

App Files Files Community

Inference-API / main /main.py

AurelioAguirre

Updated Dockerfile

9de4eee 5 months ago

raw

history blame

3.03 kB

	"""
	LLM Inference Server main application using LitServe framework.
	"""
	from sys import platform

	import litserve as ls
	import logging
	import os
	from fastapi.middleware.cors import CORSMiddleware
	from huggingface_hub import login
	from .routes import router, init_router
	from .api import InferenceApi
	from .utils import load_config

	# Configuration is loaded once, at import time, and shared by the whole module
	config = load_config()

	# Module-level slots filled in by create_app(); holding the process list
	# globally keeps it from being garbage collected
	_MANAGER = None
	_WORKER_PROCESSES = []


	def setup_logging():
	    """Apply the basic logging configuration and return this module's logger.

	    The root logger is configured at DEBUG level with a
	    timestamp / logger name / level / message line format.

	    Returns:
	        logging.Logger: The logger named after this module (``__name__``).
	    """
	    line_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
	    logging.basicConfig(format=line_format, level=logging.DEBUG)

	    module_logger = logging.getLogger(__name__)
	    return module_logger


	def create_app():
	    """Build the LitServe server and return its configured app.

	    Steps, in order:
	      1. Configure logging.
	      2. Try to log into the Hugging Face Hub with the token found in the
	         ``InfAPITokenWrite`` environment variable (a failure is logged,
	         not raised; a missing token only produces a warning).
	      3. Create the ``InferenceApi`` and pass it to the router.
	      4. Create the ``LitServer`` (forced onto CPU when running on macOS)
	         and launch its inference workers.
	      5. Attach CORS middleware and the API routes to the server's app.

	    Returns:
	        The ``server.app`` object, with middleware, routes and
	        ``response_queue_id`` set.
	    """
	    global _WORKER_PROCESSES, _MANAGER, config

	    logger = setup_logging()

	    # Hugging Face Hub authentication is optional: skip it without a token
	    hf_token = os.environ.get("InfAPITokenWrite")
	    if not hf_token:
	        logger.warning("No Hugging Face access token found")
	    else:
	        try:
	            login(token=hf_token)
	            logger.info("Successfully logged into Hugging Face Hub")
	        except Exception as e:
	            logger.error(f"Failed to login to Hugging Face Hub: {str(e)}")

	    settings = config.get('server', {})

	    # The API object is shared between the router and the LitServe server
	    inference_api = InferenceApi(config)
	    init_router(inference_api, config)

	    # Options common to every platform; macOS ("darwin") additionally
	    # pins the accelerator to CPU
	    server_options = {
	        "timeout": settings.get('timeout', 60),
	        "max_batch_size": settings.get('max_batch_size', 1),
	        "track_requests": True,
	    }
	    if platform == "darwin":
	        server_options["accelerator"] = "cpu"
	    server = ls.LitServer(inference_api, **server_options)

	    # Keep the manager and worker handles in module globals
	    # (a single uvicorn worker is assumed for now)
	    launched = server.launch_inference_worker(num_uvicorn_servers=1)
	    _MANAGER, _WORKER_PROCESSES = launched

	    # The FastAPI app owned by the server
	    fastapi_app = server.app

	    # CORS: accept any origin, method and header, with credentials allowed
	    fastapi_app.add_middleware(
	        CORSMiddleware,
	        allow_origins=["*"],
	        allow_credentials=True,
	        allow_methods=["*"],
	        allow_headers=["*"],
	    )

	    # Mount the routes under the configured prefix
	    llm_server_settings = config.get('llm_server', {})
	    route_prefix = llm_server_settings.get('api_prefix', '/api/v1')
	    fastapi_app.include_router(router, prefix=route_prefix)

	    # Response queue ID is 0 since a single worker is used
	    fastapi_app.response_queue_id = 0

	    return fastapi_app

	# Module-level app instance, created at import time for uvicorn to serve
	app = create_app()

	if __name__ == "__main__":
	    # Direct execution: serve the app with uvicorn, taking host, port and
	    # log level from the loaded configuration
	    import uvicorn

	    server_settings = config["server"]
	    bind_host = server_settings["host"]
	    bind_port = server_settings["port"]
	    uvicorn_log_level = config["logging"]["level"].lower()

	    uvicorn.run(app, host=bind_host, port=bind_port, log_level=uvicorn_log_level)