"""
LLM Inference Server main application using LitServe framework.
"""
import logging
import os
from sys import platform

import litserve as ls
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import login

from .api import InferenceApi
from .routes import router, init_router
from .utils import load_config

# Store process list globally so it doesn't get garbage collected
_WORKER_PROCESSES = []
_MANAGER = None

# Load configuration
config = load_config()
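# The lookups below imply a config shape roughly like this (illustrative
# sketch only; the real schema comes from .utils.load_config, and the
# host/port/level values shown here are assumptions):
#
#   server:
#     host: "0.0.0.0"
#     port: 8000
#     timeout: 60
#     max_batch_size: 1
#   llm_server:
#     api_prefix: "/api/v1"
#   logging:
#     level: "INFO"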


def setup_logging():
    """Set up basic logging configuration"""
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    return logging.getLogger(__name__)


def create_app():
    """Create and configure the application instance."""
    global _WORKER_PROCESSES, _MANAGER, config

    logger = setup_logging()

    # Log into Hugging Face Hub
    access_token = os.environ.get("InfAPITokenWrite")
    if access_token:
        try:
            login(token=access_token)
            logger.info("Successfully logged into Hugging Face Hub")
        except Exception as e:
            logger.error(f"Failed to login to Hugging Face Hub: {str(e)}")
    else:
        logger.warning("No Hugging Face access token found")

    server_config = config.get('server', {})

    # Initialize API with config
    api = InferenceApi(config)

    # Initialize router with API instance
    init_router(api, config)

    if platform == "darwin":  # Darwin is macOS
        server = ls.LitServer(
            api,
            timeout=server_config.get('timeout', 60),
            max_batch_size=server_config.get('max_batch_size', 1),
            track_requests=True,
            accelerator="cpu"  # Force CPU on Mac
        )
    else:
        server = ls.LitServer(
            api,
            timeout=server_config.get('timeout', 60),
            max_batch_size=server_config.get('max_batch_size', 1),
            track_requests=True
        )

    # Launch inference workers (assuming single uvicorn worker for now)
    _MANAGER, _WORKER_PROCESSES = server.launch_inference_worker(num_uvicorn_servers=1)

    # Get the underlying FastAPI app from the LitServer instance
    app = server.app

    # Add CORS middleware
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
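    # Note: allow_origins=["*"] combined with allow_credentials=True is the
    # most permissive CORS setup; a production deployment would typically
    # pin allow_origins to an explicit list.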

    # Add routes with configured prefix
    api_prefix = config.get('llm_server', {}).get('api_prefix', '/api/v1')
    app.include_router(router, prefix=api_prefix)

    # Set the response queue ID for the app
    app.response_queue_id = 0  # Since we're using a single worker

    return app

# Create the app instance for uvicorn
app = create_app()
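# The module-level `app` is what an ASGI server imports, e.g. (the module
# path is an assumption; it depends on the package layout):
#   uvicorn mypackage.main:app --host 0.0.0.0 --port 8000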

if __name__ == "__main__":
    # Run the app with uvicorn
    import uvicorn
    host = config["server"]["host"]
    port = config["server"]["port"]
    uvicorn.run(
        app,
        host=host,
        port=port,
        log_level=config["logging"]["level"].lower()
    )
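
# Because this module uses relative imports, run it as a module rather than
# as a script, e.g. (package name is an assumption):
#   python -m mypackage.main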