Spaces: Runtime error
Commit · a4e24d4
Parent(s): 1eab622
Fixing dockerfile v3

Files changed:
- src/api.py   +53 -88
- src/main.py  +3 -2

src/api.py  CHANGED
@@ -1,24 +1,60 @@
 import httpx
-from typing import Optional, Iterator,
+from typing import Optional, Iterator, Union, Any
 import logging
+from litserve import LitAPI
 
-class InferenceApi:
+class InferenceApi(LitAPI):
     def __init__(self, config: dict):
         """Initialize the Inference API with configuration."""
+        super().__init__()
         self.logger = logging.getLogger(__name__)
         self.logger.info("Initializing Inference API")
 
         # Get base URL from config
         self.base_url = config["llm_server"]["base_url"]
         self.timeout = config["llm_server"].get("timeout", 60)
+        self.client = None  # Will be initialized in setup()
 
-        #
+        # Set request timeout from config
+        self.request_timeout = float(self.timeout)
+
+    async def setup(self, device: Optional[str] = None):
+        """Setup method required by LitAPI - initialize HTTP client"""
+        self._device = device  # Store device as required by LitAPI
         self.client = httpx.AsyncClient(
             base_url=self.base_url,
             timeout=self.timeout
         )
+        self.logger.info(f"Inference API setup completed on device: {device}")
 
-
+    async def predict(self, x: str, **kwargs) -> Union[str, Iterator[str]]:
+        """
+        Main prediction method required by LitAPI.
+        If streaming is enabled, yields chunks; otherwise returns complete response.
+        """
+        if self.stream:
+            async for chunk in self.generate_stream(x, **kwargs):
+                yield chunk
+        else:
+            return await self.generate_response(x, **kwargs)
+
+    def decode_request(self, request: Any, **kwargs) -> str:
+        """Convert the request payload to input format."""
+        # For our case, we expect the request to be text
+        if isinstance(request, dict) and "prompt" in request:
+            return request["prompt"]
+        return request
+
+    def encode_response(self, output: Union[str, Iterator[str]], **kwargs) -> Union[str, Iterator[str]]:
+        """Convert the model output to a response payload."""
+        if self.stream:
+            # For streaming, yield each chunk wrapped in a dict
+            async def stream_wrapper():
+                async for chunk in output:
+                    yield {"generated_text": chunk}
+        else:
+            # For non-streaming, return complete response
+            return {"generated_text": output}
 
     async def generate_response(
         self,
@@ -26,9 +62,7 @@ class InferenceApi:
         system_message: Optional[str] = None,
         max_new_tokens: Optional[int] = None
     ) -> str:
-        """
-        Generate a complete response by forwarding the request to the LLM Server.
-        """
+        """Generate a complete response by forwarding the request to the LLM Server."""
         self.logger.debug(f"Forwarding generation request for prompt: {prompt[:50]}...")
 
         try:
@@ -54,9 +88,7 @@
         system_message: Optional[str] = None,
         max_new_tokens: Optional[int] = None
     ) -> Iterator[str]:
-        """
-        Generate a streaming response by forwarding the request to the LLM Server.
-        """
+        """Generate a streaming response by forwarding the request to the LLM Server."""
         self.logger.debug(f"Forwarding streaming request for prompt: {prompt[:50]}...")
 
         try:
@@ -77,83 +109,16 @@
             self.logger.error(f"Error in generate_stream: {str(e)}")
             raise
 
-
-        """
-        Generate embedding by forwarding the request to the LLM Server.
-        """
-        self.logger.debug(f"Forwarding embedding request for text: {text[:50]}...")
+    # ... [rest of the methods remain the same: generate_embedding, check_system_status, etc.]
 
-
-
-
-
-            )
-            response.raise_for_status()
-            data = response.json()
-            return data["embedding"]
-
-        except Exception as e:
-            self.logger.error(f"Error in generate_embedding: {str(e)}")
-            raise
-
-    async def check_system_status(self) -> Dict[str, Union[Dict, str]]:
-        """
-        Get system status from the LLM Server.
-        """
-        try:
-            response = await self.client.get("/api/v1/system/status")
-            response.raise_for_status()
-            return response.json()
-
-        except Exception as e:
-            self.logger.error(f"Error getting system status: {str(e)}")
-            raise
-
-    async def validate_system(self) -> Dict[str, Union[Dict, str, List[str]]]:
-        """
-        Get system validation status from the LLM Server.
-        """
-        try:
-            response = await self.client.get("/api/v1/system/validate")
-            response.raise_for_status()
-            return response.json()
-
-        except Exception as e:
-            self.logger.error(f"Error validating system: {str(e)}")
-            raise
-
-    async def initialize_model(self, model_name: Optional[str] = None) -> Dict[str, str]:
-        """
-        Initialize a model on the LLM Server.
-        """
-        try:
-            response = await self.client.post(
-                "/api/v1/model/initialize",
-                params={"model_name": model_name} if model_name else None
-            )
-            response.raise_for_status()
-            return response.json()
-
-        except Exception as e:
-            self.logger.error(f"Error initializing model: {str(e)}")
-            raise
-
-    async def initialize_embedding_model(self, model_name: Optional[str] = None) -> Dict[str, str]:
-        """
-        Initialize an embedding model on the LLM Server.
-        """
-        try:
-            response = await self.client.post(
-                "/api/v1/model/initialize/embedding",
-                params={"model_name": model_name} if model_name else None
-            )
-            response.raise_for_status()
-            return response.json()
-
-        except Exception as e:
-            self.logger.error(f"Error initializing embedding model: {str(e)}")
-            raise
+    async def cleanup(self):
+        """Cleanup method - close HTTP client"""
+        if self.client:
+            await self.client.aclose()
 
-
-        """
-
+    def log(self, key: str, value: Any):
+        """Override log method to use our logger if queue not set"""
+        if self._logger_queue is None:
+            self.logger.info(f"Log event: {key}={value}")
+        else:
+            super().log(key, value)
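The diff above moves InferenceApi onto LitServe's LitAPI interface, whose hooks the server calls in a fixed order per request: decode_request(), then predict(), then encode_response(). As a rough illustration of how a client would exercise the deployed Space, here is a minimal sketch; the /predict path, port 8000, and the {"prompt": ...} payload shape are assumptions based on LitServe defaults and the decode_request() logic above, not details recorded in this commit.

# Hypothetical client call against the served InferenceApi (illustration only).
# Assumes LitServe's default /predict route on localhost:8000 and a JSON body
# that decode_request() can unpack ({"prompt": ...} -> prompt string).
import httpx

def query_inference_api(prompt: str) -> dict:
    # Server side: decode_request() extracts "prompt", predict() forwards it
    # to the LLM server, and encode_response() wraps the result as
    # {"generated_text": ...}.
    response = httpx.post(
        "http://localhost:8000/predict",
        json={"prompt": prompt},
        timeout=60.0,
    )
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    print(query_inference_api("Hello from the Space!"))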
src/main.py  CHANGED

@@ -6,6 +6,7 @@ import yaml
 import logging
 from pathlib import Path
 from .routes import router, init_router
+from api import InferenceApi
 
 def setup_logging():
     """Set up basic logging configuration"""
@@ -31,9 +32,9 @@ def main():
 
     # Initialize the router with our config
     init_router(config)
-
+    api = InferenceApi()
    # Create LitServer instance
-    server = ls.LitServer(
+    server = ls.LitServer(api,
         timeout=config.get("server", {}).get("timeout", 60),
         max_batch_size=1,
         track_requests=True
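For orientation, here is a sketch of how the pieces touched by this commit would fit together end to end. It is not the committed code: InferenceApi.__init__ as defined in src/api.py takes a config dict, so the sketch passes one explicitly, and the server.port key and config path are assumed for illustration.

# Hypothetical wiring of the LitServer startup (a sketch, not the committed main()).
import litserve as ls
import yaml

from api import InferenceApi  # same import style as the committed src/main.py

def run_server(config_path: str = "config.yaml") -> None:
    # Load the YAML config that supplies llm_server.base_url and timeouts.
    with open(config_path) as f:
        config = yaml.safe_load(f)

    api = InferenceApi(config)  # __init__(config: dict) per src/api.py above
    server = ls.LitServer(
        api,
        timeout=config.get("server", {}).get("timeout", 60),
        max_batch_size=1,
        track_requests=True,
    )
    # server.run() starts the HTTP server; "port" is an assumed config key.
    server.run(port=config.get("server", {}).get("port", 8000))

if __name__ == "__main__":
    run_server()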