Spaces · Runtime error
Commit 47031d7 · 1 parent: a717933
FIRST

Files changed:
- .gitignore +50 -0
- .idea/.gitignore +3 -0
- .idea/Inference-API.iml +9 -0
- .idea/misc.xml +9 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- Dockerfile +24 -0
- app/__init__.py +0 -0
- app/api.py +159 -0
- app/config.yaml +11 -0
- app/main.py +56 -0
- app/routes.py +123 -0
- app/schemas.py +29 -0
- requirements.txt +22 -0
.gitignore ADDED
@@ -0,0 +1,50 @@
+# Virtual Environment
+myenv/
+venv/
+ENV/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# IDEs and editors
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+.project
+.settings/
+.classpath
+
+# Logs and databases
+*.log
+*.sqlite
+*.db
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
.idea/Inference-API.iml ADDED
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
.idea/misc.xml ADDED
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.13 (Inference-API)" />
+  </component>
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="Python 3.13 (Inference-API)" project-jdk-type="Python SDK">
+    <output url="file://$PROJECT_DIR$/out" />
+  </component>
+</project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Inference-API.iml" filepath="$PROJECT_DIR$/.idea/Inference-API.iml" />
+    </modules>
+  </component>
+</project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
Dockerfile ADDED
@@ -0,0 +1,24 @@
+# Use Python 3.12 slim image as base
+FROM python:3.12-slim
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application code into app/ so "python -m app.main" resolves
+COPY app/ ./app/
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+
+# Expose the port the application listens on (8001, per app/config.yaml)
+EXPOSE 8001
+
+# Command to run the application
+CMD ["python", "-m", "app.main"]
app/__init__.py ADDED
(empty file)
app/api.py ADDED
@@ -0,0 +1,159 @@
+import httpx
+from typing import Optional, AsyncIterator, List, Dict, Union
+import logging
+
+class InferenceApi:
+    def __init__(self, config: dict):
+        """Initialize the Inference API with configuration."""
+        self.logger = logging.getLogger(__name__)
+        self.logger.info("Initializing Inference API")
+
+        # Get base URL from config
+        self.base_url = config["llm_server"]["base_url"]
+        self.timeout = config["llm_server"].get("timeout", 60)
+
+        # Initialize HTTP client
+        self.client = httpx.AsyncClient(
+            base_url=self.base_url,
+            timeout=self.timeout
+        )
+
+        self.logger.info("Inference API initialized successfully")
+
+    async def generate_response(
+        self,
+        prompt: str,
+        system_message: Optional[str] = None,
+        max_new_tokens: Optional[int] = None
+    ) -> str:
+        """
+        Generate a complete response by forwarding the request to the LLM Server.
+        """
+        self.logger.debug(f"Forwarding generation request for prompt: {prompt[:50]}...")
+
+        try:
+            response = await self.client.post(
+                "/api/v1/generate",
+                json={
+                    "prompt": prompt,
+                    "system_message": system_message,
+                    "max_new_tokens": max_new_tokens
+                }
+            )
+            response.raise_for_status()
+            data = response.json()
+            return data["generated_text"]
+
+        except Exception as e:
+            self.logger.error(f"Error in generate_response: {str(e)}")
+            raise
+
+    async def generate_stream(
+        self,
+        prompt: str,
+        system_message: Optional[str] = None,
+        max_new_tokens: Optional[int] = None
+    ) -> AsyncIterator[str]:
+        """
+        Generate a streaming response by forwarding the request to the LLM Server.
+        """
+        self.logger.debug(f"Forwarding streaming request for prompt: {prompt[:50]}...")
+
+        try:
+            async with self.client.stream(
+                "POST",
+                "/api/v1/generate/stream",
+                json={
+                    "prompt": prompt,
+                    "system_message": system_message,
+                    "max_new_tokens": max_new_tokens
+                }
+            ) as response:
+                response.raise_for_status()
+                async for chunk in response.aiter_text():
+                    yield chunk
+
+        except Exception as e:
+            self.logger.error(f"Error in generate_stream: {str(e)}")
+            raise
+
+    async def generate_embedding(self, text: str) -> List[float]:
+        """
+        Generate an embedding by forwarding the request to the LLM Server.
+        """
+        self.logger.debug(f"Forwarding embedding request for text: {text[:50]}...")
+
+        try:
+            response = await self.client.post(
+                "/api/v1/embedding",
+                json={"text": text}
+            )
+            response.raise_for_status()
+            data = response.json()
+            return data["embedding"]
+
+        except Exception as e:
+            self.logger.error(f"Error in generate_embedding: {str(e)}")
+            raise
+
+    async def check_system_status(self) -> Dict[str, Union[Dict, str]]:
+        """
+        Get system status from the LLM Server.
+        """
+        try:
+            response = await self.client.get("/api/v1/system/status")
+            response.raise_for_status()
+            return response.json()
+
+        except Exception as e:
+            self.logger.error(f"Error getting system status: {str(e)}")
+            raise
+
+    async def validate_system(self) -> Dict[str, Union[Dict, str, List[str]]]:
+        """
+        Get system validation status from the LLM Server.
+        """
+        try:
+            response = await self.client.get("/api/v1/system/validate")
+            response.raise_for_status()
+            return response.json()
+
+        except Exception as e:
+            self.logger.error(f"Error validating system: {str(e)}")
+            raise
+
+    async def initialize_model(self, model_name: Optional[str] = None) -> Dict[str, str]:
+        """
+        Initialize a model on the LLM Server.
+        """
+        try:
+            response = await self.client.post(
+                "/api/v1/model/initialize",
+                params={"model_name": model_name} if model_name else None
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except Exception as e:
+            self.logger.error(f"Error initializing model: {str(e)}")
+            raise
+
+    async def initialize_embedding_model(self, model_name: Optional[str] = None) -> Dict[str, str]:
+        """
+        Initialize an embedding model on the LLM Server.
+        """
+        try:
+            response = await self.client.post(
+                "/api/v1/model/initialize/embedding",
+                params={"model_name": model_name} if model_name else None
+            )
+            response.raise_for_status()
+            return response.json()
+
+        except Exception as e:
+            self.logger.error(f"Error initializing embedding model: {str(e)}")
+            raise
+
+    async def close(self):
+        """Close the HTTP client session."""
+        await self.client.aclose()
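For orientation, here is a minimal sketch of how InferenceApi might be exercised on its own, assuming an LLM Server is reachable at the base_url you supply (the config dict and URL below are illustrative, not part of this commit):

import asyncio
from app.api import InferenceApi

async def demo():
    # Illustrative config; point base_url at your own LLM Server.
    config = {"llm_server": {"base_url": "http://localhost:8000", "timeout": 60}}
    api = InferenceApi(config)
    try:
        text = await api.generate_response(prompt="Hello!", max_new_tokens=32)
        print(text)
        # Consume the streaming variant chunk by chunk
        async for chunk in api.generate_stream(prompt="Count to five."):
            print(chunk, end="", flush=True)
    finally:
        await api.close()

asyncio.run(demo())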
app/config.yaml ADDED
@@ -0,0 +1,11 @@
+server:
+  port: 8001
+  timeout: 60
+
+llm_server:
+  base_url: "https://teamgenki-llmserver.hf.space:7680"  # URL of your LLM Server
+  timeout: 60  # Timeout for requests to LLM Server
+
+logging:
+  level: "INFO"
+  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
app/main.py ADDED
@@ -0,0 +1,56 @@
+"""
+LLM Inference Server main application using LitServe framework.
+"""
+import litserve as ls
+import yaml
+import logging
+from pathlib import Path
+from .routes import router, init_router
+
+def setup_logging():
+    """Set up basic logging configuration"""
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    return logging.getLogger(__name__)
+
+def load_config():
+    """Load configuration from config.yaml"""
+    config_path = Path(__file__).parent / "config.yaml"
+    with open(config_path) as f:
+        return yaml.safe_load(f)
+
+def main():
+    """Main function to set up and run the inference server."""
+    logger = setup_logging()
+
+    try:
+        # Load configuration
+        config = load_config()
+
+        # Initialize the router with our config
+        init_router(config)
+
+        # Create LitServer instance
+        server = ls.LitServer(
+            timeout=config.get("server", {}).get("timeout", 60),
+            max_batch_size=1,
+            track_requests=True
+        )
+
+        # Add our routes to the server's FastAPI app
+        server.app.include_router(router, prefix="/api/v1")
+
+        # Get port from config or use default
+        port = config.get("server", {}).get("port", 8001)
+
+        logger.info(f"Starting server on port {port}")
+        server.run(port=port)
+
+    except Exception as e:
+        logger.error(f"Server initialization failed: {str(e)}")
+        raise
+
+if __name__ == "__main__":
+    main()
app/routes.py ADDED
@@ -0,0 +1,123 @@
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import StreamingResponse
+from typing import Optional
+from .api import InferenceApi
+from .schemas import (
+    GenerateRequest,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    SystemStatusResponse,
+    ValidationResponse
+)
+import logging
+
+router = APIRouter()
+logger = logging.getLogger(__name__)
+api = None
+
+def init_router(config: dict):
+    """Initialize router with config and Inference API instance"""
+    global api
+    api = InferenceApi(config)
+    logger.info("Router initialized with Inference API instance")
+
+@router.post("/generate")
+async def generate_text(request: GenerateRequest):
+    """Generate text response from prompt"""
+    logger.info(f"Received generation request for prompt: {request.prompt[:50]}...")
+    try:
+        response = await api.generate_response(
+            prompt=request.prompt,
+            system_message=request.system_message,
+            max_new_tokens=request.max_new_tokens
+        )
+        logger.info("Successfully generated response")
+        return {"generated_text": response}
+    except Exception as e:
+        logger.error(f"Error in generate_text endpoint: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@router.post("/generate/stream")
+async def generate_stream(request: GenerateRequest):
+    """Generate streaming text response from prompt"""
+    logger.info(f"Received streaming generation request for prompt: {request.prompt[:50]}...")
+    try:
+        # Wrap the async generator in a StreamingResponse so chunks are
+        # streamed to the client instead of being serialized as a whole.
+        return StreamingResponse(
+            api.generate_stream(
+                prompt=request.prompt,
+                system_message=request.system_message,
+                max_new_tokens=request.max_new_tokens
+            ),
+            media_type="text/plain"
+        )
+    except Exception as e:
+        logger.error(f"Error in generate_stream endpoint: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@router.post("/embedding", response_model=EmbeddingResponse)
+async def generate_embedding(request: EmbeddingRequest):
+    """Generate embedding vector from text"""
+    logger.info(f"Received embedding request for text: {request.text[:50]}...")
+    try:
+        embedding = await api.generate_embedding(request.text)
+        logger.info(f"Successfully generated embedding of dimension {len(embedding)}")
+        return EmbeddingResponse(
+            embedding=embedding,
+            dimension=len(embedding)
+        )
+    except Exception as e:
+        logger.error(f"Error in generate_embedding endpoint: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@router.get("/system/status",
+            response_model=SystemStatusResponse,
+            summary="Check System Status",
+            description="Returns comprehensive system status including CPU, Memory, GPU, Storage, and Model information")
+async def check_system():
+    """Get system status from LLM Server"""
+    try:
+        return await api.check_system_status()
+    except Exception as e:
+        logger.error(f"Error checking system status: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@router.get("/system/validate",
+            response_model=ValidationResponse,
+            summary="Validate System Configuration",
+            description="Validates system configuration, folders, and model setup")
+async def validate_system():
+    """Get system validation status from LLM Server"""
+    try:
+        return await api.validate_system()
+    except Exception as e:
+        logger.error(f"Error validating system: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@router.post("/model/initialize",
+             summary="Initialize default or specified model",
+             description="Initialize model for use. Uses default model from config if none specified.")
+async def initialize_model(model_name: Optional[str] = None):
+    """Initialize a model for use"""
+    try:
+        return await api.initialize_model(model_name)
+    except Exception as e:
+        logger.error(f"Error initializing model: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@router.post("/model/initialize/embedding",
+             summary="Initialize embedding model",
+             description="Initialize a separate model specifically for generating embeddings")
+async def initialize_embedding_model(model_name: Optional[str] = None):
+    """Initialize a model specifically for embeddings"""
+    try:
+        return await api.initialize_embedding_model(model_name)
+    except Exception as e:
+        logger.error(f"Error initializing embedding model: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@router.on_event("shutdown")
+async def shutdown_event():
+    """Clean up resources on shutdown"""
+    if api:
+        await api.close()
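Once the server is running, these routes can be exercised with a plain HTTP client. A minimal sketch, assuming the service listens on localhost:8001 (the port configured in app/config.yaml) under the /api/v1 prefix added in app/main.py:

import httpx

BASE = "http://localhost:8001/api/v1"  # assumed host/port

with httpx.Client(base_url=BASE, timeout=60) as client:
    # Non-streaming generation
    r = client.post("/generate", json={"prompt": "Hello", "max_new_tokens": 32})
    r.raise_for_status()
    print(r.json()["generated_text"])

    # Streaming generation: iterate over text chunks as they arrive
    with client.stream("POST", "/generate/stream", json={"prompt": "Hello"}) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_text():
            print(chunk, end="")

    # Embedding
    r = client.post("/embedding", json={"text": "Hello"})
    r.raise_for_status()
    print(r.json()["dimension"])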
app/schemas.py ADDED
@@ -0,0 +1,29 @@
+from pydantic import BaseModel
+from typing import Optional, List, Dict, Union
+
+class GenerateRequest(BaseModel):
+    prompt: str
+    system_message: Optional[str] = None
+    max_new_tokens: Optional[int] = None
+
+class EmbeddingRequest(BaseModel):
+    text: str
+
+class EmbeddingResponse(BaseModel):
+    embedding: List[float]
+    dimension: int
+
+class SystemStatusResponse(BaseModel):
+    """Pydantic model for system status response"""
+    cpu: Optional[Dict[str, Union[float, str]]] = None
+    memory: Optional[Dict[str, Union[float, str]]] = None
+    gpu: Optional[Dict[str, Union[bool, str, float]]] = None
+    storage: Optional[Dict[str, str]] = None
+    model: Optional[Dict[str, Union[bool, str]]] = None
+
+class ValidationResponse(BaseModel):
+    config_validation: Dict[str, bool]
+    model_validation: Dict[str, bool]
+    folder_validation: Dict[str, bool]
+    overall_status: str
+    issues: List[str]
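To illustrate the request and response contracts these models enforce, a quick sketch (the values are made up):

from app.schemas import GenerateRequest, EmbeddingResponse

# Optional fields default to None when omitted
req = GenerateRequest(prompt="Summarize this.", max_new_tokens=64)
print(req.model_dump())
# -> {'prompt': 'Summarize this.', 'system_message': None, 'max_new_tokens': 64}

resp = EmbeddingResponse(embedding=[0.1, 0.2, 0.3], dimension=3)
print(resp.dimension)  # 3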
requirements.txt ADDED
@@ -0,0 +1,22 @@
+annotated-types==0.7.0
+anyio==4.8.0
+certifi==2024.12.14
+click==8.1.8
+fastapi==0.115.6
+h11==0.14.0
+httpcore==1.0.7
+httptools==0.6.4
+httpx==0.28.1
+idna==3.10
+litserve==0.2.5
+pydantic==2.10.4
+pydantic_core==2.27.2
+python-dotenv==1.0.1
+PyYAML==6.0.2
+sniffio==1.3.1
+starlette==0.41.3
+typing_extensions==4.12.2
+uvicorn==0.34.0
+uvloop==0.21.0
+watchfiles==1.0.3
+websockets==14.1