AurelioAguirre committed
Commit eb5a3fb · 1 Parent(s): ecd2385

Adding query expansion and reranker

main/api.py CHANGED
@@ -1,10 +1,13 @@
+import json
+from pathlib import Path
+
 import httpx
-from typing import Optional, AsyncIterator, Dict, Any, Iterator, List
+from typing import Optional, AsyncIterator, Dict, Any, Iterator, List, Callable
 import logging
 import asyncio
-import os
 from litserve import LitAPI
 from pydantic import BaseModel
+from .utils import extract_json


 class GenerationResponse(BaseModel):
@@ -136,6 +139,95 @@ class InferenceApi(LitAPI):
             self.logger.error(f"Error in generate_response: {str(e)}")
             raise

+    async def structured_llm_query(
+            self,
+            template_name: str,
+            input_text: str,
+            additional_context: Optional[Dict[str, Any]] = None,
+            pre_hooks: Optional[List[Callable]] = None,
+            post_hooks: Optional[List[Callable]] = None
+    ) -> Dict[str, Any]:
+        """Execute a structured LLM query using a template."""
+        template_path = Path(__file__).parent / "prompt_templates" / f"{template_name}.json"
+
+        try:
+            # Load and parse template
+            with open(template_path) as f:
+                template = json.load(f)
+
+            # Apply pre-processing hooks
+            processed_input = input_text
+            if pre_hooks:
+                for hook in pre_hooks:
+                    processed_input = hook(processed_input)
+
+            # Format the prompt with the context
+            context = {"input_text": processed_input}
+            if additional_context:
+                context.update(additional_context)
+
+            prompt = template["prompt_template"].format(**context)
+
+            # Make the request to the LLM
+            response = await self._make_request(
+                "POST",
+                "generate",
+                json={
+                    "prompt": prompt,
+                    "system_message": template.get("system_message"),
+                    "max_new_tokens": 1000
+                }
+            )
+
+            # Extract JSON from response
+            data = response.json()
+            result = extract_json(data["generated_text"])
+
+            # Apply any additional post-processing hooks
+            if post_hooks:
+                for hook in post_hooks:
+                    result = hook(result)
+
+            return result
+
+        except FileNotFoundError:
+            raise ValueError(f"Template {template_name} not found")
+        except Exception as e:
+            self.logger.error(f"Error in structured_llm_query: {str(e)}")
+            raise
+
+    async def expand_query(
+            self,
+            query: str,
+            system_message: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Expand a query for RAG processing."""
+        return await self.structured_llm_query(
+            template_name="query_expansion",
+            input_text=query,
+            additional_context={"system_message": system_message} if system_message else None
+        )
+
+    async def rerank_chunks(
+            self,
+            query: str,
+            chunks: List[str],
+            system_message: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Rerank text chunks based on their relevance to the query."""
+        # Format chunks as numbered list for better LLM processing
+        formatted_chunks = "\n".join(f"{i+1}. {chunk}" for i, chunk in enumerate(chunks))
+
+        return await self.structured_llm_query(
+            template_name="chunk_rerank",
+            input_text=query,
+            additional_context={
+                "chunks": formatted_chunks,
+                "system_message": system_message
+            }
+        )
+
+
     async def generate_stream(
         self,
         prompt: str,
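
For reference, a minimal sketch of how the two new helpers might be exercised from async code; the InferenceApi instance and its _make_request() wiring are assumed to be set up as in the rest of the repo, and the demo() driver below is purely illustrative:

# Illustrative driver only: assumes an initialized InferenceApi whose
# _make_request() can reach the LLM backend.
# Run with: asyncio.run(demo(api)) once an api instance exists.
async def demo(api):
    expansion = await api.expand_query("What is quantum entanglement?")
    print(expansion["expanded_query"])
    print(expansion["search_terms"])

    ranking = await api.rerank_chunks(
        query="What is quantum entanglement?",
        chunks=[
            "Entanglement links the states of two particles.",
            "The weather today is sunny.",
        ],
    )
    print(ranking["ranked_chunks"], ranking["got_chunks"])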
main/prompt_templates/chunk_rerank.json ADDED
@@ -0,0 +1,39 @@
+{
+  "name": "chunk_rerank",
+  "description": "Evaluate and rank text chunks based on their relevance to a query",
+  "system_message": "You are a helpful assistant that evaluates text chunks for their relevance to a query. You always respond in valid JSON format.",
+  "prompt_template": "Please analyze the following query and text chunks, ranking the chunks by their relevance and importance to answering the query. Prioritize chunks that contain specific, relevant information over general statements.\n\nQuery: {input_text}\n\nText Chunks to evaluate:\n{chunks}\n\nCreate a JSON response with the following fields:\n- original_query: the exact query\n- ranked_chunks: array of the top 5 most relevant chunks, ordered by importance (most important first)\n- got_chunks: set to false if no chunks were provided or if they're all irrelevant\n\nEnsure your response is valid JSON and contains only these fields.",
+  "response_schema": {
+    "type": "object",
+    "properties": {
+      "original_query": {
+        "type": "string",
+        "description": "The exact query being processed"
+      },
+      "ranked_chunks": {
+        "type": "array",
+        "items": {
+          "type": "string"
+        },
+        "maxItems": 5,
+        "description": "Top 5 most relevant chunks in order of importance"
+      },
+      "got_chunks": {
+        "type": "boolean",
+        "description": "Whether any relevant chunks were found"
+      }
+    },
+    "required": ["original_query", "ranked_chunks", "got_chunks"]
+  },
+  "example_response": {
+    "original_query": "What are the key principles of relativity?",
+    "ranked_chunks": [
+      "Einstein's theory of special relativity is based on two fundamental principles: the principle of relativity and the constancy of the speed of light.",
+      "The principle of relativity states that the laws of physics are the same in all inertial reference frames.",
+      "In special relativity, time dilation occurs when objects move at high speeds relative to one another.",
+      "Mass and energy are equivalent, as expressed in the famous equation E=mc².",
+      "The theory led to revolutionary predictions about space and time, including length contraction."
+    ],
+    "got_chunks": true
+  }
+}
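
The template ships a response_schema, but nothing currently enforces it: extract_json only checks that the reply parses as JSON. If stricter enforcement is wanted, the extracted dict could be validated against the schema. A minimal sketch, assuming the third-party jsonschema package (not currently a project dependency):

import json
from pathlib import Path
from jsonschema import validate, ValidationError  # assumed extra dependency

# Path is relative to the repo root; adjust to your working directory.
template = json.loads(Path("main/prompt_templates/chunk_rerank.json").read_text())
candidate = {"original_query": "q", "ranked_chunks": ["a"], "got_chunks": True}
try:
    validate(instance=candidate, schema=template["response_schema"])
except ValidationError as e:
    print(f"LLM response failed schema check: {e.message}")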
main/prompt_templates/query_expansion.json ADDED
@@ -0,0 +1,25 @@
+{
+  "name": "query_expansion",
+  "description": "Expand a query for RAG processing with additional context and search terms",
+  "system_message": "You are a helpful assistant that creates JSON responses. Always ensure your response is valid JSON.",
+  "prompt_template": "Please analyze this query and create a JSON response with the following fields:\n- original_query: the exact query as provided\n- expanded_query: a more detailed version of the query that might help in getting better answers\n- search_terms: a list of key terms that would be useful for searching related information\n- call_rag: set to false if this query doesn't require searching through external documents (like math problems, coding questions, or general knowledge)\n\nThe query is: \"{input_text}\"\n\nYour response must be valid JSON and contain only these fields. Do not include any other text.",
+  "response_schema": {
+    "type": "object",
+    "properties": {
+      "original_query": {"type": "string"},
+      "expanded_query": {"type": "string"},
+      "search_terms": {
+        "type": "array",
+        "items": {"type": "string"}
+      },
+      "call_rag": {"type": "boolean"}
+    },
+    "required": ["original_query", "expanded_query", "search_terms", "call_rag"]
+  },
+  "example_response": {
+    "original_query": "What is quantum entanglement?",
+    "expanded_query": "Explain quantum entanglement, its significance in quantum mechanics, and how it challenges classical physics",
+    "search_terms": ["quantum entanglement", "quantum mechanics", "EPR paradox", "quantum physics", "spooky action"],
+    "call_rag": true
+  }
+}
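
Worth noting: structured_llm_query renders prompt_template with str.format(**context), and str.format silently ignores surplus keyword arguments. That is why expand_query can pass system_message through additional_context even though this template only references {input_text}. A quick illustration:

import json
from pathlib import Path

# Path is relative to the repo root; adjust to your working directory.
template = json.loads(Path("main/prompt_templates/query_expansion.json").read_text())

# The extra system_message key is simply ignored by str.format.
prompt = template["prompt_template"].format(
    input_text="What is quantum entanglement?",
    system_message="unused by this template",
)
print(prompt.splitlines()[0])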
main/routes.py CHANGED
@@ -12,7 +12,7 @@ from .schemas import (
     SystemStatusResponse,
     ValidationResponse,
     ChatCompletionRequest,
-    ChatCompletionResponse
+    ChatCompletionResponse, QueryExpansionResponse, QueryExpansionRequest, ChunkRerankResponse, ChunkRerankRequest
 )

 router = APIRouter()
@@ -113,6 +113,43 @@ async def create_chat_completion(request: ChatCompletionRequest):
         logger.error(f"Error in chat completion endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))

+@router.post("/expand_query", response_model=QueryExpansionResponse)
+async def expand_query(request: QueryExpansionRequest):
+    """Expand a query for RAG processing"""
+    logger.info(f"Received query expansion request: {request.query[:50]}...")
+    try:
+        result = await api.expand_query(
+            query=request.query,
+            system_message=request.system_message
+        )
+        logger.info("Successfully expanded query")
+        return result
+    except FileNotFoundError as e:
+        logger.error(f"Template file not found: {str(e)}")
+        raise HTTPException(status_code=500, detail="Query expansion template not found")
+    except json.JSONDecodeError as e:
+        logger.error(f"Invalid JSON response from LLM: {str(e)}")
+        raise HTTPException(status_code=500, detail="Invalid response format from LLM")
+    except Exception as e:
+        logger.error(f"Error in expand_query endpoint: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@router.post("/rerank", response_model=ChunkRerankResponse)
+async def rerank_chunks(request: ChunkRerankRequest):
+    """Rerank chunks based on their relevance to the query"""
+    logger.info(f"Received reranking request for query: {request.query[:50]}...")
+    try:
+        result = await api.rerank_chunks(
+            query=request.query,
+            chunks=request.chunks,
+            system_message=request.system_message
+        )
+        logger.info(f"Successfully reranked {len(request.chunks)} chunks")
+        return result
+    except Exception as e:
+        logger.error(f"Error in rerank_chunks endpoint: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
 @router.post("/embedding", response_model=EmbeddingResponse)
 async def generate_embedding(request: EmbeddingRequest):
     """Generate embedding vector from text"""
main/schemas.py CHANGED
@@ -1,7 +1,35 @@
-from pydantic import BaseModel, Field
+import json
+from pathlib import Path
+from pydantic import BaseModel, Field, create_model, ConfigDict
 from typing import List, Optional, Dict, Union
 from time import time

+class QueryExpansionRequest(BaseModel):
+    query: str
+    system_message: Optional[str] = None
+
+# Load the template to create the response model
+template_path = Path(__file__).parent / "prompt_templates" / "query_expansion.json"
+with open(template_path) as f:
+    template = json.load(f)
+
+# Create model configuration with proper typing
+model_config = ConfigDict(
+    json_schema_extra={
+        'example': template['example_response']
+    }
+)
+
+# Create the response model based on the template's schema
+QueryExpansionResponse = create_model(
+    'QueryExpansionResponse',
+    original_query=(str, ...),
+    expanded_query=(str, ...),
+    search_terms=(List[str], ...),
+    call_rag=(bool, ...),
+    model_config=model_config
+)
+
 class ChatMessage(BaseModel):
     role: str
     content: str
@@ -91,4 +119,26 @@ class ValidationResponse(BaseModel):
     model_validation: Dict[str, bool]
     folder_validation: Dict[str, bool]
     overall_status: str
-    issues: List[str]
+    issues: List[str]
+
+class ChunkRerankRequest(BaseModel):
+    query: str
+    chunks: List[str]
+    system_message: Optional[str] = None
+
+# Load example from template
+template_path = Path(__file__).parent / "prompt_templates" / "chunk_rerank.json"
+with open(template_path) as f:
+    template = json.load(f)
+example = template['example_response']
+
+class ChunkRerankResponse(BaseModel):
+    """Response model for chunk reranking, based on template schema"""
+    original_query: str = Field(..., description="The exact query being processed")
+    ranked_chunks: List[str] = Field(..., description="Top 5 most relevant chunks in order of importance", max_items=5)
+    got_chunks: bool = Field(..., description="Whether any relevant chunks were found")
+
+    class Config:
+        json_schema_extra = {
+            "example": example
+        }
main/utils.py ADDED
@@ -0,0 +1,28 @@
+"""Utility functions for the inference API."""
+import json
+import re
+from typing import Dict, Any
+
+def extract_json(text: str) -> Dict[str, Any]:
+    """Extract JSON from text that might contain other content.
+
+    Handles cases like:
+    - Clean JSON: {"key": "value"}
+    - JSON with prefix: Sure! Here's your JSON: {"key": "value"}
+    - JSON with suffix: {"key": "value"} Let me know if you need anything else!
+    """
+    # Python's re module does not support recursive patterns such as (?R),
+    # so instead try to decode a JSON object starting at each opening brace.
+    decoder = json.JSONDecoder()
+
+    # Try each candidate position until we find valid JSON
+    for match in re.finditer(r'\{', text):
+        try:
+            parsed, _ = decoder.raw_decode(text, match.start())
+            if isinstance(parsed, dict):
+                return parsed
+        except json.JSONDecodeError:
+            continue
+
+    # If we couldn't find any valid JSON, raise an error
+    raise ValueError("No valid JSON found in response")
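
A quick check of the extraction behavior on the kinds of strings the docstring describes, including a nested object:

from main.utils import extract_json

messy = 'Sure! Here is your JSON: {"call_rag": true, "search_terms": ["a", "b"]} Hope that helps!'
print(extract_json(messy))
# -> {'call_rag': True, 'search_terms': ['a', 'b']}

nested = 'prefix {"outer": {"inner": 1}} suffix'
print(extract_json(nested))
# -> {'outer': {'inner': 1}}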