Spaces:

slabstech
/

dhwani-internal-api-server

Paused

App Files Files Community

sachin committed 3 days ago

Commit

75bbaa5

1 Parent(s): badf26d

add-ocr

Browse files

Files changed (1) hide show

src/server/main.py +97 -1

src/server/main.py CHANGED Viewed

@@ -9,7 +9,7 @@ from fastapi import Depends, FastAPI, File, HTTPException, Query, Request, Uploa
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
 from PIL import Image
-from pydantic import BaseModel, field_validator
 from pydantic_settings import BaseSettings
 from slowapi import Limiter
 from slowapi.util import get_remote_address
@@ -26,6 +26,10 @@ from starlette.responses import StreamingResponse
 from logging_config import logger
 from tts_config import SPEED, ResponseFormat, config as tts_config
 import torchaudio
 # Device setup
 if torch.cuda.is_available():
@@ -296,6 +300,14 @@ class SynthesizeRequest(BaseModel):
 class KannadaSynthesizeRequest(BaseModel):
     text: str
 # TTS Functions
 def load_audio_from_url(url: str):
     response = requests.get(url)
@@ -762,6 +774,90 @@ async def visual_query(
         logger.error(f"Error processing request: {str(e)}")
         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
 @app.post("/v1/chat_v2", response_model=ChatResponse)
 @limiter.limit(settings.chat_rate_limit)
 async def chat_v2(

 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
 from PIL import Image
+from pydantic import BaseModel, field_validator, Field
 from pydantic_settings import BaseSettings
 from slowapi import Limiter
 from slowapi.util import get_remote_address
 from logging_config import logger
 from tts_config import SPEED, ResponseFormat, config as tts_config
 import torchaudio
+import base64
+from io import BytesIO
+from pypdf import PdfReader
+from olmocr.data.renderpdf import render_pdf_to_base64png
 # Device setup
 if torch.cuda.is_available():
 class KannadaSynthesizeRequest(BaseModel):
     text: str
+class ExtractTextRequest(BaseModel):
+    """Schema for the page selector used by the PDF text-extraction endpoint.
+    Attributes:
+        page_number: 1-based page index; validation rejects values below 1 (``ge=1``).
+            Defaults to the first page.
+    """
+    page_number: int = Field(
+        default=1,
+        description="The page number to extract text from (1-based indexing). Must be a positive integer.",
+        ge=1,
+        # Pydantic v2 deprecates arbitrary extra kwargs such as `example=`;
+        # `examples` is the supported JSON-schema field.
+        examples=[1],
+    )
 # TTS Functions
 def load_audio_from_url(url: str):
     response = requests.get(url)
         logger.error(f"Error processing request: {str(e)}")
         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+@app.post(
+    "/v1/extract-text-visual-query/",
+    response_model=dict,
+    summary="Extract text from a PDF page using visual query",
+    description=(
+        "Extracts text from a specific page of a PDF file by rendering it as an image and processing it with the internal vision query model. "
+        "The query 'describe the image' is used to generate a description of the page content."
+    ),
+    response_description="A JSON object containing the extracted text from the specified page."
+)
+async def extract_text_visual_query(
+    file: UploadFile = File(..., description="The PDF file to process. Must be a valid PDF."),
+    page_number: int = Body(
+        default=1,
+        embed=True,
+        description=ExtractTextRequest.model_fields["page_number"].description,
+        ge=1,
+        # `example=` is the deprecated spelling; `examples` takes a list.
+        examples=[1],
+    )
+):
+    """
+    Extract text from a specific page of a PDF file using the internal vision query model.
+    The upload is written to a temporary file, the requested page is rendered to a
+    base64 PNG (longest side 1024 px), decoded into a PIL image, and passed to
+    ``llm_manager.vision_query`` with the fixed prompt "describe the image".
+    Args:
+        file (UploadFile): The PDF file to process. Only the ``.pdf`` filename suffix is checked.
+        page_number (int): The page number to extract text from (1-based indexing). Defaults to 1.
+    Returns:
+        JSONResponse: A dictionary containing:
+            - page_content: The vision query model's output for the rendered page.
+    Raises:
+        HTTPException: 400 if the upload has no ``.pdf`` filename; 500 if rendering,
+            image decoding, the vision query, or any other step fails.
+    Example:
+        ```json
+        {"page_content": "Here’s a summary of the page in one sentence:\\n\\nThe page displays..."}
+        ```
+    """
+    # Validate file type up front. `filename` may be None for uploads sent without one,
+    # so fall back to an empty string instead of raising AttributeError.
+    if not (file.filename or "").lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are supported.")
+    temp_file_path = None
+    try:
+        # Save the uploaded PDF to a temporary file; the renderer is given a path, not bytes.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            # Record the path before writing so a failed write is still cleaned up.
+            temp_file_path = temp_file.name
+            temp_file.write(await file.read())
+        # Render the specified page to an image
+        try:
+            image_base64 = render_pdf_to_base64png(
+                temp_file_path, page_number, target_longest_image_dim=1024
+            )
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Failed to render PDF page: {str(e)}") from e
+        # Decode base64 image to PIL Image
+        try:
+            image_bytes = base64.b64decode(image_base64)
+            image = Image.open(BytesIO(image_bytes))
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Failed to process image: {str(e)}") from e
+        # Process image with vision query
+        try:
+            page_content = await llm_manager.vision_query(image, "describe the image")
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Vision query processing failed: {str(e)}") from e
+        return JSONResponse(content={"page_content": page_content})
+    except HTTPException:
+        # Propagate deliberate HTTP errors unchanged; re-wrapping them would hide
+        # their status code and detail behind a generic 500.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}") from e
+    finally:
+        # Single cleanup point for both the success and the error path.
+        if temp_file_path is not None:
+            try:
+                os.remove(temp_file_path)
+            except OSError as cleanup_error:
+                # Best effort: a leftover temp file must not mask the real response.
+                logger.warning(f"Failed to remove temporary file {temp_file_path}: {cleanup_error}")
 @app.post("/v1/chat_v2", response_model=ChatResponse)
 @limiter.limit(settings.chat_rate_limit)
 async def chat_v2(