Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
sachin
committed on
Commit
·
d441356
1
Parent(s):
8047b25
test
Browse files- src/server/main.py +88 -0
src/server/main.py
CHANGED
@@ -710,6 +710,94 @@ async def translate(
|
|
710 |
logger.error(f"Invalid JSON response: {str(e)}")
|
711 |
raise HTTPException(status_code=500, detail="Invalid response format from translation service")
|
712 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
713 |
@app.post("/v1/visual_query",
|
714 |
response_model=VisualQueryResponse,
|
715 |
summary="Visual Query with Image",
|
|
|
710 |
logger.error(f"Invalid JSON response: {str(e)}")
|
711 |
raise HTTPException(status_code=500, detail="Invalid response format from translation service")
|
712 |
|
713 |
+
class PDFTextExtractionResponse(BaseModel):
    # Response body used as the `response_model` of the PDF text-extraction
    # endpoint. Documented with comments rather than a docstring so the
    # generated schema description is left untouched.

    # Required field holding the text extracted from the requested page.
    page_content: str = Field(
        ...,
        description="Extracted text from the specified PDF page",
    )

    class Config:
        # Sample payload attached to the model's schema through `schema_extra`.
        schema_extra = dict(
            example=dict(
                page_content="Google Interview Preparation Guide\nCustomer Engineer Specialist\n\nOur hiring process\n...",
            ),
        )
|
722 |
+
|
723 |
+
@app.post("/v1/extract-text/",
          response_model=PDFTextExtractionResponse,
          summary="Extract Text from PDF",
          description="Extract text from a specified page of an encrypted PDF file by calling an external API. Rate limited to 100 requests per minute per user. Requires authentication and X-Session-Key header.",
          tags=["PDF"],
          responses={
              200: {"description": "Extracted text", "model": PDFTextExtractionResponse},
              400: {"description": "Invalid encrypted PDF or page number"},
              401: {"description": "Unauthorized - Token required"},
              429: {"description": "Rate limit exceeded"},
              500: {"description": "External API error"},
              504: {"description": "External API timeout"}
          })
@limiter.limit(settings.chat_rate_limit)
async def extract_text(
    request: Request,
    file: UploadFile = File(..., description="Encrypted PDF file to extract text from"),
    page_number: int = Query(1, description="Page number to extract text from (1-based indexing)"),
    credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme),
    x_session_key: str = Header(..., alias="X-Session-Key")
):
    """Extract the text of one page from an uploaded, encrypted PDF.

    The upload is decrypted with the key carried in the ``X-Session-Key``
    header (base64-encoded), forwarded to an external extraction service,
    and the service's ``page_content`` value is returned with surrounding
    whitespace stripped.

    Args:
        request: Incoming request; used for the client address in logs and
            by the rate limiter decorator.
        file: Uploaded (encrypted) PDF.
        page_number: 1-based page to extract; values below 1 are rejected.
        credentials: Bearer credentials passed to ``get_current_user``.
        x_session_key: Base64-encoded key handed to ``decrypt_data``.

    Returns:
        PDFTextExtractionResponse: the extracted text, or an empty string
        when the external service returns no ``page_content``.

    Raises:
        HTTPException: 400 for an undecodable session key, a page number
            below 1, or a payload that fails decryption; 500 for an
            external API failure or an unusable response; 504 when the
            external API times out. ``get_current_user`` and the rate
            limiter may raise their own errors (presumably 401 / 429 --
            not visible from this block).
    """
    # Function-scope imports: only part of the file is visible here, so the
    # module-level import block is left untouched.
    import asyncio
    import functools

    user_id = await get_current_user(credentials)

    # A malformed header is a client error, not a server fault.
    # binascii.Error (raised on bad padding) is a ValueError subclass.
    try:
        session_key = base64.b64decode(x_session_key)
    except ValueError as e:
        logger.error(f"Invalid X-Session-Key header: {str(e)}")
        raise HTTPException(status_code=400, detail="Invalid session key") from e

    # Validate page number
    if page_number < 1:
        raise HTTPException(status_code=400, detail="Page number must be at least 1")

    # Decrypt PDF content
    try:
        encrypted_content = await file.read()
        decrypted_content = decrypt_data(encrypted_content, session_key)
    except Exception as e:
        logger.error(f"PDF decryption failed: {str(e)}")
        raise HTTPException(status_code=400, detail="Invalid encrypted PDF") from e

    # NOTE: the key must not be "filename" -- that name is a built-in
    # LogRecord attribute and stdlib logging raises KeyError when `extra`
    # tries to overwrite it.
    logger.info("Processing PDF text extraction request", extra={
        "endpoint": "/v1/extract-text",
        "pdf_filename": file.filename,
        "page_number": page_number,
        "client_ip": get_remote_address(request),
        "user_id": user_id
    })

    start_time = time()

    # Call external API
    # NOTE(review): the upstream address is hard-coded and uses plain HTTP,
    # so the decrypted PDF leaves this service unencrypted -- consider moving
    # the URL into settings and using TLS.
    external_url = f"http://144.24.122.208:7860/extract-text/?page_number={page_number}"
    files = {"file": (file.filename, decrypted_content, file.content_type)}
    post_upstream = functools.partial(
        requests.post,
        external_url,
        files=files,
        headers={"accept": "application/json"},
        timeout=60,
    )

    try:
        # `requests` is synchronous; run it in the default executor so a slow
        # upstream (up to the 60 s timeout) does not stall the event loop.
        response = await asyncio.get_running_loop().run_in_executor(None, post_upstream)
        response.raise_for_status()
    except requests.Timeout as e:
        logger.error("External PDF extraction API timed out")
        raise HTTPException(status_code=504, detail="External API timeout") from e
    except requests.RequestException as e:
        # Log the details but keep them out of the client-facing message:
        # the exception text can contain the upstream host and URL.
        logger.error(f"External PDF extraction API error: {str(e)}")
        raise HTTPException(status_code=500, detail="External API error") from e

    # Parsed in its own try block: newer `requests` raises a JSON error that
    # is also a RequestException, which would otherwise be reported by the
    # generic handler above instead of as an invalid response format.
    try:
        response_data = response.json()
    except ValueError as e:
        logger.error(f"Invalid JSON response from external API: {str(e)}")
        raise HTTPException(status_code=500, detail="Invalid response format from external API") from e

    if not isinstance(response_data, dict):
        logger.error(f"Invalid JSON response from external API: expected object, got {type(response_data).__name__}")
        raise HTTPException(status_code=500, detail="Invalid response format from external API")

    extracted_text = response_data.get("page_content", "")
    if not extracted_text:
        logger.warning("No page_content found in external API response")
        extracted_text = ""
    elif not isinstance(extracted_text, str):
        logger.error(f"Invalid JSON response from external API: page_content is {type(extracted_text).__name__}")
        raise HTTPException(status_code=500, detail="Invalid response format from external API")

    logger.info(f"PDF text extraction completed in {time() - start_time:.2f} seconds")
    return PDFTextExtractionResponse(page_content=extracted_text.strip())
|
799 |
+
|
800 |
+
|
801 |
@app.post("/v1/visual_query",
|
802 |
response_model=VisualQueryResponse,
|
803 |
summary="Visual Query with Image",
|