sachin committed on
Commit
d441356
·
1 Parent(s): 8047b25
Files changed (1) hide show
  1. src/server/main.py +88 -0
src/server/main.py CHANGED
@@ -710,6 +710,94 @@ async def translate(
710
  logger.error(f"Invalid JSON response: {str(e)}")
711
  raise HTTPException(status_code=500, detail="Invalid response format from translation service")
712
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
  @app.post("/v1/visual_query",
714
  response_model=VisualQueryResponse,
715
  summary="Visual Query with Image",
 
710
  logger.error(f"Invalid JSON response: {str(e)}")
711
  raise HTTPException(status_code=500, detail="Invalid response format from translation service")
712
 
713
class PDFTextExtractionResponse(BaseModel):
    # Response body for the PDF text-extraction endpoint; it carries a single
    # required string field holding the text of the requested page.
    page_content: str = Field(
        ...,
        description="Extracted text from the specified PDF page",
    )

    class Config:
        # Sample payload attached to the model's schema through `schema_extra`.
        schema_extra = {
            "example": {
                "page_content": "Google Interview Preparation Guide\nCustomer Engineer Specialist\n\nOur hiring process\n...",
            },
        }
722
+
723
@app.post("/v1/extract-text/",
          response_model=PDFTextExtractionResponse,
          summary="Extract Text from PDF",
          description="Extract text from a specified page of an encrypted PDF file by calling an external API. Rate limited to 100 requests per minute per user. Requires authentication and X-Session-Key header.",
          tags=["PDF"],
          responses={
              200: {"description": "Extracted text", "model": PDFTextExtractionResponse},
              400: {"description": "Invalid encrypted PDF or page number"},
              401: {"description": "Unauthorized - Token required"},
              429: {"description": "Rate limit exceeded"},
              500: {"description": "External API error"},
              504: {"description": "External API timeout"}
          })
@limiter.limit(settings.chat_rate_limit)
async def extract_text(
    request: Request,
    file: UploadFile = File(..., description="Encrypted PDF file to extract text from"),
    page_number: int = Query(1, description="Page number to extract text from (1-based indexing)"),
    credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme),
    x_session_key: str = Header(..., alias="X-Session-Key")
):
    """Decrypt an uploaded PDF and return the text of one page.

    The uploaded bytes are decrypted with the base64-decoded ``X-Session-Key``
    header, forwarded to the external extraction service, and the
    ``page_content`` value of its JSON reply is returned (stripped).

    Args:
        request: Incoming request; used by the rate limiter and for the
            client address in the log record.
        file: Encrypted PDF upload.
        page_number: 1-based page to extract; values below 1 are rejected.
        credentials: Bearer credentials resolved to a user id.
        x_session_key: Base64-encoded key passed to ``decrypt_data``.

    Returns:
        PDFTextExtractionResponse with the extracted (possibly empty) text.

    Raises:
        HTTPException: 400 for a malformed session key, a page number below 1
            or a failed decryption; 504 when the external service times out;
            500 for any other external-service or response-format failure.
    """
    # Function-scope imports keep this block self-contained.
    import asyncio
    import functools

    user_id = await get_current_user(credentials)

    # A malformed header is a client error, not a server fault.
    # binascii.Error is a ValueError subclass, so one clause covers both.
    try:
        session_key = base64.b64decode(x_session_key)
    except ValueError as e:
        logger.error(f"Invalid X-Session-Key header: {str(e)}")
        raise HTTPException(status_code=400, detail="Invalid session key") from e

    # Validate page number
    if page_number < 1:
        raise HTTPException(status_code=400, detail="Page number must be at least 1")

    # Decrypt PDF content
    try:
        encrypted_content = await file.read()
        decrypted_content = decrypt_data(encrypted_content, session_key)
    except Exception as e:
        logger.error(f"PDF decryption failed: {str(e)}")
        raise HTTPException(status_code=400, detail="Invalid encrypted PDF") from e

    # "filename" is a reserved LogRecord attribute; passing it through `extra`
    # makes the logging module raise KeyError, so a distinct key is used.
    logger.info("Processing PDF text extraction request", extra={
        "endpoint": "/v1/extract-text",
        "pdf_filename": file.filename,
        "page_number": page_number,
        "client_ip": get_remote_address(request),
        "user_id": user_id
    })

    start_time = time()

    # NOTE(review): the decrypted document is sent over plain HTTP to a
    # hard-coded address; consider moving this URL to configuration and TLS.
    external_url = "http://144.24.122.208:7860/extract-text/"
    files = {"file": (file.filename, decrypted_content, file.content_type)}
    send_request = functools.partial(
        requests.post,
        external_url,
        params={"page_number": page_number},
        files=files,
        headers={"accept": "application/json"},
        timeout=60,
    )

    try:
        # requests is blocking; run it in the default executor so the event
        # loop keeps serving other requests while waiting on the upstream.
        response = await asyncio.get_running_loop().run_in_executor(None, send_request)
        response.raise_for_status()
    except requests.Timeout as e:
        logger.error("External PDF extraction API timed out")
        raise HTTPException(status_code=504, detail="External API timeout") from e
    except requests.RequestException as e:
        # The exception text contains the upstream URL; log it, but do not
        # echo it back to the client.
        logger.error(f"External PDF extraction API error: {str(e)}")
        raise HTTPException(status_code=500, detail="External API error") from e

    # Parsed separately from the network call: recent requests versions raise
    # a JSON error that is also a RequestException, which would otherwise be
    # reported as a transport failure.
    try:
        response_data = response.json()
    except ValueError as e:
        logger.error(f"Invalid JSON response from external API: {str(e)}")
        raise HTTPException(status_code=500, detail="Invalid response format from external API") from e

    if not isinstance(response_data, dict):
        logger.error("Invalid JSON response from external API: expected a JSON object")
        raise HTTPException(status_code=500, detail="Invalid response format from external API")

    extracted_text = response_data.get("page_content", "")
    if not extracted_text:
        logger.warning("No page_content found in external API response")
        extracted_text = ""
    elif not isinstance(extracted_text, str):
        logger.error("Invalid JSON response from external API: page_content is not a string")
        raise HTTPException(status_code=500, detail="Invalid response format from external API")

    logger.info(f"PDF text extraction completed in {time() - start_time:.2f} seconds")
    return PDFTextExtractionResponse(page_content=extracted_text.strip())
799
+
800
+
801
  @app.post("/v1/visual_query",
802
  response_model=VisualQueryResponse,
803
  summary="Visual Query with Image",