""" PDF processing module for ACRES RAG Platform. Handles PDF file processing, text extraction, and page rendering. """ # utils/pdf_processor.py import os import fitz import logging from typing import Dict, List, Optional from datetime import datetime from slugify import slugify import json from PIL import Image logger = logging.getLogger(__name__) class PDFProcessor: def __init__(self, upload_dir: str = "data/uploads"): """Initialize PDFProcessor with upload directory.""" self.upload_dir = upload_dir os.makedirs(upload_dir, exist_ok=True) self.current_page = 0 def extract_text_from_pdf(self, file_path: str) -> Dict: """Extract text and metadata from a PDF file.""" try: doc = fitz.open(file_path) # Extract text from all pages with page tracking text = "" pages = {} for page_num in range(len(doc)): page_text = doc[page_num].get_text() pages[page_num] = page_text text += page_text + "\n" # Extract metadata metadata = doc.metadata if not metadata.get("title"): metadata["title"] = os.path.basename(file_path) # Create structured document document = { "title": metadata.get("title", ""), "authors": ( metadata.get("author", "").split(";") if metadata.get("author") else [] ), "date": metadata.get("creationDate", ""), "abstract": text[:500] + "..." if len(text) > 500 else text, "full_text": text, "source_file": file_path, "pages": pages, "page_count": len(doc), } doc.close() return document except Exception as e: logger.error(f"Error processing PDF {file_path}: {str(e)}") raise def process_pdfs(self, file_paths: List[str], collection_name: str) -> str: """Process multiple PDF files and store their content.""" processed_docs = [] for file_path in file_paths: try: doc_data = self.extract_text_from_pdf(file_path) processed_docs.append(doc_data) except Exception as e: logger.error(f"Error processing {file_path}: {str(e)}") continue if not processed_docs: raise ValueError("No documents were successfully processed") # Save to JSON file timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_filename = f"{slugify(collection_name)}_{timestamp}_documents.json" output_path = f"data/{output_filename}" with open(output_path, "w", encoding="utf-8") as f: json.dump(processed_docs, f, indent=2, ensure_ascii=False) return output_path def render_page(self, file_path: str, page_num: int) -> Optional[Image.Image]: """Render a specific page from a PDF as an image.""" try: doc = fitz.open(file_path) page = doc[page_num] pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72)) image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) doc.close() return image except Exception as e: logger.error(f"Error rendering page {page_num} from {file_path}: {str(e)}") return None