# src/utils/document_loader.py
"""Helpers for extracting plain text from .txt, .pdf and .docx files."""
import os
from typing import List, Optional, Union

# The PDF/DOCX parsers are third-party packages. Guard the imports so that
# plain-text loading keeps working when they are not installed; the loaders
# that need them raise a clear ImportError at call time instead.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    import docx
except ImportError:
    docx = None

# Immutable default used by load_documents_from_directory (a tuple, so it
# cannot be mutated across calls the way a list default could).
_DEFAULT_EXTENSIONS = ('.txt', '.pdf', '.docx')


def load_document(file_path: str) -> str:
    """
    Load text from a supported document type (.txt, .pdf or .docx).

    The loader is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path (str): Path to the document file

    Returns:
        str: Extracted text from the document

    Raises:
        ValueError: If file type is not supported
    """
    # Get file extension
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()

    # Load based on file type
    if ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    elif ext == '.pdf':
        return load_pdf(file_path)
    elif ext == '.docx':
        return load_docx(file_path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")


def load_pdf(file_path: str) -> str:
    """
    Extract text from PDF file.

    Page texts are concatenated in page order with no separator.

    Args:
        file_path (str): Path to PDF file

    Returns:
        str: Extracted text

    Raises:
        ImportError: If PyPDF2 is not installed
    """
    if PyPDF2 is None:
        raise ImportError("PyPDF2 is required to load .pdf files")

    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page whose extraction yields None, which
        # would otherwise break the concatenation with a TypeError.
        return "".join(page.extract_text() or "" for page in reader.pages)


def load_docx(file_path: str) -> str:
    """
    Extract text from DOCX file.

    The text of each paragraph is joined with newlines.

    Args:
        file_path (str): Path to DOCX file

    Returns:
        str: Extracted text

    Raises:
        ImportError: If python-docx is not installed
    """
    if docx is None:
        raise ImportError("python-docx is required to load .docx files")

    doc = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in doc.paragraphs)


def load_documents_from_directory(
    directory: str,
    extensions: Optional[List[str]] = None
) -> List[str]:
    """
    Load all documents from a directory (non-recursive).

    Files are processed in sorted filename order so the result is
    deterministic. A file that fails to load is reported on stdout and
    skipped; it does not abort the whole run.

    Args:
        directory (str): Path to the directory
        extensions (List[str]): List of file extensions to load, matched
            case-insensitively against the end of each filename.
            Defaults to ['.txt', '.pdf', '.docx'] when omitted or None.

    Returns:
        List[str]: List of document texts
    """
    if extensions is None:
        extensions = _DEFAULT_EXTENSIONS
    # Normalise once; str.endswith accepts a tuple of candidate suffixes.
    suffixes = tuple(ext.lower() for ext in extensions)

    documents = []
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not (filename.lower().endswith(suffixes) and os.path.isfile(file_path)):
            continue
        try:
            documents.append(load_document(file_path))
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not prevent
            # the remaining documents from being loaded.
            print(f"Error loading {filename}: {e}")

    return documents