Spaces:
Running
Running
# src/utils/document_loader.py
import os | |
from typing import List, Union | |
import PyPDF2 | |
import docx | |
def load_document(file_path: str) -> str:
    """
    Read a document from disk and return its text content.

    The loader is picked from the file extension, compared
    case-insensitively: ``.txt`` files are read directly as UTF-8, while
    ``.pdf`` and ``.docx`` files are delegated to ``load_pdf`` and
    ``load_docx`` respectively.

    Args:
        file_path (str): Path to the document file

    Returns:
        str: Extracted text from the document

    Raises:
        ValueError: If file type is not supported
    """
    # Only the extension part of the path decides which loader runs.
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == '.pdf':
        return load_pdf(file_path)
    if suffix == '.docx':
        return load_docx(file_path)
    if suffix != '.txt':
        raise ValueError(f"Unsupported file type: {suffix}")

    # Plain text: return the whole file decoded as UTF-8.
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    return content
def load_pdf(file_path: str) -> str:
    """
    Extract text from PDF file

    Pages are visited in the order the reader exposes them and their
    text is concatenated with no separator between pages.

    Args:
        file_path (str): Path to PDF file

    Returns:
        str: Extracted text
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract inside the ``with`` block so the reader's underlying
        # file is still open. Joining once avoids rebuilding the string
        # on every page, and ``or ""`` keeps a page that yields no text
        # from raising TypeError and aborting the whole document.
        return "".join(page.extract_text() or "" for page in reader.pages)
def load_docx(file_path: str) -> str:
    """
    Extract text from DOCX file

    The text of every paragraph in the document is collected in order
    and joined with newline characters.

    Args:
        file_path (str): Path to DOCX file

    Returns:
        str: Extracted text
    """
    document = docx.Document(file_path)
    paragraph_texts = []
    for para in document.paragraphs:
        paragraph_texts.append(para.text)
    return '\n'.join(paragraph_texts)
def load_documents_from_directory(
    directory: str,
    extensions: Union[List[str], None] = None
) -> List[str]:
    """
    Load all documents from a directory

    Only regular files directly inside ``directory`` are considered
    (subdirectories are not descended into). A file is loaded when its
    name ends with one of ``extensions``, compared case-insensitively.
    Files are processed in sorted filename order so the result order is
    deterministic. A file that fails to load is reported with ``print``
    and skipped; it does not abort the rest of the batch.

    Args:
        directory (str): Path to the directory
        extensions (List[str]): List of file extensions to load.
            Defaults to ``['.txt', '.pdf', '.docx']`` when omitted or
            ``None``.

    Returns:
        List[str]: List of document texts
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if extensions is None:
        extensions = ['.txt', '.pdf', '.docx']
    # The filename is lower-cased below, so the suffixes must be too;
    # otherwise an extension such as '.PDF' could never match.
    suffixes = tuple(ext.lower() for ext in extensions)

    documents = []
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not (os.path.isfile(file_path) and filename.lower().endswith(suffixes)):
            continue
        try:
            documents.append(load_document(file_path))
        except Exception as e:
            # Best-effort batch load: report the failure and keep going.
            print(f"Error loading {filename}: {e}")
    return documents