Spaces:
Running
Running
File size: 2,277 Bytes
640b1c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# src/utils/document_loader.py
import os
from typing import List, Union
import PyPDF2
import docx
def load_document(file_path: str) -> str:
    """
    Read a document and return its text content.

    The loader is picked from the file extension, which is compared
    case-insensitively.

    Args:
        file_path (str): Path to the document file
    Returns:
        str: Extracted text from the document
    Raises:
        ValueError: If file type is not supported
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Binary formats are delegated to their dedicated loaders.
    if suffix == '.pdf':
        return load_pdf(file_path)
    if suffix == '.docx':
        return load_docx(file_path)

    # Anything that is not plain text at this point is rejected.
    if suffix != '.txt':
        raise ValueError(f"Unsupported file type: {suffix}")

    # Plain text is read in one go, decoded as UTF-8.
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
def load_pdf(file_path: str) -> str:
    """
    Extract text from PDF file

    The text of every page is concatenated in page order; no separator is
    inserted between pages.

    Args:
        file_path (str): Path to PDF file
    Returns:
        str: Extracted text
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with += for every page.
        # `or ""` keeps a page that yields no text (empty, or None if the
        # installed extractor returns that) from aborting the whole document.
        # The join runs inside the `with` so the file is still open while
        # pages are being read.
        return "".join(page.extract_text() or "" for page in reader.pages)
def load_docx(file_path: str) -> str:
    """
    Return the text of a DOCX file, one line per paragraph.

    Args:
        file_path (str): Path to DOCX file
    Returns:
        str: Paragraph texts joined with newline characters
    """
    document = docx.Document(file_path)

    # Gather the text of each paragraph in document order.
    paragraph_texts = []
    for para in document.paragraphs:
        paragraph_texts.append(para.text)

    return '\n'.join(paragraph_texts)
def load_documents_from_directory(
    directory: str,
    extensions: Union[List[str], None] = None
) -> List[str]:
    """
    Load all documents from a directory

    Only regular files directly inside ``directory`` are considered;
    subdirectories are not searched. Files are visited in sorted name order
    so the result is deterministic. A file that fails to load is reported on
    standard output and skipped.

    Args:
        directory (str): Path to the directory
        extensions (List[str]): File extensions to load, matched
            case-insensitively against the end of each file name.
            Defaults to ['.txt', '.pdf', '.docx'].
    Returns:
        List[str]: List of document texts
    """
    # Build the default per call instead of sharing one mutable list object
    # across calls through the signature.
    if extensions is None:
        extensions = ['.txt', '.pdf', '.docx']

    # str.endswith accepts a tuple; lower-case the suffixes once so that an
    # extension passed as '.TXT' still matches the lower-cased file name.
    suffixes = tuple(ext.lower() for ext in extensions)

    documents = []
    # os.listdir returns names in arbitrary order; sort for a stable result.
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(suffixes):
            continue
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            documents.append(load_document(file_path))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error loading {filename}: {e}")
    return documents