File size: 2,277 Bytes
640b1c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# src/utils/document_loader.py
import os
from typing import List, Union
import PyPDF2
import docx

def load_document(file_path: str) -> str:
    """
    Read a document and return its text, picking a loader by file extension

    Supported extensions are .txt, .pdf and .docx; the extension is
    compared case-insensitively.

    Args:
        file_path (str): Path to the document file

    Returns:
        str: Extracted text from the document

    Raises:
        ValueError: If file type is not supported
    """
    # Lower-case the extension so ".TXT" is handled like ".txt"
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == '.pdf':
        return load_pdf(file_path)

    if suffix == '.docx':
        return load_docx(file_path)

    if suffix != '.txt':
        raise ValueError(f"Unsupported file type: {suffix}")

    # Plain text needs no parser: read the whole file, decoded as UTF-8
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def load_pdf(file_path: str) -> str:
    """
    Extract text from PDF file

    The text of every page is concatenated in page order, with no
    separator inserted between pages.

    Args:
        file_path (str): Path to PDF file

    Returns:
        str: Extracted text
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result with a single join instead of growing a string
        # with += once per page. The join stays inside the `with` block so
        # the file is still open while the pages are read.
        # `or ""` treats a falsy extraction result as empty text so one
        # text-less page cannot abort the whole document.
        # NOTE(review): confirm whether the installed PyPDF2 version can
        # actually return None from extract_text().
        return "".join(page.extract_text() or "" for page in reader.pages)

def load_docx(file_path: str) -> str:
    """
    Read a DOCX file and return the text of its paragraphs

    Args:
        file_path (str): Path to DOCX file

    Returns:
        str: Paragraph texts joined with newline characters
    """
    document = docx.Document(file_path)

    # Collect each paragraph's text in document order
    lines = []
    for para in document.paragraphs:
        lines.append(para.text)

    return '\n'.join(lines)

def load_documents_from_directory(
    directory: str,
    extensions: Union[List[str], None] = None
) -> List[str]:
    """
    Load all documents from a directory

    Only regular files directly inside ``directory`` are considered;
    subdirectories are not descended into. Files are processed in sorted
    filename order so the result order is reproducible. A file that fails
    to load is reported with ``print`` and skipped, so one bad file does
    not abort the whole batch.

    Args:
        directory (str): Path to the directory
        extensions (List[str]): List of file extensions to load, matched
            case-insensitively against the end of each filename.
            Defaults to ['.txt', '.pdf', '.docx'] when omitted or None.

    Returns:
        List[str]: List of document texts
    """
    # A None sentinel avoids sharing one mutable default list across calls
    if extensions is None:
        extensions = ['.txt', '.pdf', '.docx']

    # Filenames are lower-cased before matching, so the wanted extensions
    # must be lower-cased too or an entry such as '.TXT' could never match.
    # str.endswith accepts a tuple, giving one call per filename.
    suffixes = tuple(ext.lower() for ext in extensions)

    documents = []
    # os.listdir returns entries in arbitrary order; sort for determinism
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        if not filename.lower().endswith(suffixes):
            continue
        try:
            documents.append(load_document(file_path))
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going
            print(f"Error loading {filename}: {e}")

    return documents