File size: 2,869 Bytes
6e09bf4
c5586ab
 
5cb00b6
 
 
 
 
 
 
c5586ab
 
6e09bf4
 
 
 
 
 
 
 
5cb00b6
 
 
 
6e09bf4
 
 
 
5cb00b6
 
 
451f8a3
 
 
 
 
 
5cb00b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e09bf4
c5586ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
from typing import List, Tuple
from langchain_core.documents import Document
from odf.opendocument import load
from odf.text import P
from typing import List
from setup.easy_imports import (
    PyPDFLoader,
    RecursiveCharacterTextSplitter,
)


class SplitterUtils:
    """Helpers for classifying files by extension and reading ODT text."""

    # Lowercase file extension -> label returned by get_file_type.
    _EXTENSION_LABELS = {
        ".pdf": "pdf",
        ".docx": "word",
        ".odt": "odt",
        ".txt": "txt",
    }

    def get_file_type(self, file_path) -> str:
        """Classify a file by its extension.

        Args:
            file_path: Path whose extension is inspected; the file itself
                is never opened.

        Returns:
            "pdf", "word", "odt" or "txt" for the known extensions
            (compared case-insensitively), otherwise "unknown".
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()  # Normalize to lowercase
        label = SplitterUtils._EXTENSION_LABELS.get(ext)
        if label is None:
            # Debug output kept so unexpected extensions stay visible.
            print("\next", ext)
            return "unknown"
        return label

    def load_odt_file(self, file_path: str) -> str:
        """Return the text of an ODT document, one line per paragraph.

        Text is gathered from the whole subtree of every paragraph, so
        content wrapped in nested inline elements is kept and the pieces of
        one paragraph are concatenated instead of being split across lines.
        Paragraphs that yield no text are omitted.

        Args:
            file_path: Path of the ODT file to load.

        Returns:
            The paragraph texts joined with newlines.
        """
        textdoc = load(file_path)
        all_paragraphs = textdoc.getElementsByType(P)
        # A paragraph nested inside another one is also present in
        # all_paragraphs; skipping it while walking its ancestor keeps its
        # text from being emitted twice.
        paragraph_ids = {id(paragraph) for paragraph in all_paragraphs}
        lines = []
        for paragraph in all_paragraphs:
            content = self._collect_text(paragraph, paragraph_ids)
            if content:
                lines.append(content)
        return "\n".join(lines)

    @staticmethod
    def _collect_text(node, skip_ids=frozenset()) -> str:
        """Concatenate, in document order, the text nodes below ``node``.

        Args:
            node: Element whose descendants are scanned.
            skip_ids: ``id()`` values of descendant elements whose subtrees
                must not be entered.

        Returns:
            The joined ``data`` of every reachable text node.
        """
        fragments = []
        # Explicit stack, children pushed reversed so they pop in order.
        pending = list(reversed(node.childNodes))
        while pending:
            current = pending.pop()
            if current.nodeType == current.TEXT_NODE:
                fragments.append(current.data)
            elif id(current) not in skip_ids:
                pending.extend(reversed(current.childNodes))
        return "".join(fragments)


class Splitter_Simple:
    """Wraps one RecursiveCharacterTextSplitter for PDFs and raw text."""

    def __init__(self, chunk_size=1000, chunk_overlap=400):
        """Build the shared splitter.

        Args:
            chunk_size: Forwarded to RecursiveCharacterTextSplitter.
            chunk_overlap: Forwarded to RecursiveCharacterTextSplitter.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    async def load_and_split_document(self, pdf_path: str):
        """Load the PDF at ``pdf_path`` and split it with the shared splitter.

        Declared ``async`` but contains no ``await``: the loading itself
        runs synchronously inside the coroutine.

        Returns:
            Whatever ``PyPDFLoader.load_and_split`` returns for this file.
        """
        print("\nCOMEÇANDO LEITURA DO PDF")
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split(self.text_splitter)
        print("\nTERMINADO LEITURA DO PDF")
        return pages

    def load_and_split_text(self, text: str) -> List[Document]:
        """Split ``text`` and wrap every chunk in its own Document."""
        return [
            Document(page_content=piece)
            for piece in self.text_splitter.split_text(text)
        ]

    def get_chunks_of_string_only_from_list_of_documents(
        self, lista_de_documentos: List[Document]
    ):
        """Concatenate the documents' contents and re-split them.

        The ``page_content`` values are joined with no separator, then the
        combined string is split by the shared splitter.

        Returns:
            The result of ``split_text`` on the combined string.
        """
        merged_content = "".join(
            documento.page_content for documento in lista_de_documentos
        )
        return self.text_splitter.split_text(merged_content)


def combine_documents_without_losing_pagination(documents: list[Document]):
    """Merge the documents' text while recording each one's character span.

    Args:
        documents: Documents whose ``page_content`` is concatenated in
            order, with no separator.

    Returns:
        A ``(page_boundaries, combined_text)`` tuple. ``page_boundaries``
        holds one ``(start_idx, end_idx, page_number)`` entry per document,
        where the indices delimit that document's text inside
        ``combined_text`` (end exclusive). ``page_number`` is the document's
        ``"page"`` metadata value, or its 1-based position in ``documents``
        when that key is absent.
    """
    page_boundaries: List[Tuple[int, int, int]] = []
    pieces = []
    cursor = 0
    for position, document in enumerate(documents, start=1):
        content = document.page_content
        pieces.append(content)
        next_cursor = cursor + len(content)
        # position equals len(page_boundaries) + 1 at this point.
        page_number = document.metadata.get("page", position)
        page_boundaries.append((cursor, next_cursor, page_number))
        cursor = next_cursor
    return page_boundaries, "".join(pieces)