Spaces:
Sleeping
Sleeping
File size: 2,869 Bytes
6e09bf4 c5586ab 5cb00b6 c5586ab 6e09bf4 5cb00b6 6e09bf4 5cb00b6 451f8a3 5cb00b6 6e09bf4 c5586ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import os
from typing import List, Tuple
from langchain_core.documents import Document
from odf.opendocument import load
from odf.text import P
from typing import List
from setup.easy_imports import (
PyPDFLoader,
RecursiveCharacterTextSplitter,
)
class SplitterUtils:
    """Helpers for detecting document file types and extracting text from ODT files."""

    # Maps normalized (lowercase) extensions to the short type labels used by callers.
    _EXTENSION_TO_TYPE = {
        ".pdf": "pdf",
        ".docx": "word",
        ".odt": "odt",
        ".txt": "txt",
    }

    def get_file_type(self, file_path: str) -> str:
        """Return a short type label ("pdf", "word", "odt" or "txt") for *file_path*.

        Unrecognized extensions are echoed to stdout and reported as "unknown".
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()  # normalize so ".PDF" and ".pdf" match alike
        file_type = self._EXTENSION_TO_TYPE.get(ext)
        if file_type is None:
            print("\next", ext)  # kept for parity with original debug output
            return "unknown"
        return file_type

    def load_odt_file(self, file_path: str) -> str:
        """Extract the plain text of an ODT document, paragraphs joined by newlines."""
        textdoc = load(file_path)
        all_paragraphs = textdoc.getElementsByType(P)
        text: List[str] = []
        for paragraph in all_paragraphs:
            # Paragraph elements can interleave text nodes with span/markup
            # children; collect only the raw text nodes.
            for node in paragraph.childNodes:
                if node.nodeType == node.TEXT_NODE:
                    text.append(node.data)
        return "\n".join(text)
class Splitter_Simple:
    """Thin wrapper around RecursiveCharacterTextSplitter for PDFs and raw text."""

    def __init__(
        self,
        chunk_size=1000,
        chunk_overlap=400,
    ):
        # Default overlap is 40% of the chunk size so adjacent chunks share context.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    async def load_and_split_document(self, pdf_path: str):
        """Load the PDF at *pdf_path* and split it into Document chunks.

        NOTE(review): nothing in this coroutine is awaited — the blocking PDF
        load runs synchronously; kept ``async`` so existing ``await`` callers
        keep working.
        """
        print("\nCOMEÇANDO LEITURA DO PDF")
        pages = PyPDFLoader(pdf_path).load_and_split(self.text_splitter)
        print("\nTERMINADO LEITURA DO PDF")
        return pages

    def load_and_split_text(self, text: str) -> List[Document]:
        """Split a raw string into Document chunks (no metadata attached)."""
        return [
            Document(page_content=chunk)
            for chunk in self.text_splitter.split_text(text)
        ]

    def get_chunks_of_string_only_from_list_of_documents(
        self, lista_de_documentos: List[Document]
    ) -> List[str]:
        """Concatenate the documents' text and re-split it into plain-string chunks."""
        # "".join avoids the quadratic cost of building the string with += per page.
        full_text_as_string = "".join(
            page.page_content for page in lista_de_documentos
        )
        return self.text_splitter.split_text(full_text_as_string)
def combine_documents_without_losing_pagination(documents: list[Document]):
    """Merge documents into one string while recording each page's character span.

    Returns a tuple ``(page_boundaries, combined_text)`` where each boundary is
    ``(start_idx, end_idx, page_number)`` indexing into *combined_text*. The page
    number is taken from the document's ``page`` metadata, falling back to the
    document's 1-based position in *documents*.

    NOTE(review): the signature takes no ``self`` — if this is defined inside a
    class it should be decorated with ``@staticmethod``; verify at the call site.
    """
    parts: List[str] = []
    page_boundaries: List[Tuple[int, int, int]] = []  # (start_idx, end_idx, page_number)
    current_position = 0
    for document in documents:
        content = document.page_content
        start = current_position
        end = start + len(content)
        # Fall back to positional numbering when the loader supplied no page metadata.
        page_number = document.metadata.get("page", len(page_boundaries) + 1)
        page_boundaries.append((start, end, page_number))
        parts.append(content)
        current_position = end
    # Single join instead of += per document avoids quadratic string copying.
    return page_boundaries, "".join(parts)
|