Spaces:
Running
Running
File size: 5,705 Bytes
cb23311 78209bc c5586ab 12d3e1a 78209bc 12d3e1a 78209bc 12d3e1a cb23311 78209bc 12d3e1a 78209bc 12d3e1a 78209bc 12d3e1a 78209bc 12d3e1a 78209bc 12d3e1a 78209bc 12d3e1a b374298 12d3e1a b374298 12d3e1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
from _utils.handle_files import return_document_list_with_llama_parser
from _utils.splitters.splitter_util import combine_documents_without_losing_pagination
from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
from typing import Any, List, Dict, Tuple, Optional, cast
from _utils.models.gerar_relatorio import (
DocumentChunk,
)
import uuid
class Splitter:
    """Split PDF documents (or raw text) into ``DocumentChunk`` objects.

    Wraps a ``RecursiveCharacterTextSplitter`` and records, for every chunk
    produced, the page number and character span so each chunk can be traced
    back to its position in the original document.
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
    ):
        # Text splitter that breaks long text into overlapping chunks.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        # chunk_id -> {"page", "start_char", "end_char"}; kept for tracing.
        self.chunk_metadata: Dict[str, Any] = {}

    async def _load_pages(
        self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
    ):
        """Load the PDF as a list of per-page Documents from the right source.

        Source selection mirrors the original branch order: Bubble storage
        first, then llama-parse, then plain PyPDF.
        """
        if isBubble:
            return await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
        if should_use_llama_parse:
            return await return_document_list_with_llama_parser(pdf_path)
        return PyPDFLoader(pdf_path).load()

    async def load_and_split_document(
        self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
    ):
        """Load a PDF and split it into chunks with tracing metadata.

        Args:
            pdf_path: Path (or Bubble reference) to the PDF.
            should_use_llama_parse: Use the llama-parse pipeline when True.
            isBubble: Fetch the file from Bubble storage when True.

        Returns:
            Tuple of (list of DocumentChunk, list of raw chunk strings).
        """
        # The load -> combine -> split sequence was previously duplicated in
        # all three source branches; only the page loading actually differs.
        pages = await self._load_pages(pdf_path, should_use_llama_parse, isBubble)
        page_boundaries, combined_text = (
            combine_documents_without_losing_pagination(pages)
        )
        initial_chunks: List[str] = self.text_splitter.split_text(combined_text)

        # Page-number offset is loop-invariant, so compute it once.
        # NOTE(review): presumably llama-parse page numbers are already
        # 1-based while PyPDF's are 0-based — confirm against the loaders.
        somar_pages = 0 if should_use_llama_parse else 1

        chunks: List[DocumentChunk] = []
        text_char = 0  # running cursor over combined_text
        for chunk in initial_chunks:
            chunk_id = str(uuid.uuid4())
            start_char = text_char + 1
            end_char = start_char + len(chunk)
            text_char = end_char
            # NOTE(review): this cursor treats chunks as consecutive and
            # ignores chunk_overlap, so positions drift when overlap > 0 —
            # kept as-is to preserve existing behaviour; confirm intent.

            # Map the chunk's start position to a page. When no boundary
            # matches, fall back to the last boundary seen (this preserves
            # the original fall-through of the shadowed loop variable).
            page_number = 0
            for start, end, boundary_page in page_boundaries:
                page_number = boundary_page
                if start <= start_char < end:
                    break

            # Build the chunk object with its id and position info.
            doc_chunk = DocumentChunk(
                content=chunk,
                contextual_summary="",
                page_number=page_number + somar_pages,  # 1-based page numbering
                chunk_id=chunk_id,
                # The old char_count accumulator was always 0 (its increment
                # was commented out), so the values below are unchanged.
                start_char=start_char,
                end_char=end_char,
            )
            chunks.append(doc_chunk)

            # Store metadata for later retrieval by chunk id.
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }
        return chunks, initial_chunks

    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
        """Split a plain-text string into chunks with metadata.

        Created only for the ragas evaluation flow: the whole text is treated
        as a single page (page 1).
        """
        page = Document(page_content=text, metadata={"page": 1})
        chunks: List[DocumentChunk] = []
        source_text = page.page_content
        # Break the full "page" into smaller chunks.
        page_chunks = self.text_splitter.split_text(source_text)
        print("\n\n\npage_chunks: ", page_chunks)
        for chunk in page_chunks:
            chunk_id = str(uuid.uuid4())
            # Locate the chunk inside the full text. NOTE(review): find()
            # returns the FIRST occurrence (duplicated chunks map to the same
            # span) and -1 if the splitter altered the chunk — confirm this
            # is acceptable for the ragas use case.
            start_char = source_text.find(chunk)
            end_char = start_char + len(chunk)
            # The old per-iteration char_count accumulator is gone: with a
            # single page it must stay 0, otherwise every chunk after the
            # first was shifted by len(text).
            doc_chunk = DocumentChunk(
                content=chunk,
                # NOTE(review): yields page 2 for a page-1 document; kept
                # as-is to preserve behaviour — confirm intended numbering.
                page_number=cast(int, page.metadata.get("page")) + 1,
                chunk_id=chunk_id,
                start_char=start_char,
                end_char=end_char,
            )
            chunks.append(doc_chunk)
            # Store metadata for later retrieval by chunk id.
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }
        return chunks
|