Spaces:
Running
Running
import pytest | |
import os | |
from _utils.splitters.Splitter_class import Splitter | |
from _utils.models.gerar_relatorio import ( | |
DocumentChunk, | |
) | |
# Default settings handed to ``Splitter(chunk_size, chunk_overlap)`` by the tests.
chunk_size = 1000
chunk_overlap = 200

# Directory that contains this test module.
# NOTE(review): not referenced by the visible tests -- confirm whether it is
# still needed.
base_dir = os.path.dirname(os.path.abspath(__file__))

# The PDF fixture is resolved against the current working directory, so the
# tests expect to be launched from a directory that contains "tests/fixtures".
cwd = os.getcwd()
_fixtures_dir = os.path.join(cwd, "tests", "fixtures")
pdf_file = os.path.join(_fixtures_dir, "_pdf-uma-pagina.pdf")
class TestSplitters:
    """Tests for ``Splitter.load_and_split_document`` using the ``pdf_file`` fixture.

    Each test requests the ``monkeypatch`` fixture without using it; the
    parameter is kept so the test signatures stay unchanged.

    NOTE(review): every test is a coroutine, so the suite presumably relies on
    an async pytest plugin (e.g. pytest-asyncio in auto mode) configured
    elsewhere -- confirm, otherwise these tests will not actually be awaited.
    """

    # Shared splitter built once with the module-level default settings.
    splitter = Splitter(chunk_size, chunk_overlap)

    @staticmethod
    def _assert_split_output(
        result_chunks, result_strings, expected_size=None, tolerance=0
    ):
        """Validate the pair returned by ``load_and_split_document``.

        Args:
            result_chunks: First element of the returned pair; must be a
                non-empty list of ``DocumentChunk``.
            result_strings: Second element of the returned pair; must be a
                non-empty list of ``str``.
            expected_size: When given, ``len(chunk.content)`` of every chunk
                must lie strictly between ``expected_size - tolerance`` and
                ``expected_size + tolerance``. ``None`` skips the size check.
            tolerance: Half-width of the accepted size window.
        """
        assert isinstance(result_chunks, list)
        assert isinstance(result_strings, list)
        assert len(result_chunks) > 0
        assert len(result_strings) > 0
        assert all(isinstance(text, str) for text in result_strings)
        assert all(isinstance(chunk, DocumentChunk) for chunk in result_chunks)

        if expected_size is None:
            return

        lower = expected_size - tolerance
        upper = expected_size + tolerance
        # Collect the offending lengths so a failure shows which chunks are off,
        # instead of a bare "assert all(...)" that only reports False.
        out_of_range = [
            len(chunk.content)
            for chunk in result_chunks
            if not lower < len(chunk.content) < upper
        ]
        assert not out_of_range, (
            f"chunk lengths outside ({lower}, {upper}): {out_of_range}"
        )

    async def test_load_and_split_document_No_llama_parse_No_Bubble(self, monkeypatch):
        """Shared splitter, llama-parse off, Bubble off: sizes within chunk_size +/- 100."""
        use_llama_parse = False
        is_bubble = False

        result_chunks, result_strings = await self.splitter.load_and_split_document(
            pdf_file, use_llama_parse, is_bubble
        )

        self._assert_split_output(
            result_chunks, result_strings, expected_size=chunk_size, tolerance=100
        )

    async def test_load_and_split_document_No_llama_parse_No_Bubble_with_bigger_chunk(
        self, monkeypatch
    ):
        """Dedicated splitter with a 3500 chunk size: sizes within 3500 +/- 200."""
        use_llama_parse = False
        is_bubble = False
        # Named distinctly so it does not shadow the module-level ``chunk_size``.
        bigger_chunk_size = 3500
        bigger_splitter = Splitter(bigger_chunk_size, chunk_overlap)

        result_chunks, result_strings = await bigger_splitter.load_and_split_document(
            pdf_file, use_llama_parse, is_bubble
        )

        self._assert_split_output(
            result_chunks,
            result_strings,
            expected_size=bigger_chunk_size,
            tolerance=200,
        )

    async def test_load_and_split_document_With_llama_parse_No_Bubble(
        self, monkeypatch
    ):
        """Shared splitter, llama-parse on, Bubble off: only types and non-emptiness."""
        use_llama_parse = True
        is_bubble = False

        result_chunks, result_strings = await self.splitter.load_and_split_document(
            pdf_file, use_llama_parse, is_bubble
        )

        # TODO: the size check does not pass yet for this path and will be fixed
        # in the future. Once it does, pass ``expected_size=chunk_size`` and
        # ``tolerance=100`` here, as in the first test.
        self._assert_split_output(result_chunks, result_strings)