File size: 2,978 Bytes
39fc36b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pytest
import os
from _utils.splitters.Splitter_class import Splitter
from _utils.models.gerar_relatorio import (
    DocumentChunk,
)

# Chunking parameters shared by the tests in this module.
chunk_size = 1000
chunk_overlap = 200

# Directory that contains this test module.
_module_path = os.path.abspath(__file__)
base_dir = os.path.dirname(_module_path)

# The PDF fixture is located relative to the current working directory (not
# relative to base_dir), so where pytest is launched from determines the path.
cwd = os.getcwd()
pdf_file = os.path.join(cwd, "tests", "fixtures", "_pdf-uma-pagina.pdf")


class TestSplitters:
    """Tests for ``Splitter.load_and_split_document`` using the module-level ``pdf_file``.

    Each test awaits the call, unpacks the two values it yields and checks
    their container and element types; two of the tests additionally check
    the length of every chunk's ``content``.
    """

    # Created once, when the class body executes, from the module-level
    # chunk_size / chunk_overlap; shared by the tests that use the default size.
    splitter = Splitter(chunk_size, chunk_overlap)

    @staticmethod
    def _assert_basic_shape(chunks, strings):
        """Assert both results are non-empty lists with the expected element types."""
        assert isinstance(chunks, list)
        assert isinstance(strings, list)
        assert len(chunks) > 0
        assert len(strings) > 0
        for text in strings:
            assert isinstance(text, str)
        for piece in chunks:
            assert isinstance(piece, DocumentChunk)

    @staticmethod
    def _assert_content_lengths_near(chunks, target, tolerance):
        """Assert every chunk's content length lies strictly within target +/- tolerance."""
        lower_bound = target - tolerance
        upper_bound = target + tolerance
        for piece in chunks:
            assert lower_bound < len(piece.content) < upper_bound

    @pytest.mark.asyncio
    async def test_load_and_split_document_No_llama_parse_No_Bubble(self, monkeypatch):
        """Default-size splitter, with both the llama-parse and Bubble flags off."""
        use_llama_parse = False
        is_bubble = False

        outcome = await self.splitter.load_and_split_document(
            pdf_file, use_llama_parse, is_bubble
        )
        result_chunks, result_strings = outcome

        self._assert_basic_shape(result_chunks, result_strings)
        self._assert_content_lengths_near(result_chunks, chunk_size, 100)

    @pytest.mark.asyncio
    async def test_load_and_split_document_No_llama_parse_No_Bubble_with_bigger_chunk(
        self, monkeypatch
    ):
        """Same flags as the default case, but with a dedicated 3500-sized splitter."""
        use_llama_parse = False
        is_bubble = False
        bigger_chunk_size = 3500
        # A separate instance is needed because the class-level splitter was
        # built with the module-level chunk_size.
        splitter_temp = Splitter(bigger_chunk_size, chunk_overlap)

        outcome = await splitter_temp.load_and_split_document(
            pdf_file, use_llama_parse, is_bubble
        )
        result_chunks, result_strings = outcome

        self._assert_basic_shape(result_chunks, result_strings)
        self._assert_content_lengths_near(result_chunks, bigger_chunk_size, 200)

    @pytest.mark.asyncio
    async def test_load_and_split_document_With_llama_parse_No_Bubble(
        self, monkeypatch
    ):
        """Default-size splitter with the llama-parse flag on and the Bubble flag off."""
        use_llama_parse = True
        is_bubble = False

        outcome = await self.splitter.load_and_split_document(
            pdf_file, use_llama_parse, is_bubble
        )
        result_chunks, result_strings = outcome

        self._assert_basic_shape(result_chunks, result_strings)
        # The length check below does not pass yet --> it will be fixed in the future
        # self._assert_content_lengths_near(result_chunks, chunk_size, 100)