import asyncio
import logging
import os
import re
from dataclasses import dataclass
from multiprocessing import Barrier, Process, Queue
from typing import Dict, List, Optional, Tuple, cast

from anthropic import Anthropic, AsyncAnthropic
from asgiref.sync import sync_to_async
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from llama_index import Document as Llama_Index_Document

from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
from _utils.chains.Chain_class import Chain
from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agpt_answer
from _utils.gerar_relatorio_modelo_usuario.prompts import (
    contextual_prompt,
    create_prompt_auxiliar_do_contextual_prompt,
    prompt_auxiliar_do_contextual_prompt,
)
from _utils.handle_files import return_document_list_with_llama_parser
from _utils.LLMs.LLM_class import LLM
from _utils.models.gerar_relatorio import (
    ContextualizedChunk,
    DocumentChunk,
    RetrievalConfig,
)
from _utils.prompts.Prompt_class import Prompt, prompt as prompt_obj
from _utils.splitters.Splitter_class import Splitter
from setup.easy_imports import ChatPromptTemplate, PyPDFLoader

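# Global list used only as a crude progress counter for the async contextualization tasks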
lista_contador = []


class ContextualRetriever:
    def __init__(
        self, config: RetrievalConfig, claude_api_key: str, claude_context_model: str
    ):
        self.config = config
        # self.claude_client = Anthropic(api_key=claude_api_key)
        self.claude_client = AsyncAnthropic(api_key=claude_api_key)
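        # NOTE: only the commented-out Claude calls below use this client; the
        # active code paths call the Gemini and GPT helpers instead.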
        self.logger = logging.getLogger(__name__)
        self.bm25 = None
        self.claude_context_model = claude_context_model

    async def contextualize_all_chunks(
        self, full_text_as_array: List[Document], chunks: List[DocumentChunk]
    ) -> List[ContextualizedChunk]:
        """Add context to all chunks"""
        full_text = "".join(page.page_content for page in full_text_as_array)

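        # Build an auxiliary summary of the whole document first; it is passed to
        # every per-batch contextualization call below.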
        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)

        print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])

        # Claude call commented out because the request exceeded the token limit and raised an error
        # response_auxiliar_summary = await aclaude_answer(
        #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
        # )

        llms = LLM()
        response_auxiliar_summary = await llms.googleGemini().ainvoke(
            [HumanMessage(content=prompt_auxiliar_summary)]
        )

        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary.content)

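        # Split the chunks into batches of 20 so each LLM call contextualizes a
        # whole batch instead of a single chunk.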
        lista_de_listas_cada_com_20_chunks = [
            chunks[i : i + 20] for i in range(0, len(chunks), 20)
        ]
        print(
            "lista_de_listas_cada_com_20_chunks: ", lista_de_listas_cada_com_20_chunks
        )

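        # asyncio.TaskGroup (Python 3.11+) runs all batch tasks concurrently and
        # waits for all of them when the context manager exits.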
        async with asyncio.TaskGroup() as tg:
            tasks = [
                tg.create_task(
                    self.create_contextualized_chunk(
                        chunk, full_text_as_array, response_auxiliar_summary.content
                    )
                )
                # for chunk in chunks # ORIGINAL
                for chunk in lista_de_listas_cada_com_20_chunks
            ]

        # contextualized_chunks = [task.result() for task in tasks]
        contextualized_chunks = []
        for task in tasks:
            # Each task returns a list of ContextualizedChunk, so flatten the batches
            contextualized_chunks.extend(task.result())

        print("\n\ncontextualized_chunks", contextualized_chunks)
        return contextualized_chunks

    # ORIGINAL
    # async def create_contextualized_chunk(
    #     self, chunk, single_page_text, response_auxiliar_summary
    # ):
    #     lista_contador.append(0)
    #     print("contador: ", len(lista_contador))
    #     page_number = chunk.page_number - 1
    #     page_content = single_page_text[page_number].page_content

    #     context = await self.llm_generate_context(
    #         page_content, chunk, response_auxiliar_summary
    #     )
    #     print("context: ", context)
    #     return ContextualizedChunk(
    #         content=chunk.content,
    #         page_number=chunk.page_number,
    #         chunk_id=chunk.chunk_id,
    #         start_char=chunk.start_char,
    #         end_char=chunk.end_char,
    #         context=context,
    #     )

    async def create_contextualized_chunk(
        self, chunks: List[DocumentChunk], single_page_text, response_auxiliar_summary
    ):
        lista_contador.append(0)
        print("contador: ", len(lista_contador))

        # Concatenate the page text behind every chunk in this batch
        all_pages_contents = ""
        for chunk in chunks:
            page_number = chunk.page_number - 1
            all_pages_contents += single_page_text[page_number].page_content

        context = await self.llm_generate_context(
            all_pages_contents, chunks, response_auxiliar_summary
        )

        # Strip any "document_id" labels the LLM may have echoed back
        context = (
            context.replace("document_id: ", "")
            .replace("document_id:", "")
            .replace("DOCUMENT_ID: ", "")
            .replace("DOCUMENT_ID:", "")
        )

        # print("context: ", context)

        # This pattern works when the LLM response does not include a literal "document_id" label
        pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"
        # pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]"
        matches = re.findall(pattern, context, re.DOTALL)
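        # Expected response shape (an assumption inferred from the pattern above), e.g.:
        #   [3] --- Some document title --- One-sentence chunk context</chunk_context>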

        # Convert matches to the desired format
        result = [
            [int(doc_id), title.strip(), content.strip()]
            for doc_id, title, content in matches
        ]
        # print("\n\nresult", result)

        if not result:
            print("\n\ncontext", context)

        lista_chunks = []
        for index, chunk in enumerate(chunks):
            lista_chunks.append(
                ContextualizedChunk(
                    content=chunk.content,
                    page_number=chunk.page_number,
                    chunk_id=result[index][0],
                    start_char=chunk.start_char,
                    end_char=chunk.end_char,
                    # Join the parsed document title and context text
                    context=" ".join(result[index][1:3]),
                )
            )

        return lista_chunks

    # ORIGINAL
    # async def llm_generate_context(
    #     self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
    # ) -> str:
    #     """Generate contextual description using ChatOpenAI"""
    #     try:
    #         print("COMEÇOU A REQUISIÇÃO")
    #         prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
    #         # response = await aclaude_answer(
    #         #     self.claude_client, self.claude_context_model, prompt
    #         # )

    #         # response = await agpt_answer(prompt)
    #         llms = LLM()
    #         response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
    #         return cast(str, response.content)
    #     except Exception as e:
    #         self.logger.error(
    #             f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
    #         )
    #         return ""

    async def llm_generate_context(
        self, page_text: str, chunks: List[DocumentChunk], resumo_auxiliar
    ) -> str:
        """Generate contextual descriptions for a batch of chunks"""
        all_chunks_contents = ""
        for contador, chunk in enumerate(chunks, start=1):
            # Label each chunk before its content so the LLM can tell the chunks apart
            all_chunks_contents += f"\n\n CHUNK {contador}:\n"
            all_chunks_contents += chunk.content

        try:
            print("COMEÇOU A REQUISIÇÃO")
            prompt = contextual_prompt(page_text, resumo_auxiliar, all_chunks_contents)
            # response = await aclaude_answer(
            #     self.claude_client, self.claude_context_model, prompt
            # )

            response = await agpt_answer(prompt)
            # llms = LLM()
            # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
            # return cast(str, response.content)
            return cast(str, response)
        except Exception as e:
            self.logger.error(f"Context generation failed for a batch of chunks: {str(e)}")
            return ""

    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
    #     prompt = Prompt().create_prompt_template(
    #         "", prompt_auxiliar_do_contextual_prompt
    #     )
    #     Chain(prompt, ChatOpenAI())
    #     return


# First function of this file to be called
async def contextualize_chunk_based_on_serializer(
    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
):
    if serializer["should_have_contextual_chunks"]:
        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
            pages, all_PDFs_chunks
        )
        chunks_passados = contextualized_chunks
        is_contextualized_chunk = True
    else:
        chunks_passados = all_PDFs_chunks
        is_contextualized_chunk = False

    return chunks_passados, is_contextualized_chunk


async def get_full_text_and_all_PDFs_chunks(
    listaPDFs: List[str],
    splitterObject: Splitter,
    should_use_llama_parse: bool,
    isBubble: bool,
):
    all_PDFs_chunks = []

    pages: List[Document] = []

    # Load and process document
    for pdf_path in listaPDFs:
        if isBubble:
            pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
        else:
            if should_use_llama_parse:
                pages = pages + await return_document_list_with_llama_parser(pdf_path)
            else:
                pages = pages + PyPDFLoader(pdf_path).load()

        chunks = splitterObject.load_and_split_document(
            pdf_path, pages, should_use_llama_parse
        )
        all_PDFs_chunks = all_PDFs_chunks + chunks
    # Get full text for contextualization
    # loader = PyPDFLoader(pdf_path)

    # full_text = ""
    # full_text = " ".join([page.page_content for page in pages])

    return all_PDFs_chunks, pages  # , full_text


# The commented-out code below would read the pages surrounding the chunk's current page
# page_content = ""
# for i in range(
#     max(0, chunk.page_number - 1),
#     min(len(single_page_text), chunk.page_number + 2),
# ):
#     page_content += single_page_text[i].page_content if single_page_text[i] else ""
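

# A minimal usage sketch of how these pieces compose (the RetrievalConfig and
# Splitter constructor calls, the API key, and the model name are illustrative
# placeholders, not values taken from this module's real callers):
#
# async def exemplo():
#     retriever = ContextualRetriever(
#         config=RetrievalConfig(),                        # hypothetical default config
#         claude_api_key="sk-...",                         # placeholder key
#         claude_context_model="claude-3-haiku-20240307",  # illustrative model name
#     )
#     all_PDFs_chunks, pages = await get_full_text_and_all_PDFs_chunks(
#         ["/tmp/exemplo.pdf"], Splitter(), should_use_llama_parse=False, isBubble=False
#     )
#     chunks_passados, is_contextualized = await contextualize_chunk_based_on_serializer(
#         {"should_have_contextual_chunks": True}, retriever, pages, all_PDFs_chunks
#     )
#
# asyncio.run(exemplo())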