import asyncio
import logging
import os
import re
from typing import Any, List, cast

from anthropic import Anthropic, AsyncAnthropic
from langchain_core.messages import HumanMessage

from _utils.gerar_relatorio_modelo_usuario.llm_calls import (
    agemini_answer,
    agpt_answer,
)
from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
from _utils.gerar_relatorio_modelo_usuario.utils import (
    validate_many_chunks_in_one_request,
)
from _utils.models.gerar_relatorio import (
    ContextualizedChunk,
    DocumentChunk,
    RetrievalConfig,
)
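
# Module-level list used as a crude counter of how many chunk batches have
# started processing (see contextualize_uma_lista_de_chunks below).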
lista_contador = []
class ContextualRetriever:
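    """Enriches document chunks with LLM-generated contextual summaries."""
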
def __init__(self, config: RetrievalConfig, claude_context_model: str):
self.config = config
self.logger = logging.getLogger(__name__)
self.bm25 = None
self.claude_context_model = claude_context_model
self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
# self.claude_client = Anthropic(api_key=claude_api_key)

    def getAllDocumentsIds(self, lista_com_20_chunks: List[DocumentChunk]):
        """Concatenate the chunks into one numbered string and extract each
        chunk's document id via the "Num. <id>" pattern (0 when absent)."""
        contador = 1
        all_chunks_contents = ""
        all_document_ids = []
        pattern = r"Num\. (\d+)"
        for chunk in lista_com_20_chunks:
            all_chunks_contents += f"\n\nCHUNK {contador}:\n"
            all_chunks_contents += chunk.content
            match = re.search(pattern, chunk.content)
            number = int(match.group(1)) if match else 0
            all_document_ids.append(number)
            contador += 1
        return all_chunks_contents, all_document_ids

def get_info_from_validated_chunks(self, matches):
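        """Normalize validated (doc_id, title, content) match tuples into
        [int, str, str] rows."""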
result = [
[int(doc_id), title.strip(), content.strip()]
for doc_id, title, content in matches
]
return result
    async def llm_call_uma_lista_de_chunks(
        self, lista_com_20_chunks: List[DocumentChunk], resumo_auxiliar
    ) -> List[List[Any]]:
        """Generate contextual descriptions for a batch of chunks with Gemini,
        retrying up to 4 times when the response fails format validation."""
        all_chunks_contents, all_document_ids = self.getAllDocumentsIds(
            lista_com_20_chunks
        )
        try:
            print("\n\nSTARTED THE REQUEST")
            prompt = contextual_prompt(
                resumo_auxiliar, all_chunks_contents, len(lista_com_20_chunks)
            )
            response = ""
            for attempt in range(4):
                if attempt != 0:
                    print(
                        "------------- CONTEXTUAL FORMATTING WAS INVALID - RETRYING -------------"
                    )
                    print(f"CHUNK FORMATTING ATTEMPT NUMBER {attempt + 1}")
                print("STARTING A CONTEXTUAL REQUEST")
                # raw_response = await agpt_answer(prompt)
                # raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite-preview-02-05")
                raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite")
                print("FINISHED A CONTEXTUAL REQUEST")
                response = cast(str, raw_response)
                # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
                # return cast(str, response.content)
                matches = validate_many_chunks_in_one_request(
                    response, all_document_ids
                )
                if matches:
                    return self.get_info_from_validated_chunks(matches)
                # Invalid formatting: loop around and try again.
            raise ValueError(f"CHUNK FORMATTING WAS INVALID: {response}")
        except Exception as e:
            self.logger.error(f"Context generation failed for chunks: {str(e)}")
            return [[""]]

    async def contextualize_uma_lista_de_chunks(
        self, lista_com_20_chunks: List[DocumentChunk], response_auxiliar_summary
    ):
        """Contextualize one batch of chunks and wrap each result in a
        ContextualizedChunk."""
        lista_contador.append(0)
        print("counter: ", len(lista_contador))
        result = await self.llm_call_uma_lista_de_chunks(
            lista_com_20_chunks, response_auxiliar_summary
        )
        lista_chunks: List[ContextualizedChunk] = []
        try:
            for index, chunk in enumerate(lista_com_20_chunks):
                lista_chunks.append(
                    ContextualizedChunk(
                        contextual_summary=result[index][2],
                        content=chunk.content,
                        page_number=chunk.page_number,
                        id_do_processo=int(result[index][0]),
                        chunk_id=chunk.chunk_id,
                        start_char=chunk.start_char,
                        end_char=chunk.end_char,
                        context=result[index][1],
                    )
                )
        except Exception as e:
            # result may be the [[""]] fallback from a failed LLM call, in
            # which case indexing into it raises and we return what we have.
            print(e)
            print("\nCONTEXTUAL ERROR")
            print("\n\nresult", result)
        return lista_chunks

async def contextualize_all_chunks(
self,
all_PDFs_chunks: List[DocumentChunk],
response_auxiliar_summary,
) -> List[ContextualizedChunk]:
"""Add context to all chunks"""
lista_de_listas_cada_com_20_chunks = [
all_PDFs_chunks[i : i + 20] for i in range(0, len(all_PDFs_chunks), 20)
]
async with asyncio.TaskGroup() as tg:
tasks = [
tg.create_task(
self.contextualize_uma_lista_de_chunks(
lista_com_20_chunks,
response_auxiliar_summary,
)
)
for lista_com_20_chunks in lista_de_listas_cada_com_20_chunks
]
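        # TaskGroup awaits every task before exiting the `async with` block,
        # so each result below is already available.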
        # contextualized_chunks = [task.result() for task in tasks]
        contextualized_chunks: List[ContextualizedChunk] = []
        for task in tasks:
            contextualized_chunks.extend(task.result())
        return contextualized_chunks
    # The commented-out code below would read the pages surrounding the chunk's current page
# page_content = ""
# for i in range(
# max(0, chunk.page_number - 1),
# min(len(single_page_text), chunk.page_number + 2),
# ):
# page_content += single_page_text[i].page_content if single_page_text[i] else ""
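
# --- Minimal usage sketch (illustrative; the names below are assumptions) ---
# RetrievalConfig is assumed to be default-constructible, the model name is a
# placeholder, and the DocumentChunk keyword arguments simply mirror the
# attributes read above; adjust all of them to the real signatures.
async def _exemplo_de_uso() -> None:
    config = RetrievalConfig()  # hypothetical construction; set your real fields
    retriever = ContextualRetriever(config, claude_context_model="claude-3-5-sonnet")
    chunks = [
        DocumentChunk(  # assumed constructor; fields mirror the attribute reads above
            content="Num. 12345 ...",
            page_number=1,
            chunk_id=1,
            start_char=0,
            end_char=120,
        )
    ]
    summary = "auxiliary summary of the case documents"
    contextualized = await retriever.contextualize_all_chunks(chunks, summary)
    print(f"{len(contextualized)} contextualized chunks")


# if __name__ == "__main__":
#     asyncio.run(_exemplo_de_uso())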