import os
import re
import asyncio
import logging
from dataclasses import dataclass
from multiprocessing import Process, Barrier, Queue
from typing import List, Dict, Tuple, Optional, cast

from anthropic import Anthropic, AsyncAnthropic
from asgiref.sync import sync_to_async
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from llama_index import Document as Llama_Index_Document

from _utils.LLMs.LLM_class import LLM
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
from _utils.chains.Chain_class import Chain
from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agpt_answer
from _utils.gerar_relatorio_modelo_usuario.prompts import (
    contextual_prompt,
    create_prompt_auxiliar_do_contextual_prompt,
    prompt_auxiliar_do_contextual_prompt,
)
from _utils.handle_files import return_document_list_with_llama_parser
from _utils.models.gerar_relatorio import (
    ContextualizedChunk,
    DocumentChunk,
    RetrievalConfig,
)
from _utils.prompts.Prompt_class import Prompt, prompt as prompt_obj
from _utils.splitters.Splitter_class import Splitter
from setup.easy_imports import ChatPromptTemplate, PyPDFLoader

# Global counter used only to log how many contextualization batches have started
lista_contador = []

class ContextualRetriever:
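    """Generates LLM-written context for document chunks (contextual retrieval),
    batching chunks so each request stays within provider token limits."""
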
    def __init__(
        self, config: RetrievalConfig, claude_api_key: str, claude_context_model: str
    ):
        self.config = config
        # self.claude_client = Anthropic(api_key=claude_api_key)
        self.claude_client = AsyncAnthropic(api_key=claude_api_key)
        self.logger = logging.getLogger(__name__)
        self.bm25 = None
        self.claude_context_model = claude_context_model

    async def contextualize_all_chunks(
        self, full_text_as_array: List[Document], chunks: List[DocumentChunk]
    ) -> List[ContextualizedChunk]:
        """Add LLM-generated context to all chunks."""
        full_text = "".join(page.page_content for page in full_text_as_array)

        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
        print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])

        # Claude call commented out: the request exceeded the token limit and raised an error
        # response_auxiliar_summary = await aclaude_answer(
        #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
        # )

        llms = LLM()
        response_auxiliar_summary = await llms.googleGemini().ainvoke(
            [HumanMessage(content=prompt_auxiliar_summary)]
        )
        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary.content)
        lista_de_listas_cada_com_20_chunks = [
            chunks[i : i + 20] for i in range(0, len(chunks), 20)
        ]
        print(
            "lista_de_listas_cada_com_20_chunks: ", lista_de_listas_cada_com_20_chunks
        )

        async with asyncio.TaskGroup() as tg:
            tasks = [
                tg.create_task(
                    self.create_contextualized_chunk(
                        chunk_batch,
                        full_text_as_array,
                        response_auxiliar_summary.content,
                    )
                )
                # for chunk in chunks  # ORIGINAL
                for chunk_batch in lista_de_listas_cada_com_20_chunks
            ]

        # All tasks have completed once the TaskGroup block exits
        contextualized_chunks: List[ContextualizedChunk] = []
        for task in tasks:
            contextualized_chunks = contextualized_chunks + task.result()
        print("\n\ncontextualized_chunks", contextualized_chunks)

        return contextualized_chunks

    # ORIGINAL (single-chunk version, kept for reference)
    # async def create_contextualized_chunk(
    #     self, chunk, single_page_text, response_auxiliar_summary
    # ):
    #     lista_contador.append(0)
    #     print("contador: ", len(lista_contador))
    #     page_number = chunk.page_number - 1
    #     page_content = single_page_text[page_number].page_content
    #     context = await self.llm_generate_context(
    #         page_content, chunk, response_auxiliar_summary
    #     )
    #     print("context: ", context)
    #     return ContextualizedChunk(
    #         content=chunk.content,
    #         page_number=chunk.page_number,
    #         chunk_id=chunk.chunk_id,
    #         start_char=chunk.start_char,
    #         end_char=chunk.end_char,
    #         context=context,
    #     )

    async def create_contextualized_chunk(
        self, chunks: List[DocumentChunk], single_page_text, response_auxiliar_summary
    ):
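        """Contextualize a batch of chunks with a single LLM request and parse
        the response into one ContextualizedChunk per input chunk."""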
        lista_contador.append(0)
        print("contador: ", len(lista_contador))

        # Concatenate the page text for every chunk in the batch (a page may be
        # appended more than once when several chunks come from the same page)
        all_pages_contents = ""
        for chunk in chunks:
            page_number = chunk.page_number - 1
            all_pages_contents += single_page_text[page_number].page_content

        context = await self.llm_generate_context(
            all_pages_contents, chunks, response_auxiliar_summary
        )
        # Strip any "document_id" labels the LLM may have echoed back
        context = (
            context.replace("document_id: ", "")
            .replace("document_id:", "")
            .replace("DOCUMENT_ID: ", "")
            .replace("DOCUMENT_ID:", "")
        )
# print("context: ", context) | |
import re | |
pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>" # Funciona para quando a resposta do LLM não vem com "document_id" escrito | |
# pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]" | |
matches = re.findall(pattern, context, re.DOTALL) | |
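        # Hypothetical example of a response line the pattern matches:
        #   [12] --- Some Document Title --- Context for this chunk...</chunk_context>
        # which captures ("12", "Some Document Title", "Context for this chunk...").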
        # Convert matches to [document_id, document_title, content] triples
        result = [
            [int(doc_id), title.strip(), content.strip()]
            for doc_id, title, content in matches
        ]
        if not result:
            # Nothing matched; dump the raw response to help debug the pattern
            print("\n\ncontext", context)

        lista_chunks = []
        for index, chunk in enumerate(chunks):
            lista_chunks.append(
                ContextualizedChunk(
                    content=chunk.content,
                    page_number=chunk.page_number,
                    chunk_id=result[index][0],
                    start_char=chunk.start_char,
                    end_char=chunk.end_char,
                    context=" ".join(result[index][1:3]),
                )
            )
        return lista_chunks

    # ORIGINAL (single-chunk version, kept for reference)
    # async def llm_generate_context(
    #     self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
    # ) -> str:
    #     """Generate contextual description using ChatOpenAI"""
    #     try:
    #         print("REQUEST STARTED")
    #         prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
    #         # response = await aclaude_answer(
    #         #     self.claude_client, self.claude_context_model, prompt
    #         # )
    #         # response = await agpt_answer(prompt)
    #         llms = LLM()
    #         response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
    #         return cast(str, response.content)
    #     except Exception as e:
    #         self.logger.error(
    #             f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
    #         )
    #         return ""

    async def llm_generate_context(
        self, page_text: str, chunks: List[DocumentChunk], resumo_auxiliar
    ) -> str:
        """Generate contextual descriptions for a batch of chunks in one request."""
        # Label each chunk before its content so the model can refer to chunks by number
        all_chunks_contents = ""
        for contador, chunk in enumerate(chunks, start=1):
            all_chunks_contents += f"\n\n CHUNK {contador}:\n"
            all_chunks_contents += chunk.content
        try:
            print("REQUEST STARTED")
            prompt = contextual_prompt(page_text, resumo_auxiliar, all_chunks_contents)
            # response = await aclaude_answer(
            #     self.claude_client, self.claude_context_model, prompt
            # )
            response = await agpt_answer(prompt)
            # llms = LLM()
            # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
            # return cast(str, response.content)
            return cast(str, response)
        except Exception as e:
            self.logger.error(f"Context generation failed for chunk batch: {str(e)}")
            return ""

    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
    #     prompt = Prompt().create_prompt_template(
    #         "", prompt_auxiliar_do_contextual_prompt
    #     )
    #     Chain(prompt, ChatOpenAI())
    #     return


# First function called in this module
async def contextualize_chunk_based_on_serializer(
    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
):
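    """Contextualize the chunks only when the serializer requests it; otherwise
    pass the raw chunks through unchanged."""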
    if serializer["should_have_contextual_chunks"]:
        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
            pages, all_PDFs_chunks
        )
        chunks_passados = contextualized_chunks
        is_contextualized_chunk = True
    else:
        chunks_passados = all_PDFs_chunks
        is_contextualized_chunk = False
    return chunks_passados, is_contextualized_chunk


async def get_full_text_and_all_PDFs_chunks(
    listaPDFs: List[str],
    splitterObject: Splitter,
    should_use_llama_parse: bool,
    isBubble: bool,
):
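    """Load every PDF in listaPDFs (from Bubble, LlamaParse, or PyPDFLoader),
    split each one into chunks, and return the combined chunks and pages."""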
    all_PDFs_chunks = []
    pages: List[Document] = []

    # Load and process each document
    for pdf_path in listaPDFs:
        if isBubble:
            pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
        else:
            if should_use_llama_parse:
                pages = pages + await return_document_list_with_llama_parser(pdf_path)
            else:
                pages = pages + PyPDFLoader(pdf_path).load()
        chunks = splitterObject.load_and_split_document(
            pdf_path, pages, should_use_llama_parse
        )
        all_PDFs_chunks = all_PDFs_chunks + chunks

    # Get full text for contextualization
    # loader = PyPDFLoader(pdf_path)
    # full_text = ""
    # full_text = " ".join([page.page_content for page in pages])

    return all_PDFs_chunks, pages  # , full_text


# The commented code below reads the pages surrounding the chunk's current page
# page_content = ""
# for i in range(
#     max(0, chunk.page_number - 1),
#     min(len(single_page_text), chunk.page_number + 2),
# ):
#     page_content += single_page_text[i].page_content if single_page_text[i] else ""
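

# Minimal usage sketch (commented out): how these helpers are expected to be
# wired together. The serializer dict, API key, model name, and the assumption
# that RetrievalConfig and Splitter are default-constructible are placeholders,
# not values confirmed by this project.
#
# async def exemplo_de_uso():
#     config = RetrievalConfig()  # placeholder: actual fields unknown
#     retriever = ContextualRetriever(config, "CLAUDE_API_KEY", "claude-model-name")
#     all_PDFs_chunks, pages = await get_full_text_and_all_PDFs_chunks(
#         ["/caminho/para/arquivo.pdf"],
#         Splitter(),  # placeholder: actual constructor args unknown
#         should_use_llama_parse=False,
#         isBubble=False,
#     )
#     serializer = {"should_have_contextual_chunks": True}
#     chunks_passados, is_contextualized = await contextualize_chunk_based_on_serializer(
#         serializer, retriever, pages, all_PDFs_chunks
#     )
#     return chunks_passados, is_contextualized
#
# asyncio.run(exemplo_de_uso())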