Spaces:

luanpoppe
/

vella-backend

Sleeping

vella-backend / _utils /gerar_relatorio_modelo_usuario /GerarDocumento.py

luanpoppe

feat: gerando documento final corretamente

e1d2a79 about 1 month ago

8.15 kB

	import os
	from typing import List, Dict, Tuple, Optional, cast

	from pydantic import SecretStr
	from _utils.LLMs.LLM_class import LLM
	from _utils.vector_stores.Vector_store_class import VectorStore
	from setup.easy_imports import (
	Chroma,
	ChatOpenAI,
	PromptTemplate,
	BM25Okapi,
	Response,
	HuggingFaceEmbeddings,
	)
	import logging
	from _utils.gerar_relatorio_modelo_usuario.DocumentSummarizer_simples import (
	DocumentSummarizer,
	)
	from _utils.models.gerar_relatorio import (
	RetrievalConfig,
	)
	from cohere import Client
	from _utils.splitters.Splitter_class import Splitter


	class GerarDocumento:
	    """Generate the final document from the most relevant chunks of a source text.

	    Chunks are ranked by combining an embedding similarity search with BM25
	    scores through the injected ``reciprocal_rank_fusion`` callable. The text
	    of the top chunks is then joined and sent to an LLM using the
	    ``prompt_gerar_documento`` template.
	    """

	    # API keys are read from the environment once, when the class body runs.
	    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
	    cohere_api_key = os.environ.get("COHERE_API_KEY", "")
	    # No method of this class writes to this attribute at present.
	    resumo_gerado = ""

	    def __init__(
	        self,
	        config: "RetrievalConfig",
	        embedding_model,
	        chunk_size,
	        chunk_overlap,
	        num_k_rerank,
	        model_cohere_rerank,
	        gpt_model,
	        gpt_temperature,
	        prompt_gerar_documento,
	        reciprocal_rank_fusion,
	    ):
	        """Store the generation settings and build the helper objects.

	        Args:
	            config: Retrieval settings; this class reads ``num_chunks``,
	                ``embedding_weight`` and ``bm25_weight`` from it.
	            embedding_model: Model name handed to ``HuggingFaceEmbeddings``
	                and to ``VectorStore``.
	            chunk_size: Forwarded to ``Splitter``.
	            chunk_overlap: Forwarded to ``Splitter``.
	            num_k_rerank: Stored as-is; not read by the methods of this class.
	            model_cohere_rerank: Stored as-is; not read by the methods of this
	                class.
	            gpt_model: Model name used when the OpenAI chat model is selected.
	            gpt_temperature: Temperature used when the OpenAI chat model is
	                selected.
	            prompt_gerar_documento: Prompt template text; it is formatted with
	                a single ``context`` variable.
	            reciprocal_rank_fusion: Callable invoked as
	                ``reciprocal_rank_fusion(result_lists, weights=weights)``. Its
	                result is iterated as ``(chunk_id, score)`` pairs.
	        """
	        self.config = config
	        self.logger = logging.getLogger(__name__)
	        self.gpt_model = gpt_model
	        self.gpt_temperature = gpt_temperature
	        self.prompt_gerar_documento = prompt_gerar_documento
	        self.reciprocal_rank_fusion = reciprocal_rank_fusion

	        # Copies the class-level value onto the instance.
	        self.openai_api_key = self.openai_api_key
	        self.cohere_client = Client(self.cohere_api_key)
	        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
	        self.num_k_rerank = num_k_rerank
	        self.model_cohere_rerank = model_cohere_rerank
	        self.splitter = Splitter(chunk_size, chunk_overlap)

	        self.vector_store = VectorStore(embedding_model)

	    def retrieve_with_rank_fusion(
	        self,
	        vector_store: "Chroma",
	        bm25: "BM25Okapi",
	        chunk_ids: List[str],
	        query: str,
	    ) -> List[Dict]:
	        """Combine embedding and BM25 retrieval results.

	        Args:
	            vector_store: Store queried with ``similarity_search_with_score``;
	                each returned document must carry ``metadata["chunk_id"]``.
	            bm25: Index queried with ``get_scores``; score ``i`` is paired with
	                ``chunk_ids[i]``.
	            chunk_ids: Chunk ids in the same order as the BM25 scores.
	            query: Search text; it is split on whitespace for BM25.

	        Returns:
	            Whatever ``self.reciprocal_rank_fusion`` returns for the two
	            ``(chunk_id, score)`` lists and their configured weights.

	        Raises:
	            Exception: Any error is logged and re-raised unchanged.
	        """
	        try:
	            limit = self.config.num_chunks

	            embedding_results = vector_store.similarity_search_with_score(
	                query, k=limit
	            )
	            # Map each raw score s to 1 / (1 + s), so smaller raw scores rank
	            # higher. NOTE(review): assumes the store returns a distance.
	            embedding_list = [
	                (doc.metadata["chunk_id"], 1 / (1 + score))
	                for doc, score in embedding_results
	            ]

	            bm25_scores = bm25.get_scores(query.split())
	            bm25_list = [
	                (chunk_ids[i], float(score)) for i, score in enumerate(bm25_scores)
	            ]
	            # Keep only the best-scoring entries, highest first.
	            bm25_list = sorted(bm25_list, key=lambda x: x[1], reverse=True)[:limit]

	            # Normalize by the best score. ``default=0`` covers an empty list
	            # (which used to make max() raise), and a best score of 0 falls
	            # back to 1 so the division below can never be by zero.
	            top_score = max((score for _, score in bm25_list), default=0)
	            max_bm25 = top_score if top_score else 1
	            bm25_list = [(doc_id, score / max_bm25) for doc_id, score in bm25_list]

	            result_lists = [embedding_list, bm25_list]
	            weights = [self.config.embedding_weight, self.config.bm25_weight]

	            return self.reciprocal_rank_fusion(result_lists, weights=weights)

	        except Exception as e:
	            self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
	            raise

	    def rank_fusion_get_top_results(
	        self,
	        vector_store: "Chroma",
	        bm25: "BM25Okapi",
	        chunk_ids: List[str],
	        query: str = "Summarize the main points of this document",
	    ) -> Tuple[List[Dict], List[str]]:
	        """Fetch the text and metadata of the best-ranked chunks.

	        Args:
	            vector_store: Store used for ranking and for loading each chunk by
	                its ``chunk_id`` metadata field.
	            bm25: BM25 index used for ranking.
	            chunk_ids: Chunk ids in the same order as the BM25 scores.
	            query: Search text used for ranking.

	        Returns:
	            A ``(sources, contexts)`` tuple. ``contexts`` holds the chunk
	            texts; ``sources`` holds one dict per chunk with the keys
	            ``content``, ``page``, ``chunk_id``, ``relevance_score`` and
	            ``context``. Chunks the store cannot find are skipped.
	        """
	        ranked_results = self.retrieve_with_rank_fusion(
	            vector_store, bm25, chunk_ids, query
	        )

	        contexts = []
	        sources = []

	        for chunk_id, score in ranked_results[: self.config.num_chunks]:
	            results = vector_store.get(
	                where={"chunk_id": chunk_id}, include=["documents", "metadatas"]
	            )

	            if not results["documents"]:
	                continue

	            # Only the first match for a chunk id is used.
	            context = results["documents"][0]
	            metadata = results["metadatas"][0]

	            contexts.append(context)
	            sources.append(
	                {
	                    "content": context,
	                    "page": metadata["page"],
	                    "chunk_id": chunk_id,
	                    "relevance_score": score,
	                    "context": metadata.get("context", ""),
	                }
	            )

	        return sources, contexts

	    def select_model_for_last_requests(self, llm_ultimas_requests: str):
	        """Return the chat model that matches ``llm_ultimas_requests``.

	        Args:
	            llm_ultimas_requests: One of ``"gpt-4o-mini"``, ``"deepseek-chat"``
	                or ``"gemini-2.0-flash"``. For ``"gpt-4o-mini"`` the OpenAI
	                model actually used is ``self.gpt_model``.

	        Returns:
	            The chat model instance for the requested option.

	        Raises:
	            ValueError: If ``llm_ultimas_requests`` is not a supported option.
	        """
	        supported = ("gpt-4o-mini", "deepseek-chat", "gemini-2.0-flash")
	        if llm_ultimas_requests not in supported:
	            # Without this check the method failed with UnboundLocalError.
	            raise ValueError(
	                f"Unsupported llm_ultimas_requests: {llm_ultimas_requests!r}. "
	                f"Expected one of: {', '.join(supported)}"
	            )

	        llm_instance = LLM()
	        if llm_ultimas_requests == "gpt-4o-mini":
	            return ChatOpenAI(
	                temperature=self.gpt_temperature,
	                model=self.gpt_model,
	                api_key=SecretStr(self.openai_api_key),
	            )
	        if llm_ultimas_requests == "deepseek-chat":
	            return llm_instance.deepseek()
	        return llm_instance.google_gemini("gemini-2.0-flash")

	    async def gerar_documento_final(
	        self,
	        vector_store: "Chroma",
	        bm25: "BM25Okapi",
	        chunk_ids: List[str],
	        llm_ultimas_requests: str,
	        query: str = "Summarize the main points of this document",
	    ) -> List[Dict]:
	        """Generate the final document and attach a source chunk to each paragraph.

	        Args:
	            vector_store: Store used to rank and load chunks.
	            bm25: BM25 index used to rank chunks.
	            chunk_ids: Chunk ids in the same order as the BM25 scores.
	            llm_ultimas_requests: Model option passed to
	                ``select_model_for_last_requests``.
	            query: Search text used to pick the chunks.

	        Returns:
	            One dict per non-empty paragraph of the LLM answer, with the keys
	            ``content`` and ``source``. Paragraph ``i`` is paired with source
	            ``i``; paragraphs beyond the last source reuse the last source.

	        Raises:
	            ValueError: If no chunk was retrieved, or the model option is not
	                supported.
	            Exception: Any error is logged and re-raised unchanged.
	        """
	        try:
	            sources, contexts = self.rank_fusion_get_top_results(
	                vector_store, bm25, chunk_ids, query
	            )

	            if not sources:
	                # Fail before calling the LLM: with no sources the pairing
	                # below would index an empty list.
	                raise ValueError(
	                    "No chunks were retrieved for the query; "
	                    "cannot generate the document"
	                )

	            llm = self.select_model_for_last_requests(llm_ultimas_requests)

	            prompt_gerar_documento = PromptTemplate(
	                template=self.prompt_gerar_documento,
	                input_variables=["context"],
	            )

	            # NOTE(review): ``invoke`` is a synchronous call inside a
	            # coroutine - confirm that blocking here is acceptable.
	            documento_gerado = cast(
	                str,
	                llm.invoke(
	                    prompt_gerar_documento.format(context="\n\n".join(contexts))
	                ).content,
	            )

	            # Split the response into non-empty paragraphs.
	            summaries = [
	                p.strip() for p in documento_gerado.split("\n\n") if p.strip()
	            ]

	            last_source_idx = len(sources) - 1
	            structured_output = []
	            for idx, summary in enumerate(summaries):
	                source = sources[min(idx, last_source_idx)]
	                structured_output.append(
	                    {
	                        "content": summary,
	                        "source": {
	                            "page": source["page"],
	                            "text": source["content"][:200] + "...",
	                            "context": source["context"],
	                            "relevance_score": source["relevance_score"],
	                            "chunk_id": source["chunk_id"],
	                        },
	                    }
	                )

	            return structured_output

	        except Exception as e:
	            self.logger.error(f"Error generating enhanced summary: {str(e)}")
	            raise