import os
from typing import List, Dict, Tuple, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
from langchain.prompts import PromptTemplate
from dataclasses import dataclass
import uuid
import json
from anthropic import Anthropic
import numpy as np
from rank_bm25 import BM25Okapi
import logging
from cohere import Client
import requests
from setup.environment import api_url
from rest_framework.response import Response
from langchain.schema import Document

listaContador = []  # currently unused
def reciprocal_rank_fusion(result_lists, weights=None):
    """Fuse multiple ranked lists into a single ranking.

    Note: despite the name, this sums the (weighted) scores already attached
    to each document rather than the classic 1 / (k + rank) formulation.
    """
    fused_scores = {}
    num_lists = len(result_lists)
    if weights is None:
        weights = [1.0] * num_lists

    for i in range(num_lists):
        for doc_id, score in result_lists[i]:
            if doc_id not in fused_scores:
                fused_scores[doc_id] = 0
            fused_scores[doc_id] += weights[i] * score

    # Sort by fused score in descending order
    sorted_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_results
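
# --- Example (sketch): fusing an embedding result list and a BM25 result list
# with equal weights. The ids and scores below are illustrative only.
# embedding_list = [("a", 0.9), ("b", 0.5)]
# bm25_list = [("b", 1.0), ("c", 0.4)]
# reciprocal_rank_fusion([embedding_list, bm25_list], weights=[0.5, 0.5])
# -> [("b", 0.75), ("a", 0.45), ("c", 0.2)]
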
os.environ["LANGCHAIN_TRACING_V2"] = "true" | |
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" | |
os.environ.get("LANGCHAIN_API_KEY") | |
os.environ["LANGCHAIN_PROJECT"] = "VELLA" | |
@dataclass
class DocumentChunk:
    content: str
    page_number: int
    chunk_id: str
    start_char: int
    end_char: int


@dataclass
class RetrievalConfig:
    num_chunks: int = 5
    embedding_weight: float = 0.5
    bm25_weight: float = 0.5
    context_window: int = 3
    chunk_overlap: int = 200
    chunk_size: int = 1000


@dataclass
class ContextualizedChunk(DocumentChunk):
    context: str = ""
    embedding: Optional[np.ndarray] = None
    bm25_score: Optional[float] = None
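
# --- Example (sketch): the dataclasses above carry chunk state through the
# pipeline; the values below are illustrative only.
# config = RetrievalConfig(num_chunks=10, embedding_weight=0.6, bm25_weight=0.4)
# chunk = DocumentChunk(
#     content="some text", page_number=1, chunk_id=str(uuid.uuid4()),
#     start_char=0, end_char=9,
# )
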
class DocumentSummarizer:
    def __init__(
        self,
        openai_api_key: str,
        cohere_api_key: str,
        embedding_model,
        chunk_size,
        chunk_overlap,
        num_k_rerank,
        model_cohere_rerank,
    ):
        self.openai_api_key = openai_api_key
        self.cohere_client = Client(cohere_api_key)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing
        self.num_k_rerank = num_k_rerank
        self.model_cohere_rerank = model_cohere_rerank
    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load PDF and split into chunks with metadata"""
        loader = PyPDFLoader(pdf_path)
        # Produces a list of Document objects, one per full page of the PDF.
        pages = loader.load()
        chunks = []
        char_count = 0

        for page in pages:
            text = page.page_content
            # Break the single-page Document into chunks smaller than a page.
            page_chunks = self.text_splitter.split_text(text)

            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                # Position of the chunk within the full page text
                start_char = text.find(chunk)
                end_char = start_char + len(chunk)

                # Build the chunk object with extra metadata (position and id)
                doc_chunk = DocumentChunk(
                    content=chunk,
                    page_number=page.metadata.get("page", 0) + 1,  # 1-based page numbering
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)

                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    "page": doc_chunk.page_number,
                    "start_char": doc_chunk.start_char,
                    "end_char": doc_chunk.end_char,
                }

            char_count += len(text)

        return chunks
    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
        """Load raw text and split into chunks with metadata - created only for the ragas tests"""
        page = Document(page_content=text, metadata={"page": 1})
        chunks = []
        char_count = 0

        text = page.page_content
        # Break the single Document into chunks smaller than a page.
        page_chunks = self.text_splitter.split_text(text)
        print("\n\n\n")
        print("page_chunks: ", page_chunks)

        for chunk in page_chunks:
            chunk_id = str(uuid.uuid4())
            # Position of the chunk within the full text
            start_char = text.find(chunk)
            end_char = start_char + len(chunk)

            # Build the chunk object with extra metadata (position and id)
            doc_chunk = DocumentChunk(
                content=chunk,
                page_number=page.metadata.get("page", 0) + 1,  # 1-based page numbering
                chunk_id=chunk_id,
                start_char=char_count + start_char,
                end_char=char_count + end_char,
            )
            chunks.append(doc_chunk)

            # Store metadata for later retrieval
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }

        char_count += len(text)
        return chunks
    def create_vector_store(
        self, chunks: List[DocumentChunk]
    ) -> Chroma:  # This function is currently never used
        """Create vector store with metadata"""
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]

        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store
    def rerank_chunks(  # This function is currently never used
        self, chunks: List[Dict], query: str, k: int = 5
    ) -> List[Dict]:
        """
        Rerank chunks using Cohere's reranking model.

        Args:
            chunks: List of dictionaries containing chunks and their metadata
            query: Original search query
            k: Number of top chunks to return

        Returns:
            List of reranked chunks with updated relevance scores
        """
        try:
            # Prepare documents for reranking
            documents = [chunk["content"] for chunk in chunks]

            # Get reranking scores from Cohere. Note: iterating the response
            # directly matches the cohere v4 SDK; newer SDK versions return
            # the hits under results.results instead.
            results = self.cohere_client.rerank(
                query=query,
                documents=documents,
                top_n=k,
                model=self.model_cohere_rerank,
            )

            # Create reranked results with original metadata
            reranked_chunks = []
            for hit in results:
                original_chunk = chunks[hit.index]
                reranked_chunks.append(
                    {**original_chunk, "relevance_score": hit.relevance_score}
                )
            return reranked_chunks
        except Exception as e:
            logging.error(f"Reranking failed: {str(e)}")
            return chunks[:k]  # Fallback to original ordering
    def generate_summary_with_sources(  # This function is currently never used
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations using reranking"""
        # Retrieve more initial chunks than needed so the reranker has options
        relevant_docs = vector_store.similarity_search_with_score(query, k=20)

        # Prepare chunks for reranking
        chunks = []
        for doc, score in relevant_docs:
            chunks.append(
                {
                    "content": doc.page_content,
                    "page": doc.metadata["page"],
                    "chunk_id": doc.metadata["chunk_id"],
                    "relevance_score": score,
                }
            )

        # Rerank chunks
        reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)

        # Prepare context and sources from reranked chunks
        contexts = []
        sources = []
        for chunk in reranked_chunks:
            contexts.append(chunk["content"])
            sources.append(
                {
                    "content": chunk["content"],
                    "page": chunk["page"],
                    "chunk_id": chunk["chunk_id"],
                    "relevance_score": chunk["relevance_score"],
                }
            )

        prompt_template = """
        Based on the following context, provide multiple key points from the document.
        For each point, create a new paragraph.
        Each paragraph should be a complete, self-contained insight.

        Context: {context}

        Key points:
        """
        prompt = PromptTemplate(template=prompt_template, input_variables=["context"])
        llm = ChatOpenAI(
            temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
        )
        response = llm.predict(prompt.format(context="\n\n".join(contexts)))

        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        # Create structured output
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Associate each summary with the most relevant source
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": source["page"],
                        "text": source["content"][:200] + "...",
                        "relevance_score": source["relevance_score"],
                    },
                }
            )
        return structured_output
    def get_source_context(
        self, chunk_id: str, window: int = 100
    ) -> Optional[Dict]:  # This function is currently never used
        """Get extended context around a specific chunk"""
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None

        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }
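
# --- Example (sketch): standalone use of DocumentSummarizer. The model names
# and parameter values below are illustrative assumptions, not project defaults.
# summarizer = DocumentSummarizer(
#     openai_api_key=os.environ["OPENAI_API_KEY"],
#     cohere_api_key=os.environ["COHERE_API_KEY"],
#     embedding_model="sentence-transformers/all-MiniLM-L6-v2",
#     chunk_size=1000,
#     chunk_overlap=200,
#     num_k_rerank=5,
#     model_cohere_rerank="rerank-multilingual-v3.0",
# )
# chunks = summarizer.load_and_split_document("example.pdf")
# store = summarizer.create_vector_store(chunks)
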
class ContextualRetriever:
    def __init__(
        self, config: RetrievalConfig, claude_api_key: str, claude_context_model
    ):
        # self.config is currently unused inside this class; revisit whether it
        # should drive the page window used below.
        self.config = config
        self.claude_client = Anthropic(api_key=claude_api_key)
        self.logger = logging.getLogger(__name__)
        self.bm25 = None
        self.claude_context_model = claude_context_model
    def generate_context(self, full_text: str, chunk: DocumentChunk) -> str:
        """Generate contextual description using Claude"""
        try:
            # prompt = f"""<document>
            # {full_text}
            # </document>
            # Here is the chunk we want to situate within the whole document
            # <chunk>
            # {chunk.content}
            # </chunk>
            # Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""

            prompt = f"""You are a language model tasked with providing context to improve the retrieval of information from a chunk extracted from a document. Follow these steps internally (do not display reasoning or reflection in the final output):

1. **Chain of Thought (internal)**:
   - Identify the document ID, which is the value between "NUM." and "- Pág".
   - Identify the document name from the header.

2. **Reflection (internal)**:
   - Confirm the document ID and name are correctly identified.
   - Ensure the final context is concise and helpful.

3. **Final Response**:
   - Provide a short context situating the *chunk* within the document, including the document ID and document name.
   - Do not include any reasoning or reflection in your response.

**Example Usage:**

```
<document> {full_text} </document>
<chunk> {chunk.content} </chunk>
Please return only the succinct context (without displaying your internal reasoning), including the document ID and the document name.
```
"""
            response = self.claude_client.messages.create(
                model=self.claude_context_model,
                max_tokens=100,
                messages=[{"role": "user", "content": prompt}],
            )
            # response.content is a list of content blocks; the model's reply
            # text lives in the first block.
            return response.content[0].text
        except Exception as e:
            self.logger.error(
                f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
            )
            return ""
    def contextualize_chunks(
        self, full_text: List[Document], chunks: List[DocumentChunk]
    ) -> List[ContextualizedChunk]:
        """Add context to every chunk.

        For each chunk, builds a small window of neighbouring pages and asks
        the Claude model in generate_context above to describe the chunk's
        context, then returns the chunk with that description attached.
        """
        contextualized_chunks = []
        print("\n\n")
        print("len(chunks): ", len(chunks))
        for chunk in chunks:
            # Window of previous/current/next page around the chunk.
            # page_number is 1-based, so the list index of the chunk's own page
            # is page_number - 1. The bounds check keeps chunks on the first or
            # last page from raising IndexError, and the window is rebuilt for
            # every chunk (previously it accumulated across iterations).
            smaller_context = ""
            page_index = chunk.page_number - 1
            for offset in (-1, 0, 1):
                neighbour = page_index + offset
                if 0 <= neighbour < len(full_text):
                    smaller_context += full_text[neighbour].page_content
            print("chunk.page_number: ", chunk.page_number)
            context = self.generate_context(smaller_context, chunk)
            contextualized_chunk = ContextualizedChunk(
                content=chunk.content,
                page_number=chunk.page_number,
                chunk_id=chunk.chunk_id,
                start_char=chunk.start_char,
                end_char=chunk.end_char,
                context=context,
            )
            contextualized_chunks.append(contextualized_chunk)
        return contextualized_chunks
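
# --- Example (sketch): contextualizing chunks outside the full pipeline.
# The Claude model name is an illustrative assumption, not a project default.
# retriever = ContextualRetriever(
#     config=RetrievalConfig(),
#     claude_api_key=os.environ["CLAUDE_API_KEY"],
#     claude_context_model="claude-3-haiku-20240307",
# )
# pages = PyPDFLoader("example.pdf").load()
# chunks = [...]  # e.g. from DocumentSummarizer.load_and_split_document
# enriched = retriever.contextualize_chunks(pages, chunks)
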
class EnhancedDocumentSummarizer(DocumentSummarizer):
    def __init__(
        self,
        openai_api_key: str,
        claude_api_key: str,
        config: RetrievalConfig,
        embedding_model,
        chunk_size,
        chunk_overlap,
        num_k_rerank,
        model_cohere_rerank,
        claude_context_model,
        prompt_relatorio,
        gpt_model,
        gpt_temperature,
        id_modelo_do_usuario,
        prompt_modelo,
    ):
        super().__init__(
            openai_api_key,
            os.environ.get("COHERE_API_KEY"),
            embedding_model,
            chunk_size,
            chunk_overlap,
            num_k_rerank,
            model_cohere_rerank,
        )
        self.config = config
        self.contextual_retriever = ContextualRetriever(
            config, claude_api_key, claude_context_model
        )
        self.logger = logging.getLogger(__name__)
        self.prompt_relatorio = prompt_relatorio
        self.gpt_model = gpt_model
        self.gpt_temperature = gpt_temperature
        self.id_modelo_do_usuario = id_modelo_do_usuario
        self.prompt_modelo = prompt_modelo
    def create_enhanced_vector_store(
        self, chunks: List[ContextualizedChunk]
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create vector store and BM25 index with contextualized chunks"""
        try:
            # Prepend the generated context to each chunk so both the dense
            # embeddings and the BM25 index see the contextualized text
            texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]

            # Create vector store
            metadatas = [
                {
                    "chunk_id": chunk.chunk_id,
                    "page": chunk.page_number,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char,
                    "context": chunk.context,
                }
                for chunk in chunks
            ]

            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Create BM25 index
            tokenized_texts = [text.split() for text in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Get chunk IDs in order
            chunk_ids = [chunk.chunk_id for chunk in chunks]

            return vector_store, bm25, chunk_ids
        except Exception as e:
            self.logger.error(f"Error creating enhanced vector store: {str(e)}")
            raise
    def retrieve_with_rank_fusion(
        self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
    ) -> List[Dict]:
        """Combine embedding and BM25 retrieval results"""
        try:
            # Get embedding results
            embedding_results = vector_store.similarity_search_with_score(
                query, k=self.config.num_chunks
            )

            # Chroma returns distances (lower is closer), so 1 / (1 + distance)
            # maps each result to a similarity-like score in (0, 1]
            embedding_list = [
                (doc.metadata["chunk_id"], 1 / (1 + score))
                for doc, score in embedding_results
            ]

            # Get BM25 results
            tokenized_query = query.split()
            bm25_scores = bm25.get_scores(tokenized_query)

            # Convert BM25 scores to a list of (chunk_id, score)
            bm25_list = [
                (chunk_ids[i], float(score)) for i, score in enumerate(bm25_scores)
            ]

            # Sort bm25_list by score in descending order and keep the top N
            bm25_list = sorted(bm25_list, key=lambda x: x[1], reverse=True)[
                : self.config.num_chunks
            ]

            # Normalize BM25 scores so both lists live on comparable scales
            max_bm25 = max((score for _, score in bm25_list), default=0)
            if max_bm25 <= 0:
                max_bm25 = 1  # avoid dividing by zero when no term matches
            bm25_list = [(doc_id, score / max_bm25) for doc_id, score in bm25_list]

            # Pass both lists to rank fusion
            result_lists = [embedding_list, bm25_list]
            weights = [self.config.embedding_weight, self.config.bm25_weight]

            combined_results = reciprocal_rank_fusion(result_lists, weights=weights)
            return combined_results
        except Exception as e:
            self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
            raise
    def generate_enhanced_summary(
        self,
        vector_store: Chroma,
        bm25: BM25Okapi,
        chunk_ids: List[str],
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate enhanced summary using both vector and BM25 retrieval"""
        try:
            # Get combined results using rank fusion
            ranked_results = self.retrieve_with_rank_fusion(
                vector_store, bm25, chunk_ids, query
            )

            # Prepare context and track sources
            contexts = []
            sources = []

            # Get full documents for top results
            for chunk_id, score in ranked_results[: self.config.num_chunks]:
                results = vector_store.get(
                    where={"chunk_id": chunk_id}, include=["documents", "metadatas"]
                )

                if results["documents"]:
                    context = results["documents"][0]
                    metadata = results["metadatas"][0]

                    contexts.append(context)
                    sources.append(
                        {
                            "content": context,
                            "page": metadata["page"],
                            "chunk_id": chunk_id,
                            "relevance_score": score,
                            "context": metadata.get("context", ""),
                        }
                    )

            # Fetch the user's template ("modelo") from the backend
            url_request = f"{api_url}/modelo/{self.id_modelo_do_usuario}"
            resposta = requests.get(url_request)

            if resposta.status_code != 200:
                # Note: this returns a DRF Response from a summarizer method,
                # so callers must check the return type.
                return Response(
                    {
                        "error": "Ocorreu um problema. Pode ser que o modelo não tenha sido encontrado. Tente novamente e/ou entre em contato com a equipe técnica"
                    }
                )
            modelo_buscado = resposta.json()["modelo"]

            llm = ChatOpenAI(
                temperature=self.gpt_temperature,
                model_name=self.gpt_model,
                api_key=self.openai_api_key,
            )

            # First pass: generate the report from the retrieved contexts
            prompt_gerar_relatorio = PromptTemplate(
                template=self.prompt_relatorio, input_variables=["context"]
            )
            relatorio_gerado = llm.predict(
                prompt_gerar_relatorio.format(context="\n\n".join(contexts))
            )

            # Second pass: rewrite the report using the user's template
            prompt_gerar_modelo = PromptTemplate(
                template=self.prompt_modelo,
                input_variables=["context", "modelo_usuario"],
            )
            modelo_gerado = llm.predict(
                prompt_gerar_modelo.format(
                    context=relatorio_gerado, modelo_usuario=modelo_buscado
                )
            )

            # Split the response into paragraphs
            summaries = [p.strip() for p in modelo_gerado.split("\n\n") if p.strip()]

            # Create structured output
            structured_output = []
            for idx, summary in enumerate(summaries):
                source_idx = min(idx, len(sources) - 1)
                structured_output.append(
                    {
                        "content": summary,
                        "source": {
                            "page": sources[source_idx]["page"],
                            "text": sources[source_idx]["content"][:200] + "...",
                            "context": sources[source_idx]["context"],
                            "relevance_score": sources[source_idx]["relevance_score"],
                            "chunk_id": sources[source_idx]["chunk_id"],
                        },
                    }
                )
            return structured_output
        except Exception as e:
            self.logger.error(f"Error generating enhanced summary: {str(e)}")
            raise
async def get_llm_summary_answer_by_cursor_complete(
    serializer, listaPDFs=None, contexto=None
):
    """The "contexto" parameter should only be passed when testing with ragas, i.e. when you don't want to pass PDFs."""
    allPdfsChunks = []

    # Configuration
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )

    # Initialize enhanced summarizer
    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
    )
full_text = "" | |
if contexto: | |
full_text = contexto | |
chunks = summarizer.load_and_split_text(full_text) | |
allPdfsChunks = chunks | |
else: | |
# # Load and process document | |
# pdf_path = "./Im_a_storyteller.pdf" | |
# chunks = summarizer.load_and_split_document(pdf_path) | |
# Load and process document | |
for pdf in listaPDFs: | |
pdf_path = pdf | |
chunks = summarizer.load_and_split_document(pdf_path) | |
allPdfsChunks = allPdfsChunks + chunks | |
# Get full text for contextualization | |
loader = PyPDFLoader(pdf_path) | |
pages = loader.load() | |
full_text = " ".join([page.page_content for page in pages]) | |
# Contextualize chunks | |
contextualized_chunks = await summarizer.contextual_retriever.contextualize_chunks( | |
pages, allPdfsChunks | |
) | |
    # Create enhanced vector store and BM25 index
    vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
        contextualized_chunks
    )

    # Generate enhanced summary
    structured_summaries = summarizer.generate_enhanced_summary(
        vector_store, bm25, chunk_ids, serializer["user_message"]
    )

    # Output results as JSON
    json_output = json.dumps(structured_summaries, indent=2)
    print("\nStructured Summaries:")
    print(json_output)
    texto_completo = ""
    for x in structured_summaries:
        texto_completo = texto_completo + x["content"]
    return {
        "resultado": structured_summaries,
        "texto_completo": texto_completo,
        "parametros-utilizados": {
            "num_chunks_retrieval": serializer["num_chunks_retrieval"],
            "embedding_weight": serializer["embedding_weight"],
            "bm25_weight": serializer["bm25_weight"],
            "context_window": serializer["context_window"],
            "chunk_overlap": serializer["chunk_overlap"],
            "num_k_rerank": serializer["num_k_rerank"],
            "model_cohere_rerank": serializer["model_cohere_rerank"],
            "more_initial_chunks_for_reranking": serializer[
                "more_initial_chunks_for_reranking"
            ],
            "claude_context_model": serializer["claude_context_model"],
            "gpt_temperature": serializer["gpt_temperature"],
            "user_message": serializer["user_message"],
            "model": serializer["model"],
            "hf_embedding": serializer["hf_embedding"],
            "chunk_size": serializer["chunk_size"],
            "prompt_relatorio": serializer["prompt_relatorio"],
            "prompt_modelo": serializer["prompt_modelo"],
        },
    }
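
# --- Example (sketch): invoking the pipeline from an async context. The keys
# mirror the ones read above; the values are illustrative placeholders only.
# import asyncio
# payload = {
#     "num_chunks_retrieval": 5, "embedding_weight": 0.5, "bm25_weight": 0.5,
#     "context_window": 3, "chunk_overlap": 200, "chunk_size": 1000,
#     "num_k_rerank": 5, "model_cohere_rerank": "rerank-multilingual-v3.0",
#     "more_initial_chunks_for_reranking": 20,
#     "claude_context_model": "claude-3-haiku-20240307",
#     "prompt_relatorio": "...", "prompt_modelo": "...",
#     "model": "gpt-4o-mini", "gpt_temperature": 0,
#     "id_modelo_do_usuario": 1, "user_message": "Summarize the case.",
#     "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
# }
# result = asyncio.run(
#     get_llm_summary_answer_by_cursor_complete(payload, listaPDFs=["example.pdf"])
# )
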
from ragas import evaluate
from langchain.chains import SequentialChain
from langchain.prompts import PromptTemplate

# from langchain.schema import ChainResult
from langchain.memory import SimpleMemory
def test_ragas(serializer, listaPDFs):
    # Step 1: Set up RetrievalConfig and EnhancedDocumentSummarizer
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )

    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
    )
    # Step 2: Define the pipeline components
    def load_and_split_documents(pdf_list, summarizer):
        """Loads and splits PDF documents into chunks."""
        all_chunks = []
        for pdf_path in pdf_list:
            chunks = summarizer.load_and_split_document(pdf_path)
            all_chunks.extend(chunks)
        return {"chunks": all_chunks}

    def get_full_text_from_pdfs(pdf_list):
        """Gets the full text from PDFs for contextualization."""
        full_text = []
        for pdf_path in pdf_list:
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            text = " ".join([page.page_content for page in pages])
            full_text.append(text)
        return {"full_text": " ".join(full_text)}

    def contextualize_chunks(full_text, chunks, contextual_retriever):
        """Adds context to chunks using Claude."""
        contextualized_chunks = contextual_retriever.contextualize_chunks(
            full_text, chunks
        )
        return {"contextualized_chunks": contextualized_chunks}

    def create_vector_store(contextualized_chunks, summarizer):
        """Creates an enhanced vector store and BM25 index."""
        vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
            contextualized_chunks
        )
        return {"vector_store": vector_store, "bm25": bm25, "chunk_ids": chunk_ids}

    def generate_summary(vector_store, bm25, chunk_ids, query, summarizer):
        """Generates an enhanced summary using the vector store and BM25 index."""
        structured_summaries = summarizer.generate_enhanced_summary(
            vector_store, bm25, chunk_ids, query
        )
        return {"structured_summaries": structured_summaries}
    # Step 3: Define the sequential chain. Note: this is a work-in-progress
    # sketch that does not run as-is - SequentialChain expects Chain instances,
    # not plain lambdas.
    chain = SequentialChain(
        chains=[
            lambda inputs: load_and_split_documents(inputs["pdf_list"], summarizer),
            lambda inputs: get_full_text_from_pdfs(inputs["pdf_list"]),
            lambda inputs: contextualize_chunks(
                inputs["full_text"], inputs["chunks"], summarizer.contextual_retriever
            ),
            lambda inputs: create_vector_store(
                inputs["contextualized_chunks"], summarizer
            ),
            lambda inputs: generate_summary(
                inputs["vector_store"],
                inputs["bm25"],
                inputs["chunk_ids"],
                inputs["user_message"],
                summarizer,
            ),
        ],
        input_variables=["pdf_list", "user_message"],
        output_variables=["structured_summaries"],
    )
    from ragas.langchain.evalchain import RagasEvaluatorChain
    from ragas.metrics import (
        LLMContextRecall,
        Faithfulness,
        FactualCorrectness,
        SemanticSimilarity,
    )
    from ragas import evaluate
    from ragas.llms import LangchainLLMWrapper

    # from ragas.embeddings import LangchainEmbeddingsWrapper
    # evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

    # Note: LangchainLLMWrapper is meant to wrap an LLM, not a chain; wrapping
    # the SequentialChain above is experimental and likely needs rework.
    evaluator_llm = LangchainLLMWrapper(chain)
    # evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
    from datasets import load_dataset

    dataset = load_dataset(
        "explodinggradients/amnesty_qa", "english_v3", trust_remote_code=True
    )

    from ragas import EvaluationDataset

    eval_dataset = EvaluationDataset.from_hf_dataset(dataset["eval"])

    metrics = [
        LLMContextRecall(llm=evaluator_llm),
        FactualCorrectness(llm=evaluator_llm),
        Faithfulness(llm=evaluator_llm),
        # SemanticSimilarity(embeddings=evaluator_embeddings)
    ]
    results = evaluate(dataset=eval_dataset, metrics=metrics)
    print("results: ", results)
    # Step 4: Run the chain (currently disabled while the ragas evaluation
    # above is being tested)
    inputs = {
        "pdf_list": listaPDFs,
        "user_message": serializer["user_message"],
    }
    # result = chain.run(inputs)
    return Response({"msg": results})
    # Step 5: Format the output
    # return {
    #     "resultado": result["structured_summaries"],
    #     "parametros-utilizados": {
    #         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
    #         "embedding_weight": serializer["embedding_weight"],
    #         "bm25_weight": serializer["bm25_weight"],
    #         "context_window": serializer["context_window"],
    #         "chunk_overlap": serializer["chunk_overlap"],
    #         "num_k_rerank": serializer["num_k_rerank"],
    #         "model_cohere_rerank": serializer["model_cohere_rerank"],
    #         "more_initial_chunks_for_reranking": serializer["more_initial_chunks_for_reranking"],
    #         "claude_context_model": serializer["claude_context_model"],
    #         "gpt_temperature": serializer["gpt_temperature"],
    #         "user_message": serializer["user_message"],
    #         "model": serializer["model"],
    #         "hf_embedding": serializer["hf_embedding"],
    #         "chunk_size": serializer["chunk_size"],
    #         "prompt_relatorio": serializer["prompt_relatorio"],
    #         "prompt_modelo": serializer["prompt_modelo"],
    #     },
    # }