import os
from typing import List, Dict

import gradio as gr
import numpy as np
from datasets import Dataset, load_dataset
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from langchain_community.vectorstores import FAISS, Chroma, Qdrant
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain.memory import ConversationBufferMemory
from sentence_transformers import SentenceTransformer, util
import torch
from ragas import evaluate
from ragas.metrics import (
    ContextRecall,
    AnswerRelevancy,
    Faithfulness,
    ContextPrecision
)
import pandas as pd

# Constants and setup
list_llm = ["meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2"]
list_llm_simple = [os.path.basename(llm) for llm in list_llm]
api_token = os.getenv("HF_TOKEN")

CHUNK_SIZES = {
    "small": {"recursive": 512, "fixed": 512, "token": 256},
    "medium": {"recursive": 1024, "fixed": 1024, "token": 512}
}

# Initialize sentence transformer for evaluation
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


class RAGEvaluator:
    """Load a small QA sample set and score a RAG chain on it with RAGAS."""

    def __init__(self):
        # Short dataset key -> dataset name on the Hugging Face Hub.
        self.datasets = {
            "squad": "squad_v2",
            "msmarco": "ms_marco"
        }
        self.current_dataset = None
        self.test_samples = []

    @staticmethod
    def _spaced_rows(dataset, num_samples: int):
        """Return up to ``num_samples`` rows, taking every 100th of the first 1000.

        The result is itself a ``Dataset``, so iterating it yields one dict per
        row. Slicing a ``Dataset`` with ``[:n]`` would instead return a dict of
        columns, and iterating that yields column names rather than rows.
        """
        spaced = dataset.select(range(0, min(1000, len(dataset)), 100))
        keep = min(max(num_samples, 0), len(spaced))
        return spaced.select(range(keep))

    def load_dataset(self, dataset_name: str, num_samples: int = 5):
        """Load a small subset of questions from a supported dataset.

        Args:
            dataset_name: ``"squad"`` or ``"msmarco"``. For any other value no
                samples are loaded: ``test_samples`` keeps its previous content
                while ``current_dataset`` is still updated.
            num_samples: Maximum number of spaced-out rows to consider. Rows
                without an answer are dropped afterwards, so fewer samples may
                be returned.

        Returns:
            The list stored in ``self.test_samples``; each item has the keys
            ``"question"``, ``"ground_truth"`` and ``"context"``.
        """
        if dataset_name == "squad":
            dataset = load_dataset("squad_v2", split="validation")
            rows = self._spaced_rows(dataset, num_samples)
            self.test_samples = [
                {
                    "question": row["question"],
                    "ground_truth": row["answers"]["text"][0],
                    "context": row["context"],
                }
                for row in rows
                if row["answers"]["text"]  # skip rows with no answer text
            ]
        elif dataset_name == "msmarco":
            dataset = load_dataset("ms_marco", "v2.1", split="train")
            rows = self._spaced_rows(dataset, num_samples)
            self.test_samples = [
                {
                    "question": row["query"],
                    "ground_truth": row["answers"][0],
                    "context": row["passages"]["passage_text"][0],
                }
                for row in rows
                if row["answers"]
            ]

        self.current_dataset = dataset_name
        return self.test_samples

    def evaluate_configuration(self, vector_db, qa_chain, splitting_strategy: str, chunk_size: str) -> Dict:
        """Run every loaded sample through ``qa_chain`` and compute RAGAS metrics.

        Args:
            vector_db: Not used by this method; kept so existing callers work.
            qa_chain: Object whose ``invoke`` returns a mapping with ``"answer"``
                and ``"source_documents"`` entries.
            splitting_strategy: Splitting strategy name, used only for the label.
            chunk_size: Chunk-size name, used only for the label.

        Returns:
            A dict with the configuration label, the number of questions
            evaluated, the four metric scores and their mean. If no dataset is
            loaded, no question could be answered, or scoring fails, the dict
            carries an ``"error"`` entry instead of the scores.
        """
        if not self.test_samples:
            return {"error": "No dataset loaded"}

        configuration = f"{splitting_strategy}_{chunk_size}"
        results = []
        total_questions = len(self.test_samples)

        # NOTE(review): if qa_chain carries conversation memory, the empty
        # chat_history passed below may be overridden by the stored history,
        # so questions might not be evaluated independently - confirm.
        for i, sample in enumerate(self.test_samples, start=1):
            print(f"Evaluating question {i}/{total_questions}")
            try:
                response = qa_chain.invoke({
                    "question": sample["question"],
                    "chat_history": []
                })
                results.append({
                    "question": sample["question"],
                    "answer": response["answer"],
                    "contexts": [doc.page_content for doc in response["source_documents"]],
                    "ground_truths": [sample["ground_truth"]]
                })
            except Exception as e:
                # Best effort: one failing question must not abort the run.
                print(f"Error processing question {i}: {str(e)}")
                continue

        if not results:
            # Nothing to score; avoid handing RAGAS an empty dataset.
            return {
                "configuration": configuration,
                "error": "No questions could be answered",
                "questions_evaluated": 0
            }

        # Calculate RAGAS metrics
        metric_names = ("context_recall", "answer_relevancy", "faithfulness", "context_precision")
        metrics = [ContextRecall(), AnswerRelevancy(), Faithfulness(), ContextPrecision()]
        try:
            eval_dataset = Dataset.from_list(results)
            scores = evaluate(eval_dataset, metrics=metrics)
            metric_values = {name: float(scores[name]) for name in metric_names}
            return {
                "configuration": configuration,
                "questions_evaluated": len(results),
                **metric_values,
                "average_score": float(np.mean(list(metric_values.values())))
            }
        except Exception as e:
            return {
                "configuration": configuration,
                "error": str(e),
                "questions_evaluated": len(results)
            }


# Text splitting and database
# functions
def get_text_splitter(strategy: str, chunk_size: int = 1024, chunk_overlap: int = 64):
    """Build the text splitter for the given strategy.

    Only the requested splitter is constructed, so the constructor of an
    unused strategy never runs and cannot fail the call.

    Args:
        strategy: One of ``"recursive"``, ``"fixed"`` or ``"token"``.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The configured splitter, or ``None`` when ``strategy`` is unknown.
    """
    splitter_classes = {
        "recursive": RecursiveCharacterTextSplitter,
        "fixed": CharacterTextSplitter,
        "token": TokenTextSplitter,
    }
    splitter_cls = splitter_classes.get(strategy)
    if splitter_cls is None:
        return None
    return splitter_cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)


def load_doc(list_file_path: List[str], splitting_strategy: str, chunk_size: str):
    """Load the given PDF files and split their pages into chunks.

    Args:
        list_file_path: Paths of the PDF files to load.
        splitting_strategy: Strategy key, looked up in ``CHUNK_SIZES`` and
            passed to ``get_text_splitter``.
        chunk_size: Size key in ``CHUNK_SIZES`` (``"small"`` or ``"medium"``).

    Returns:
        The document chunks produced by the splitter.

    Raises:
        KeyError: If ``chunk_size`` or ``splitting_strategy`` is not a key of
            ``CHUNK_SIZES``.
    """
    chunk_size_value = CHUNK_SIZES[chunk_size][splitting_strategy]

    pages = []
    for path in list_file_path:
        pages.extend(PyPDFLoader(path).load())

    text_splitter = get_text_splitter(splitting_strategy, chunk_size_value)
    return text_splitter.split_documents(pages)


def create_db(splits, db_choice: str = "faiss"):
    """Embed ``splits`` and store them in the chosen vector database.

    Args:
        splits: Document chunks to index.
        db_choice: ``"faiss"``, ``"chroma"`` or ``"qdrant"``.

    Returns:
        The populated vector store.

    Raises:
        KeyError: If ``db_choice`` is not one of the supported backends.
    """
    db_builders = {
        "faiss": lambda emb: FAISS.from_documents(splits, emb),
        "chroma": lambda emb: Chroma.from_documents(splits, emb),
        "qdrant": lambda emb: Qdrant.from_documents(
            splits,
            emb,
            location=":memory:",
            collection_name="pdf_docs"
        ),
    }
    # Resolve the backend first so an invalid choice fails before the
    # embedding model is created.
    build = db_builders[db_choice]
    return build(HuggingFaceEmbeddings())


def initialize_database(list_file_obj, splitting_strategy, chunk_size, db_choice, progress=gr.Progress()):
    """Create a vector database from uploaded files.

    Args:
        list_file_obj: Uploaded file objects (anything exposing ``.name``) or
            plain path strings; may be ``None`` or contain ``None`` entries.
        splitting_strategy: Strategy key forwarded to ``load_doc``.
        chunk_size: Chunk-size key forwarded to ``load_doc``.
        db_choice: Backend key forwarded to ``create_db``.
        progress: Gradio progress tracker; not used by this function.

    Returns:
        ``(vector_db, status_message)``. ``vector_db`` is ``None`` when no
        file was provided or no text chunk could be produced.
    """
    list_file_path = [
        getattr(file_obj, "name", file_obj)
        for file_obj in (list_file_obj or [])
        if file_obj is not None
    ]
    if not list_file_path:
        return None, "No documents provided. Please upload at least one PDF file."

    doc_splits = load_doc(list_file_path, splitting_strategy, chunk_size)
    if not doc_splits:
        # There is nothing to index when the documents yielded no chunks.
        return None, "No text could be extracted from the uploaded documents."

    vector_db = create_db(doc_splits, db_choice)
    return vector_db, f"Database created using {splitting_strategy} splitting and {db_choice} vector database!"
def initialize_llmchain(llm_choice, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()): llm_model = list_llm[llm_choice] llm = HuggingFaceEndpoint( repo_id=llm_model, huggingfacehub_api_token=api_token, temperature=temperature, max_new_tokens=max_tokens, top_k=top_k ) memory = ConversationBufferMemory( memory_key="chat_history", output_key='answer', return_messages=True ) retriever = vector_db.as_retriever() qa_chain = ConversationalRetrievalChain.from_llm( llm, retriever=retriever, memory=memory, return_source_documents=True ) return qa_chain, "LLM initialized successfully!" def conversation(qa_chain, message, history): """Fixed conversation function returning all required outputs""" response = qa_chain.invoke({ "question": message, "chat_history": [(hist[0], hist[1]) for hist in history] }) response_answer = response["answer"] if "Helpful Answer:" in response_answer: response_answer = response_answer.split("Helpful Answer:")[-1] # Get source documents, ensure we have exactly 3 sources = response["source_documents"][:3] source_contents = [] source_pages = [] # Process available sources for source in sources: source_contents.append(source.page_content.strip()) source_pages.append(source.metadata.get("page", 0) + 1) # Pad with empty values if we have fewer than 3 sources while len(source_contents) < 3: source_contents.append("") source_pages.append(0) # Return all required outputs in correct order return ( qa_chain, # State gr.update(value=""), # Clear message box history + [(message, response_answer)], # Updated chat history source_contents[0], # First source source_pages[0], # First page source_contents[1], # Second source source_pages[1], # Second page source_contents[2], # Third source source_pages[2] # Third page ) def demo(): evaluator = RAGEvaluator() with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink", neutral_hue="sky")) as demo: vector_db = gr.State() qa_chain = gr.State() gr.HTML("