import gradio as gr
import os
from typing import List, Dict
import numpy as np
from datasets import load_dataset, Dataset
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from langchain_community.vectorstores import FAISS, Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain.memory import ConversationBufferMemory
from sentence_transformers import SentenceTransformer, util
import torch
from ragas import evaluate
from ragas.metrics import (
    ContextRecall,
    AnswerRelevancy,
    Faithfulness,
    ContextPrecision
)
import pandas as pd

# Constants and configurations
CHUNK_SIZES = {
    "small": {"recursive": 512, "fixed": 512, "token": 256},
    "medium": {"recursive": 1024, "fixed": 1024, "token": 512}
}


class RAGEvaluator:
    """Loads benchmark QA datasets and scores a RAG pipeline with RAGAS metrics."""

    def __init__(self):
        self.datasets = {
            "squad": "squad_v2",
            "msmarco": "ms_marco"
        }
        self.current_dataset = None
        self.test_samples = []

    def load_dataset(self, dataset_name: str, num_samples: int = 50):
        """Load a slice of SQuAD v2 or MS MARCO and keep only samples with answers."""
        if dataset_name == "squad":
            dataset = load_dataset("squad_v2", split="validation")
            samples = dataset.select(range(num_samples))
            self.test_samples = [
                {
                    "question": sample["question"],
                    "ground_truth": sample["answers"]["text"][0] if sample["answers"]["text"] else "",
                    "context": sample["context"]
                }
                for sample in samples
                if sample["answers"]["text"]  # Filter out samples without answers
            ]
        elif dataset_name == "msmarco":
            dataset = load_dataset("ms_marco", "v2.1", split="train")
            samples = dataset.select(range(num_samples))
            self.test_samples = [
                {
                    "question": sample["query"],
                    "ground_truth": sample["answers"][0] if sample["answers"] else "",
                    "context": sample["passages"]["passage_text"][0]
                }
                for sample in samples
                if sample["answers"]  # Filter out samples without answers
            ]
        self.current_dataset = dataset_name
        return self.test_samples

    def evaluate_configuration(self, vector_db, qa_chain, splitting_strategy: str, chunk_size: str) -> Dict:
        """Run the QA chain over the test samples and compute RAGAS scores for one configuration."""
        if not self.test_samples:
            return {"error": "No dataset loaded"}

        results = []
        for sample in self.test_samples:
            response = qa_chain.invoke({
                "question": sample["question"],
                "chat_history": []
            })
            results.append({
                "question": sample["question"],
                "answer": response["answer"],
                "contexts": [doc.page_content for doc in response["source_documents"]],
                "ground_truths": [sample["ground_truth"]]
            })

        # Convert to RAGAS dataset format
        eval_dataset = Dataset.from_list(results)

        # Calculate RAGAS metrics
        metrics = [
            ContextRecall(),
            AnswerRelevancy(),
            Faithfulness(),
            ContextPrecision()
        ]
        scores = evaluate(
            eval_dataset,
            metrics=metrics
        )

        return {
            "configuration": f"{splitting_strategy}_{chunk_size}",
            "context_recall": float(scores['context_recall']),
            "answer_relevancy": float(scores['answer_relevancy']),
            "faithfulness": float(scores['faithfulness']),
            "context_precision": float(scores['context_precision']),
            "average_score": float(np.mean([
                scores['context_recall'],
                scores['answer_relevancy'],
                scores['faithfulness'],
                scores['context_precision']
            ]))
        }


def demo():
    evaluator = RAGEvaluator()
    with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink", neutral_hue="sky")) as demo:
        vector_db = gr.State()
        qa_chain = gr.State()
        gr.HTML("