"""Gradio app: chat with uploaded PDFs through a RAG chain and score it with RAGAS."""

import os
from typing import Dict, List

import gradio as gr
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS, Chroma, Qdrant
from ragas import evaluate
from ragas.metrics import (
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
    Faithfulness,
)
from sentence_transformers import SentenceTransformer, util

# Constants and setup
list_llm = ["meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2"]
# Short display names (repo id without the organisation prefix) for the UI.
list_llm_simple = [os.path.basename(llm) for llm in list_llm]
api_token = os.getenv("HF_TOKEN")

# Chunk size per (size preset, splitting strategy).
CHUNK_SIZES = {
    "small": {"recursive": 512, "fixed": 512, "token": 256},
    "medium": {"recursive": 1024, "fixed": 1024, "token": 512},
}

# Initialize sentence transformer for evaluation
# NOTE(review): not referenced anywhere else in this file; kept so the
# module-level name stays available to anything importing it.
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


class RAGEvaluator:
    """Hold a small QA sample set and score a QA chain against it with RAGAS."""

    def __init__(self):
        # UI key -> Hugging Face dataset identifier.
        self.datasets = {
            "squad": "squad_v2",
            "msmarco": "ms_marco",
        }
        self.current_dataset = None
        self.test_samples = []

    @staticmethod
    def _pick_rows(dataset, num_samples: int):
        """Return up to ``num_samples`` rows, taking every 100th of the first 1000."""
        indices = list(range(0, min(len(dataset), 1000), 100))[:max(num_samples, 0)]
        # select() keeps a row-oriented Dataset; slicing with [:n] would return
        # a dict of columns and iterating that yields column names, not rows.
        return dataset.select(indices)

    @staticmethod
    def _first_passage(passages):
        """Return the first passage text from either passage layout, or None."""
        if isinstance(passages, list):
            return passages[0]["passage_text"] if passages else None
        texts = passages["passage_text"]
        return texts[0] if texts else None

    def load_dataset(self, dataset_name: str, num_samples: int = 10):
        """Load a small subset of questions into ``self.test_samples``.

        Args:
            dataset_name: ``"squad"`` or ``"msmarco"``.
            num_samples: Upper bound on rows considered; at most 10 rows are
                available because of the fixed every-100th sampling.

        Returns:
            A dict with ``status`` ``"success"`` (plus ``dataset``,
            ``num_samples``, ``sample_questions``) or ``"failed"`` (plus
            ``dataset`` and ``error``). Errors are reported, never raised.
        """
        try:
            samples = []
            if dataset_name == "squad":
                dataset = load_dataset("squad_v2", split="validation")
                for sample in self._pick_rows(dataset, num_samples):
                    answers = sample.get("answers")
                    # Skip rows whose answer text list is missing or empty.
                    if answers and isinstance(answers, dict) and answers.get("text"):
                        samples.append({
                            "question": sample["question"],
                            "ground_truth": answers["text"][0],
                            "context": sample["context"],
                        })
            elif dataset_name == "msmarco":
                # The v2.1 config has no "dev" split; use "validation".
                dataset = load_dataset("ms_marco", "v2.1", split="validation")
                for sample in self._pick_rows(dataset, num_samples):
                    answers = sample.get("answers")
                    context = self._first_passage(sample["passages"])
                    if answers and context is not None:
                        samples.append({
                            "question": sample["query"],
                            "ground_truth": answers[0],
                            "context": context,
                        })
            else:
                # Previously an unknown name was reported as a success.
                return {
                    "dataset": dataset_name,
                    "error": f"Unknown dataset: {dataset_name}",
                    "status": "failed",
                }

            # Commit state only after the whole load succeeded.
            self.test_samples = samples
            self.current_dataset = dataset_name

            return {
                "dataset": dataset_name,
                "num_samples": len(self.test_samples),
                "sample_questions": [s["question"] for s in self.test_samples[:3]],
                "status": "success",
            }
        except Exception as e:
            # Boundary for download/parsing failures: report them to the UI.
            print(f"Error loading dataset: {str(e)}")
            return {
                "dataset": dataset_name,
                "error": str(e),
                "status": "failed",
            }

    def evaluate_configuration(self, vector_db, qa_chain, splitting_strategy: str, chunk_size: str) -> Dict:
        """Run every loaded question through ``qa_chain`` and score with RAGAS.

        Args:
            vector_db: Unused here; accepted for interface compatibility.
            qa_chain: Object whose ``invoke`` returns a mapping with ``answer``
                and ``source_documents``.
            splitting_strategy: Used only to label the result.
            chunk_size: Used only to label the result.

        Returns:
            A dict with the configuration label and either the four metric
            scores plus their mean, or an ``error`` entry.
        """
        if not self.test_samples:
            return {"error": "No dataset loaded"}

        configuration = f"{splitting_strategy}_{chunk_size}"
        results = []
        total_questions = len(self.test_samples)

        # NOTE(review): if qa_chain carries conversation memory, earlier
        # questions may influence later ones - confirm this is acceptable.
        for i, sample in enumerate(self.test_samples, start=1):
            print(f"Evaluating question {i}/{total_questions}")
            try:
                response = qa_chain.invoke({
                    "question": sample["question"],
                    "chat_history": [],
                })
                results.append({
                    "question": sample["question"],
                    "answer": response["answer"],
                    "contexts": [doc.page_content for doc in response["source_documents"]],
                    "ground_truths": [sample["ground_truth"]],
                })
            except Exception as e:
                # Best effort: one failing question must not abort the run.
                print(f"Error processing question {i}: {str(e)}")
                continue

        if not results:
            return {
                "configuration": configuration,
                "error": "No successful evaluations",
                "questions_evaluated": 0,
            }

        try:
            # Calculate RAGAS metrics
            eval_dataset = Dataset.from_list(results)
            metrics = [ContextRecall(), AnswerRelevancy(), Faithfulness(), ContextPrecision()]
            scores = evaluate(eval_dataset, metrics=metrics)

            return {
                "configuration": configuration,
                "questions_evaluated": len(results),
                "context_recall": float(scores['context_recall']),
                "answer_relevancy": float(scores['answer_relevancy']),
                "faithfulness": float(scores['faithfulness']),
                "context_precision": float(scores['context_precision']),
                "average_score": float(np.mean([
                    scores['context_recall'],
                    scores['answer_relevancy'],
                    scores['faithfulness'],
                    scores['context_precision'],
                ])),
            }
        except Exception as e:
            return {
                "configuration": configuration,
                "error": str(e),
                "questions_evaluated": len(results),
            }


# Text splitting and database functions
def get_text_splitter(strategy: str, chunk_size: int = 1024, chunk_overlap: int = 64):
    """Build the text splitter for ``strategy``, or return None if unknown.

    Only the requested splitter is constructed, so a failure specific to one
    splitter class cannot break the other strategies.
    """
    splitter_classes = {
        "recursive": RecursiveCharacterTextSplitter,
        "fixed": CharacterTextSplitter,
        "token": TokenTextSplitter,
    }
    splitter_cls = splitter_classes.get(strategy)
    if splitter_cls is None:
        return None
    return splitter_cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)


def load_doc(list_file_path: List[str], splitting_strategy: str, chunk_size: str):
    """Load the given PDFs and split their pages into chunks.

    Raises:
        KeyError: If ``chunk_size`` or ``splitting_strategy`` is not a key of
            ``CHUNK_SIZES``.
    """
    chunk_size_value = CHUNK_SIZES[chunk_size][splitting_strategy]
    pages = []
    for file_path in list_file_path:
        pages.extend(PyPDFLoader(file_path).load())
    text_splitter = get_text_splitter(splitting_strategy, chunk_size_value)
    return text_splitter.split_documents(pages)


def create_db(splits, db_choice: str = "faiss"):
    """Embed ``splits`` and index them in the chosen vector store.

    Raises:
        KeyError: If ``db_choice`` is not "faiss", "chroma" or "qdrant".
    """
    embeddings = HuggingFaceEmbeddings()
    # Lambdas defer construction so only the selected store is built.
    db_creators = {
        "faiss": lambda: FAISS.from_documents(splits, embeddings),
        "chroma": lambda: Chroma.from_documents(splits, embeddings),
        "qdrant": lambda: Qdrant.from_documents(
            splits,
            embeddings,
            location=":memory:",
            collection_name="pdf_docs",
        ),
    }
    return db_creators[db_choice]()


def initialize_database(list_file_obj, splitting_strategy, chunk_size, db_choice, progress=gr.Progress()):
    """Build a vector database from the uploaded files.

    Returns:
        ``(vector_db, status_message)``; ``vector_db`` is None when nothing
        was uploaded.
    """
    # Accept both objects exposing ``.name`` and plain path strings.
    list_file_path = [getattr(x, "name", x) for x in (list_file_obj or []) if x is not None]
    if not list_file_path:
        return None, "Please upload at least one PDF document first."

    doc_splits = load_doc(list_file_path, splitting_strategy, chunk_size)
    vector_db = create_db(doc_splits, db_choice)
    return vector_db, f"Database created using {splitting_strategy} splitting and {db_choice} vector database!"


def initialize_llmchain(llm_choice, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
    """Create the conversational retrieval chain for the selected LLM.

    Args:
        llm_choice: Index into ``list_llm``.
        temperature: Sampling temperature passed to the endpoint.
        max_tokens: Passed as ``max_new_tokens``.
        top_k: Passed as the endpoint's ``top_k``.
        vector_db: Vector store providing the retriever.

    Returns:
        ``(qa_chain, status_message)``; ``qa_chain`` is None when no vector
        database exists yet.
    """
    if vector_db is None:
        return None, "Please create the vector database first."

    llm = HuggingFaceEndpoint(
        repo_id=list_llm[llm_choice],
        huggingfacehub_api_token=api_token,
        temperature=temperature,
        # UI sliders may deliver floats; these parameters are integral.
        max_new_tokens=int(max_tokens),
        top_k=int(top_k),
    )

    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key='answer',
        return_messages=True,
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=vector_db.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
    return qa_chain, "LLM initialized successfully!"


def conversation(qa_chain, message, history):
    """Answer ``message`` and return values for the nine bound UI outputs.

    Returns:
        ``(qa_chain, message-box update, updated history, source1, page1,
        source2, page2, source3, page3)``. Missing sources are padded with
        ``""`` and page ``0``.

    Raises:
        gr.Error: If the chain has not been initialized yet.
    """
    if qa_chain is None:
        raise gr.Error("Please initialize the database and the LLM first.")

    # The clear handler resets the chatbot to None.
    history = history or []

    response = qa_chain.invoke({
        "question": message,
        "chat_history": [(turn[0], turn[1]) for turn in history],
    })

    response_answer = response["answer"]
    # Keep only the text after the last "Helpful Answer:" marker, if present.
    if "Helpful Answer:" in response_answer:
        response_answer = response_answer.split("Helpful Answer:")[-1]

    # Use at most three sources; page numbers are shown 1-based.
    sources = response["source_documents"][:3]
    source_contents = [source.page_content.strip() for source in sources]
    source_pages = [source.metadata.get("page", 0) + 1 for source in sources]

    # Pad with empty values if we have fewer than 3 sources
    missing = 3 - len(source_contents)
    source_contents.extend([""] * missing)
    source_pages.extend([0] * missing)

    return (
        qa_chain,  # State
        gr.update(value=""),  # Clear message box
        history + [(message, response_answer)],  # Updated chat history
        source_contents[0],  # First source
        source_pages[0],  # First page
        source_contents[1],  # Second source
        source_pages[1],  # Second page
        source_contents[2],  # Third source
        source_pages[2],  # Third page
    )


def demo():
    """Build the Gradio interface, wire its event handlers and launch it."""
    # One evaluator is shared by every handler created below.
    evaluator = RAGEvaluator()

    # NOTE(review): layout nesting was reconstructed from statement order;
    # confirm the row/column grouping against the intended design.
    with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink", neutral_hue="sky")) as app:
        vector_db = gr.State()
        qa_chain = gr.State()

        # NOTE(review): heading markup around this title was presumably lost;
        # only the surviving text is kept - TODO confirm.
        gr.HTML("\n\nEnhanced RAG PDF Chatbot with Evaluation\n\n")

        with gr.Tabs():
            # Custom PDF Tab
            with gr.Tab("Custom PDF Chat"):
                with gr.Row():
                    with gr.Column(scale=86):
                        gr.Markdown("Step 1 - Configure and Initialize RAG Pipeline")
                        with gr.Row():
                            document = gr.Files(
                                height=300,
                                file_count="multiple",
                                file_types=["pdf"],
                                interactive=True,
                                label="Upload PDF documents",
                            )

                        with gr.Row():
                            splitting_strategy = gr.Radio(
                                ["recursive", "fixed", "token"],
                                label="Text Splitting Strategy",
                                value="recursive",
                            )
                            db_choice = gr.Radio(
                                ["faiss", "chroma", "qdrant"],
                                label="Vector Database",
                                value="faiss",
                            )
                            chunk_size = gr.Radio(
                                ["small", "medium"],
                                label="Chunk Size",
                                value="medium",
                            )

                        with gr.Row():
                            db_btn = gr.Button("Create vector database")
                            db_progress = gr.Textbox(
                                value="Not initialized",
                                show_label=False,
                            )

                        gr.Markdown("Step 2 - Configure LLM")
                        with gr.Row():
                            llm_choice = gr.Radio(
                                list_llm_simple,
                                label="Available LLMs",
                                value=list_llm_simple[0],
                                type="index",
                            )

                        with gr.Row():
                            with gr.Accordion("LLM Parameters", open=False):
                                temperature = gr.Slider(
                                    minimum=0.01,
                                    maximum=1.0,
                                    value=0.5,
                                    step=0.1,
                                    label="Temperature",
                                )
                                max_tokens = gr.Slider(
                                    minimum=128,
                                    maximum=4096,
                                    value=2048,
                                    step=128,
                                    label="Max Tokens",
                                )
                                top_k = gr.Slider(
                                    minimum=1,
                                    maximum=10,
                                    value=3,
                                    step=1,
                                    label="Top K",
                                )

                        with gr.Row():
                            init_llm_btn = gr.Button("Initialize LLM")
                            llm_progress = gr.Textbox(
                                value="Not initialized",
                                show_label=False,
                            )

                    with gr.Column(scale=200):
                        gr.Markdown("Step 3 - Chat with Documents")
                        chatbot = gr.Chatbot(height=505)

                        with gr.Accordion("Source References", open=False):
                            with gr.Row():
                                source1 = gr.Textbox(label="Source 1", lines=2)
                                page1 = gr.Number(label="Page")
                            with gr.Row():
                                source2 = gr.Textbox(label="Source 2", lines=2)
                                page2 = gr.Number(label="Page")
                            with gr.Row():
                                source3 = gr.Textbox(label="Source 3", lines=2)
                                page3 = gr.Number(label="Page")

                        with gr.Row():
                            msg = gr.Textbox(
                                placeholder="Ask a question",
                                show_label=False,
                            )
                        with gr.Row():
                            submit_btn = gr.Button("Submit")
                            clear_btn = gr.ClearButton(
                                [msg, chatbot],
                                value="Clear Chat",
                            )

            # Evaluation Tab
            with gr.Tab("RAG Evaluation"):
                with gr.Row():
                    dataset_choice = gr.Dropdown(
                        choices=list(evaluator.datasets.keys()),
                        label="Select Evaluation Dataset",
                        value="squad",
                    )
                    load_dataset_btn = gr.Button("Load Dataset")

                with gr.Row():
                    dataset_info = gr.JSON(label="Dataset Information")

                with gr.Row():
                    eval_splitting_strategy = gr.Radio(
                        ["recursive", "fixed", "token"],
                        label="Text Splitting Strategy",
                        value="recursive",
                    )
                    eval_chunk_size = gr.Radio(
                        ["small", "medium"],
                        label="Chunk Size",
                        value="medium",
                    )

                with gr.Row():
                    evaluate_btn = gr.Button("Run Evaluation")

                evaluation_results = gr.DataFrame(label="Evaluation Results")

        # Event handlers
        db_btn.click(
            initialize_database,
            inputs=[document, splitting_strategy, chunk_size, db_choice],
            outputs=[vector_db, db_progress],
        )

        init_llm_btn.click(
            initialize_llmchain,
            inputs=[llm_choice, temperature, max_tokens, top_k, vector_db],
            outputs=[qa_chain, llm_progress],
        )

        # Enter in the message box and the Submit button share one wiring.
        chat_inputs = [qa_chain, msg, chatbot]
        chat_outputs = [qa_chain, msg, chatbot, source1, page1, source2, page2, source3, page3]
        msg.submit(conversation, inputs=chat_inputs, outputs=chat_outputs)
        submit_btn.click(conversation, inputs=chat_inputs, outputs=chat_outputs)

        def load_dataset_handler(dataset_name):
            """Load the chosen dataset and return a summary dict for the JSON view."""
            try:
                result = evaluator.load_dataset(dataset_name)
                if result.get("status") == "success":
                    return {
                        "dataset": result["dataset"],
                        "samples_loaded": result["num_samples"],
                        "example_questions": result["sample_questions"],
                        "status": "ready for evaluation",
                    }
                return {
                    "error": result.get("error", "Unknown error occurred"),
                    "status": "failed to load dataset",
                }
            except Exception as e:
                return {
                    "error": str(e),
                    "status": "failed to load dataset",
                }

        def run_evaluation(dataset_choice, splitting_strategy, chunk_size, vector_db, qa_chain):
            """Evaluate the current chain; return a one-row DataFrame, or an empty one."""
            if not evaluator.current_dataset:
                return pd.DataFrame()

            results = evaluator.evaluate_configuration(
                vector_db=vector_db,
                qa_chain=qa_chain,
                splitting_strategy=splitting_strategy,
                chunk_size=chunk_size,
            )
            return pd.DataFrame([results])

        load_dataset_btn.click(
            load_dataset_handler,
            inputs=[dataset_choice],
            outputs=[dataset_info],
        )

        evaluate_btn.click(
            run_evaluation,
            inputs=[
                dataset_choice,
                eval_splitting_strategy,
                eval_chunk_size,
                vector_db,
                qa_chain,
            ],
            outputs=[evaluation_results],
        )

        # Clear button handlers
        clear_btn.click(
            lambda: [None, "", 0, "", 0, "", 0],
            outputs=[chatbot, source1, page1, source2, page2, source3, page3],
        )

    # Launch the demo
    app.queue().launch(debug=True)


if __name__ == "__main__":
    demo()