RAG-PDF-Chatbot

Sleeping

App Files Files Community

RAG-PDF-Chatbot / app4.py

arjunanand13

Rename app.py to app4.py

05fb03d verified 24 days ago

raw

history blame contribute delete

21.1 kB

	import gradio as gr
	import os
	from typing import List, Dict
	import numpy as np
	from datasets import load_dataset
	from langchain.text_splitter import (
	RecursiveCharacterTextSplitter,
	CharacterTextSplitter,
	TokenTextSplitter
	)
	from langchain_community.vectorstores import FAISS, Chroma, Qdrant
	from langchain_community.document_loaders import PyPDFLoader
	from langchain.chains import ConversationalRetrievalChain
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_huggingface import HuggingFaceEndpoint
	from langchain.memory import ConversationBufferMemory
	from sentence_transformers import SentenceTransformer, util
	import torch
	from ragas import evaluate
	from ragas.metrics import (
	ContextRecall,
	AnswerRelevancy,
	Faithfulness,
	ContextPrecision
	)
	import pandas as pd

	# Constants and setup
	# Hugging Face repo ids of the LLMs offered in the UI.
	list_llm = [
	    "meta-llama/Meta-Llama-3-8B-Instruct",
	    "mistralai/Mistral-7B-Instruct-v0.2",
	]
	# Display names: the last path component of each repo id.
	list_llm_simple = list(map(os.path.basename, list_llm))
	# Hugging Face API token, read from the environment (None when unset).
	api_token = os.environ.get("HF_TOKEN")

	# chunk_size handed to the text splitter, keyed by size preset and then by
	# splitting strategy (see load_doc).
	CHUNK_SIZES = {
	    "small": {"recursive": 512, "fixed": 512, "token": 256},
	    "medium": {"recursive": 1024, "fixed": 1024, "token": 512},
	}

	# Initialize sentence transformer for evaluation
	sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

	class RAGEvaluator:
	    """Load a benchmark QA dataset and score a RAG chain against it with RAGAS.

	    Instance attributes (all set in ``__init__``):

	    * ``datasets`` - maps the short names shown in the UI to dataset ids.
	    * ``current_dataset`` - short name of the last dataset loaded
	      successfully, or ``None`` if none has been loaded yet.
	    * ``test_samples`` - list of ``{"question", "ground_truth", "context"}``
	      dicts that ``evaluate_configuration`` iterates over.
	    """

	    def __init__(self):
	        self.datasets = {
	            "squad": "squad_v2",
	            "msmarco": "ms_marco",
	        }
	        self.current_dataset = None
	        self.test_samples = []

	    @staticmethod
	    def _candidate_rows(dataset, num_samples: int):
	        """Return up to ``num_samples`` rows: every 100th of the first 1000."""
	        window = min(1000, len(dataset))
	        indices = list(range(0, window, 100))[:max(num_samples, 0)]
	        # Trim the index list *before* selecting. Slicing the selected
	        # Dataset would return a dict of columns, and iterating that dict
	        # yields column names instead of rows.
	        return dataset.select(indices)

	    def load_dataset(self, dataset_name: str, num_samples: int = 10):
	        """Load evaluation samples for ``dataset_name`` into ``test_samples``.

	        Args:
	            dataset_name: One of the keys of ``self.datasets``.
	            num_samples: Upper bound on the number of candidate rows read.

	        Returns:
	            On success, a dict with ``status == "success"``, the dataset
	            name, the number of samples kept and up to three example
	            questions. On failure (unknown name or any exception while
	            loading), a dict with ``error`` and ``status == "failed"``;
	            in that case the previously loaded samples are left untouched.
	        """
	        if dataset_name not in self.datasets:
	            return {
	                "error": f"Unknown dataset: {dataset_name}",
	                "status": "failed"
	            }

	        try:
	            if dataset_name == "squad":
	                dataset = load_dataset(self.datasets["squad"], split="validation")
	                samples = [
	                    {
	                        "question": row["question"],
	                        "ground_truth": row["answers"]["text"][0],
	                        "context": row["context"],
	                    }
	                    for row in self._candidate_rows(dataset, num_samples)
	                    if row["answers"]["text"]  # skip rows without any answer
	                ]
	            else:  # "msmarco"
	                dataset = load_dataset(self.datasets["msmarco"], "v2.1", split="test")
	                samples = [
	                    {
	                        "question": row["query"],
	                        "ground_truth": row["answers"][0],
	                        "context": row["passages"]["passage_text"][0],
	                    }
	                    for row in self._candidate_rows(dataset, num_samples)
	                    # skip rows lacking an answer or a passage to index
	                    if row["answers"] and row["passages"]["passage_text"]
	                ]

	            # Commit only after the whole load succeeded.
	            self.test_samples = samples
	            self.current_dataset = dataset_name

	            example_questions = [s["question"] for s in samples[:3]]
	            return {
	                "status": "success",
	                "dataset": dataset_name,
	                "samples_loaded": len(samples),
	                "example_questions": example_questions,
	                # Same values under the key names read by the UI handler
	                # load_dataset_handler.
	                "num_samples": len(samples),
	                "sample_questions": example_questions,
	            }

	        except Exception as e:
	            print(f"Error loading dataset: {str(e)}")
	            return {
	                "error": str(e),
	                "status": "failed"
	            }

	    def evaluate_configuration(self, vector_db, qa_chain, splitting_strategy: str, chunk_size: str) -> Dict:
	        """Ask every loaded question through ``qa_chain`` and score the answers.

	        Args:
	            vector_db: Accepted for interface compatibility; not used here.
	            qa_chain: Object whose ``invoke`` returns a mapping with
	                ``"answer"`` and ``"source_documents"``.
	            splitting_strategy: Used only to label the result row.
	            chunk_size: Used only to label the result row.

	        Returns:
	            A dict labelled ``"<strategy>_<chunk_size>"`` carrying the four
	            RAGAS scores and their mean, or an ``error`` entry when no
	            dataset is loaded, no chain is available, every question
	            failed, or metric computation raised.
	        """
	        if not self.test_samples:
	            return {"error": "No dataset loaded"}

	        configuration = f"{splitting_strategy}_{chunk_size}"
	        if qa_chain is None:
	            return {
	                "configuration": configuration,
	                "error": "LLM chain not initialized",
	                "questions_evaluated": 0
	            }

	        results = []
	        total_questions = len(self.test_samples)

	        for i, sample in enumerate(self.test_samples):
	            print(f"Evaluating question {i+1}/{total_questions}")

	            try:
	                response = qa_chain.invoke({
	                    "question": sample["question"],
	                    "chat_history": []
	                })

	                # NOTE(review): column names are kept from the original code;
	                # confirm they match what the installed ragas version expects.
	                results.append({
	                    "question": sample["question"],
	                    "answer": response["answer"],
	                    "contexts": [doc.page_content for doc in response["source_documents"]],
	                    "ground_truths": [sample["ground_truth"]]
	                })
	            except Exception as e:
	                # One failing question must not abort the whole run.
	                print(f"Error processing question {i+1}: {str(e)}")
	                continue

	        if not results:
	            return {
	                "configuration": configuration,
	                "error": "No successful evaluations",
	                "questions_evaluated": 0
	            }

	        try:
	            # Imported here because the module only imports load_dataset
	            # from the datasets package.
	            from datasets import Dataset

	            eval_dataset = Dataset.from_list(results)
	            metrics = [ContextRecall(), AnswerRelevancy(), Faithfulness(), ContextPrecision()]
	            scores = evaluate(eval_dataset, metrics=metrics)

	            metric_names = (
	                "context_recall",
	                "answer_relevancy",
	                "faithfulness",
	                "context_precision",
	            )
	            metric_values = {name: float(scores[name]) for name in metric_names}

	            return {
	                "configuration": configuration,
	                "questions_evaluated": len(results),
	                **metric_values,
	                "average_score": float(np.mean(list(metric_values.values())))
	            }
	        except Exception as e:
	            return {
	                "configuration": configuration,
	                "error": str(e),
	                "questions_evaluated": len(results)
	            }

	# Text splitting and database functions
	def get_text_splitter(strategy: str, chunk_size: int = 1024, chunk_overlap: int = 64):
	    """Build the text splitter for ``strategy``.

	    Args:
	        strategy: ``"recursive"``, ``"fixed"`` or ``"token"``.
	        chunk_size: Passed to the splitter as ``chunk_size``.
	        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

	    Returns:
	        A new splitter instance, or ``None`` when ``strategy`` is not one of
	        the supported names.
	    """
	    # Map names to constructors and build only the requested splitter, so a
	    # problem constructing one kind cannot break the other strategies.
	    splitter_factories = {
	        "recursive": lambda: RecursiveCharacterTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap
	        ),
	        "fixed": lambda: CharacterTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap
	        ),
	        "token": lambda: TokenTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap
	        ),
	    }
	    factory = splitter_factories.get(strategy)
	    return factory() if factory is not None else None

	def load_doc(list_file_path: List[str], splitting_strategy: str, chunk_size: str):
	    """Load the given PDFs and split their pages into chunks.

	    ``chunk_size`` is a preset name looked up in ``CHUNK_SIZES`` together
	    with ``splitting_strategy`` to obtain the numeric size handed to the
	    splitter. Returns whatever the splitter's ``split_documents`` returns.
	    """
	    size_for_strategy = CHUNK_SIZES[chunk_size][splitting_strategy]

	    # Create every loader first, then read them in order.
	    pdf_loaders = [PyPDFLoader(pdf_path) for pdf_path in list_file_path]
	    all_pages = [page for pdf_loader in pdf_loaders for page in pdf_loader.load()]

	    splitter = get_text_splitter(splitting_strategy, size_for_strategy)
	    return splitter.split_documents(all_pages)

	def create_db(splits, db_choice: str = "faiss"):
	    """Embed ``splits`` and index them in the chosen vector store.

	    ``db_choice`` is ``"faiss"``, ``"chroma"`` or ``"qdrant"``; any other
	    value raises ``KeyError``. The Qdrant store is created in memory under
	    the collection name ``pdf_docs``.
	    """
	    embedding_model = HuggingFaceEmbeddings()

	    if db_choice == "faiss":
	        return FAISS.from_documents(splits, embedding_model)
	    if db_choice == "chroma":
	        return Chroma.from_documents(splits, embedding_model)
	    if db_choice == "qdrant":
	        return Qdrant.from_documents(
	            splits,
	            embedding_model,
	            location=":memory:",
	            collection_name="pdf_docs"
	        )
	    raise KeyError(db_choice)

	def initialize_database(list_file_obj, splitting_strategy, chunk_size, db_choice, progress=gr.Progress()):
	    """Build a vector database from the uploaded files.

	    Returns a ``(vector_db, status_message)`` pair. ``vector_db`` is ``None``
	    whenever nothing could be built: no upload, no usable file entry, no
	    extracted content, or an exception (whose text is put in the message).
	    """
	    try:
	        if not list_file_obj:
	            return None, "No files uploaded. Please upload PDF documents first."

	        # Collect the path of every upload entry that is not None.
	        pdf_paths = []
	        for uploaded in list_file_obj:
	            if uploaded is not None:
	                pdf_paths.append(uploaded.name)
	        if not pdf_paths:
	            return None, "No valid files found. Please upload PDF documents."

	        chunks = load_doc(pdf_paths, splitting_strategy, chunk_size)
	        if not chunks:
	            return None, "No content extracted from documents."

	        database = create_db(chunks, db_choice)
	        status = f"Database created successfully using {splitting_strategy} splitting and {db_choice} vector database!"
	        return database, status

	    except Exception as e:
	        return None, f"Error creating database: {str(e)}"

	def initialize_llmchain(llm_choice, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
	    """Create the conversational retrieval chain over ``vector_db``.

	    ``llm_choice`` is an index into ``list_llm``. Returns a
	    ``(qa_chain, status_message)`` pair; ``qa_chain`` is ``None`` when the
	    vector database is missing or when any step raised.
	    """
	    if vector_db is None:
	        return None, "Please create vector database first."

	    try:
	        repo_id = list_llm[llm_choice]

	        endpoint_llm = HuggingFaceEndpoint(
	            repo_id=repo_id,
	            huggingfacehub_api_token=api_token,
	            temperature=temperature,
	            max_new_tokens=max_tokens,
	            top_k=top_k
	        )

	        # The chain returns both an answer and source documents, so the
	        # memory is told which output key to record.
	        chat_memory = ConversationBufferMemory(
	            memory_key="chat_history",
	            output_key='answer',
	            return_messages=True
	        )

	        chain = ConversationalRetrievalChain.from_llm(
	            endpoint_llm,
	            retriever=vector_db.as_retriever(),
	            memory=chat_memory,
	            return_source_documents=True
	        )
	    except Exception as e:
	        return None, f"Error initializing LLM: {str(e)}"

	    return chain, "LLM initialized successfully!"

	def conversation(qa_chain, message, history):
	    """Answer ``message`` with ``qa_chain`` and return values for the chat UI.

	    Args:
	        qa_chain: The chain built by ``initialize_llmchain``, or ``None`` if
	            the LLM has not been initialised yet.
	        message: The user's question.
	        history: Chat history as a sequence of ``(user, bot)`` pairs; may be
	            ``None`` or empty.

	    Returns:
	        A 9-tuple in the order of the wired outputs: the chain, an update
	        clearing the message box, the extended history, then three
	        ``(source text, page number)`` pairs flattened. Missing sources are
	        padded with ``""`` and ``0``.
	    """
	    # The chatbot value can be None (e.g. right after the chat is cleared).
	    history = list(history) if history else []

	    if qa_chain is None:
	        # Without a chain there is nothing to invoke; answer in the chat
	        # instead of failing with an AttributeError.
	        notice = "Please initialize the LLM before asking a question."
	        return (
	            qa_chain,
	            gr.update(value=""),
	            history + [(message, notice)],
	            "", 0,
	            "", 0,
	            "", 0
	        )

	    response = qa_chain.invoke({
	        "question": message,
	        "chat_history": [(turn[0], turn[1]) for turn in history]
	    })

	    response_answer = response["answer"]
	    # Keep only the text after the last "Helpful Answer:" marker, if present.
	    if "Helpful Answer:" in response_answer:
	        response_answer = response_answer.split("Helpful Answer:")[-1]

	    # Use at most three sources; the UI has exactly three source slots.
	    sources = response["source_documents"][:3]
	    source_contents = [source.page_content.strip() for source in sources]
	    # Shown page number is the stored "page" value plus one (0 if absent).
	    source_pages = [source.metadata.get("page", 0) + 1 for source in sources]

	    # Pad with empty values if we have fewer than 3 sources
	    missing = 3 - len(source_contents)
	    source_contents.extend([""] * missing)
	    source_pages.extend([0] * missing)

	    return (
	        qa_chain,                                # State
	        gr.update(value=""),                     # Clear message box
	        history + [(message, response_answer)],  # Updated chat history
	        source_contents[0],                      # First source
	        source_pages[0],                         # First page
	        source_contents[1],                      # Second source
	        source_pages[1],                         # Second page
	        source_contents[2],                      # Third source
	        source_pages[2]                          # Third page
	    )

	def demo():
	    """Build the Gradio interface, wire its events and launch it.

	    The UI has two tabs: "Custom PDF Chat" (upload PDFs, build the vector
	    database, initialise the LLM, chat) and "RAG Evaluation" (load a
	    benchmark dataset and score the current chain). The call blocks in
	    ``launch``.
	    """
	    evaluator = RAGEvaluator()

	    # The Blocks object is named ``app`` so it does not shadow this function.
	    with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink", neutral_hue="sky")) as app:
	        vector_db = gr.State()
	        qa_chain = gr.State()

	        gr.HTML("<center><h1>Enhanced RAG PDF Chatbot with Evaluation</h1></center>")

	        with gr.Tabs():
	            # Custom PDF Tab
	            with gr.Tab("Custom PDF Chat"):
	                with gr.Row():
	                    with gr.Column(scale=86):
	                        gr.Markdown("<b>Step 1 - Configure and Initialize RAG Pipeline</b>")
	                        with gr.Row():
	                            document = gr.Files(
	                                height=300,
	                                file_count="multiple",
	                                # Extension form (with the dot) is what the
	                                # component documents for file filters.
	                                file_types=[".pdf"],
	                                interactive=True,
	                                label="Upload PDF documents"
	                            )

	                        with gr.Row():
	                            splitting_strategy = gr.Radio(
	                                ["recursive", "fixed", "token"],
	                                label="Text Splitting Strategy",
	                                value="recursive"
	                            )
	                            db_choice = gr.Radio(
	                                ["faiss", "chroma", "qdrant"],
	                                label="Vector Database",
	                                value="faiss"
	                            )
	                            chunk_size = gr.Radio(
	                                ["small", "medium"],
	                                label="Chunk Size",
	                                value="medium"
	                            )

	                        with gr.Row():
	                            db_btn = gr.Button("Create vector database")
	                            db_progress = gr.Textbox(
	                                value="Not initialized",
	                                show_label=False
	                            )

	                        gr.Markdown("<b>Step 2 - Configure LLM</b>")
	                        with gr.Row():
	                            llm_choice = gr.Radio(
	                                list_llm_simple,
	                                label="Available LLMs",
	                                value=list_llm_simple[0],
	                                type="index"
	                            )

	                        with gr.Row():
	                            with gr.Accordion("LLM Parameters", open=False):
	                                temperature = gr.Slider(
	                                    minimum=0.01,
	                                    maximum=1.0,
	                                    value=0.5,
	                                    step=0.1,
	                                    label="Temperature"
	                                )
	                                max_tokens = gr.Slider(
	                                    minimum=128,
	                                    maximum=4096,
	                                    value=2048,
	                                    step=128,
	                                    label="Max Tokens"
	                                )
	                                top_k = gr.Slider(
	                                    minimum=1,
	                                    maximum=10,
	                                    value=3,
	                                    step=1,
	                                    label="Top K"
	                                )

	                        with gr.Row():
	                            init_llm_btn = gr.Button("Initialize LLM")
	                            llm_progress = gr.Textbox(
	                                value="Not initialized",
	                                show_label=False
	                            )

	                    with gr.Column(scale=200):
	                        gr.Markdown("<b>Step 3 - Chat with Documents</b>")
	                        chatbot = gr.Chatbot(height=505)

	                        with gr.Accordion("Source References", open=False):
	                            with gr.Row():
	                                source1 = gr.Textbox(label="Source 1", lines=2)
	                                page1 = gr.Number(label="Page")
	                            with gr.Row():
	                                source2 = gr.Textbox(label="Source 2", lines=2)
	                                page2 = gr.Number(label="Page")
	                            with gr.Row():
	                                source3 = gr.Textbox(label="Source 3", lines=2)
	                                page3 = gr.Number(label="Page")

	                        with gr.Row():
	                            msg = gr.Textbox(
	                                placeholder="Ask a question",
	                                show_label=False
	                            )
	                        with gr.Row():
	                            submit_btn = gr.Button("Submit")
	                            clear_btn = gr.ClearButton(
	                                [msg, chatbot],
	                                value="Clear Chat"
	                            )

	            # Evaluation Tab
	            with gr.Tab("RAG Evaluation"):
	                with gr.Row():
	                    dataset_choice = gr.Dropdown(
	                        choices=list(evaluator.datasets.keys()),
	                        label="Select Evaluation Dataset",
	                        value="squad"
	                    )
	                    load_dataset_btn = gr.Button("Load Dataset")

	                with gr.Row():
	                    dataset_info = gr.JSON(label="Dataset Information")

	                with gr.Row():
	                    eval_splitting_strategy = gr.Radio(
	                        ["recursive", "fixed", "token"],
	                        label="Text Splitting Strategy",
	                        value="recursive"
	                    )
	                    eval_chunk_size = gr.Radio(
	                        ["small", "medium"],
	                        label="Chunk Size",
	                        value="medium"
	                    )

	                with gr.Row():
	                    evaluate_btn = gr.Button("Run Evaluation")
	                    evaluation_results = gr.DataFrame(label="Evaluation Results")

	        # ---- Callbacks -------------------------------------------------

	        def set_interactive_if_ready(state_value):
	            """Enable the target component only when the state holds a value."""
	            # The handler receives the State's value itself (None or the
	            # stored object), so it is tested directly rather than indexed.
	            return gr.update(interactive=state_value is not None)

	        def load_dataset_handler(dataset_name):
	            """Load the chosen dataset and shape the result for the JSON panel."""
	            try:
	                result = evaluator.load_dataset(dataset_name)
	            except Exception as e:
	                return {
	                    "error": str(e),
	                    "status": "failed to load dataset"
	                }

	            # The evaluator signals failure through an "error" entry.
	            if "error" in result:
	                return {
	                    "error": result.get("error") or "Unknown error occurred",
	                    "status": "failed to load dataset"
	                }
	            return {
	                "dataset": result["dataset"],
	                "samples_loaded": result["samples_loaded"],
	                "example_questions": result["example_questions"],
	                "status": "ready for evaluation"
	            }

	        def run_evaluation(dataset_choice, splitting_strategy, chunk_size, vector_db, qa_chain):
	            """Score the current chain and return a one-row results table."""
	            if not evaluator.current_dataset:
	                # Say why the table has no scores instead of leaving it blank.
	                return pd.DataFrame([{"error": "No dataset loaded. Please load a dataset first."}])

	            results = evaluator.evaluate_configuration(
	                vector_db=vector_db,
	                qa_chain=qa_chain,
	                splitting_strategy=splitting_strategy,
	                chunk_size=chunk_size
	            )

	            return pd.DataFrame([results])

	        # ---- Event handlers --------------------------------------------

	        db_btn.click(
	            initialize_database,
	            inputs=[document, splitting_strategy, chunk_size, db_choice],
	            outputs=[vector_db, db_progress]
	        ).then(
	            set_interactive_if_ready,
	            inputs=[vector_db],
	            outputs=[init_llm_btn]
	        )

	        init_llm_btn.click(
	            initialize_llmchain,
	            inputs=[llm_choice, temperature, max_tokens, top_k, vector_db],
	            outputs=[qa_chain, llm_progress]
	        ).then(
	            set_interactive_if_ready,
	            inputs=[qa_chain],
	            outputs=[msg]
	        )

	        # Registered once: a second listener on the same button would load
	        # the dataset twice and write to the same output.
	        load_dataset_btn.click(
	            load_dataset_handler,
	            inputs=[dataset_choice],
	            outputs=[dataset_info]
	        )

	        chat_outputs = [qa_chain, msg, chatbot, source1, page1, source2, page2, source3, page3]

	        msg.submit(
	            conversation,
	            inputs=[qa_chain, msg, chatbot],
	            outputs=chat_outputs
	        )

	        submit_btn.click(
	            conversation,
	            inputs=[qa_chain, msg, chatbot],
	            outputs=chat_outputs
	        )

	        evaluate_btn.click(
	            run_evaluation,
	            inputs=[
	                dataset_choice,
	                eval_splitting_strategy,
	                eval_chunk_size,
	                vector_db,
	                qa_chain
	            ],
	            outputs=[evaluation_results]
	        )

	        # Clear button handlers
	        clear_btn.click(
	            lambda: [None, "", 0, "", 0, "", 0],
	            outputs=[chatbot, source1, page1, source2, page2, source3, page3]
	        )

	    # Launch the demo
	    app.queue().launch(debug=True)

	# Build and launch the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo()