# app.py — Hugging Face Space by Omarrran (commit f491b53, verified)
import gradio as gr
import chromadb
import os
import tempfile
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
def process_pdf(file_binary):
    """Load an uploaded PDF, split it into chunks, and embed them into an
    in-memory Chroma vector store.

    The populated vector store is cached on the function object as
    ``process_pdf.vectorstore`` so later queries in this process can search
    the same in-memory collection.

    Args:
        file_binary: Raw bytes of the uploaded PDF (Gradio ``type="binary"``).

    Returns:
        tuple[str, str]: A (status message, newline-joined log) pair.
    """
    log = []
    status_message = ""
    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."
    temp_path = None
    try:
        log.append("Starting PDF upload and processing...")
        # Write uploaded PDF bytes to a temporary file so PyPDFLoader can read it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_binary)
            temp_path = temp_file.name
        log.append(f"Temporary PDF path: {temp_path}")
        # Load and extract text from the PDF
        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}")
        # Split text into overlapping chunks for embedding
        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}")
        # Create an in-memory Chroma client (ephemeral) and embed the chunks
        try:
            log.append("Initializing in-memory ChromaDB...")
            chroma_client = chromadb.Client()  # in-memory, no local storage
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            # Keep a reference to the populated store: the original code
            # discarded the return value, so retrieval could never see the
            # stored chunks.
            process_pdf.vectorstore = Chroma.from_documents(
                splits,
                embeddings,
                client=chroma_client,
            )
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")
        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)
    except Exception as e:
        status_message = "Error"
        log.append(f"Exception occurred: {str(e)}")
    finally:
        # delete=False leaks the temp file unless we remove it ourselves
        if temp_path:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
    return status_message, "\n".join(log)
def retrieve_context(query):
    """Similarity-search the previously processed PDF for chunks matching *query*.

    Reuses the vector store cached by ``process_pdf`` when available. (The
    original code re-created an empty in-memory Chroma client on every call,
    so it could never find the chunks stored earlier.) If no PDF has been
    processed yet, it falls back to a fresh ephemeral client, which simply
    yields no results.

    Args:
        query: Free-text query to match against the stored chunks.

    Returns:
        str: Up to three matching chunks joined by blank lines, or an
        explanatory/error message.
    """
    log = []
    if not query:
        return "Error: No query provided."
    try:
        log.append("Retrieving context from in-memory ChromaDB...")
        # Prefer the store populated by process_pdf earlier in this process
        vectorstore = getattr(process_pdf, "vectorstore", None)
        if vectorstore is None:
            # Fallback: a fresh ephemeral client (contains no documents)
            chroma_client = chromadb.Client()  # ephemeral
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            vectorstore = Chroma(embedding_function=embeddings, client=chroma_client)
        # Perform similarity search over the stored chunks
        results = vectorstore.similarity_search(query, k=3)
        if results:
            log.append(f"Found {len(results)} matching chunk(s).")
            return "\n\n".join([doc.page_content for doc in results])
        log.append("No matching context found in the current in-memory DB.")
        return "No relevant context found. Have you processed a PDF yet?"
    except Exception as e:
        log.append(f"Error retrieving context: {str(e)}")
        return "\n".join(log)
# --- Gradio UI: upload & process a PDF, then query it for relevant chunks ---
with gr.Blocks() as demo:
    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")

    with gr.Row():
        # 'binary' delivers the uploaded file's raw bytes to process_pdf
        uploaded_pdf = gr.File(label="Upload PDF", type="binary")
        run_process = gr.Button("Process PDF")

    status_box = gr.Textbox(label="Processing Status")
    log_box = gr.Textbox(label="Log Output", interactive=False)

    # Processing emits two values: a status message and the run log
    run_process.click(
        fn=process_pdf,
        inputs=uploaded_pdf,
        outputs=[status_box, log_box],
    )

    question_box = gr.Textbox(label="Enter your query")
    run_retrieve = gr.Button("Retrieve Context")
    answer_box = gr.Textbox(label="Retrieved Context")

    run_retrieve.click(
        fn=retrieve_context,
        inputs=question_box,
        outputs=answer_box,
    )

demo.launch()