# app.py — Hugging Face Space by Omarrran (commit f491b53, verified)
import gradio as gr
import chromadb
import os
import tempfile
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
def process_pdf(file_binary):
    """Load an uploaded PDF, split it into chunks, and embed them into an
    in-memory Chroma vector store.

    The populated vector store is cached on the function object as
    ``process_pdf.vectorstore`` so later queries in this process can search
    the same in-memory collection.

    Args:
        file_binary: Raw bytes of the uploaded PDF (Gradio ``type="binary"``).

    Returns:
        tuple[str, str]: A (status message, newline-joined log) pair.
    """
    log = []
    status_message = ""
    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."
    temp_path = None
    try:
        log.append("Starting PDF upload and processing...")
        # Write uploaded PDF bytes to a temporary file so PyPDFLoader can read it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_binary)
            temp_path = temp_file.name
        log.append(f"Temporary PDF path: {temp_path}")
        # Load and extract text from the PDF
        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}")
        # Split text into overlapping chunks for embedding
        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}")
        # Create an in-memory Chroma client (ephemeral) and embed the chunks
        try:
            log.append("Initializing in-memory ChromaDB...")
            chroma_client = chromadb.Client()  # in-memory, no local storage
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            # Keep a reference to the populated store: the original code
            # discarded the return value, so retrieval could never see the
            # stored chunks.
            process_pdf.vectorstore = Chroma.from_documents(
                splits,
                embeddings,
                client=chroma_client,
            )
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")
        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)
    except Exception as e:
        status_message = "Error"
        log.append(f"Exception occurred: {str(e)}")
    finally:
        # delete=False leaks the temp file unless we remove it ourselves
        if temp_path:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
    return status_message, "\n".join(log)
def retrieve_context(query):
    """Similarity-search the previously processed PDF for chunks matching *query*.

    Reuses the vector store cached by ``process_pdf`` when available. (The
    original code re-created an empty in-memory Chroma client on every call,
    so it could never find the chunks stored earlier.) If no PDF has been
    processed yet, it falls back to a fresh ephemeral client, which simply
    yields no results.

    Args:
        query: Free-text query to match against the stored chunks.

    Returns:
        str: Up to three matching chunks joined by blank lines, or an
        explanatory/error message.
    """
    log = []
    if not query:
        return "Error: No query provided."
    try:
        log.append("Retrieving context from in-memory ChromaDB...")
        # Prefer the store populated by process_pdf earlier in this process
        vectorstore = getattr(process_pdf, "vectorstore", None)
        if vectorstore is None:
            # Fallback: a fresh ephemeral client (contains no documents)
            chroma_client = chromadb.Client()  # ephemeral
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            vectorstore = Chroma(embedding_function=embeddings, client=chroma_client)
        # Perform similarity search over the stored chunks
        results = vectorstore.similarity_search(query, k=3)
        if results:
            log.append(f"Found {len(results)} matching chunk(s).")
            return "\n\n".join([doc.page_content for doc in results])
        log.append("No matching context found in the current in-memory DB.")
        return "No relevant context found. Have you processed a PDF yet?"
    except Exception as e:
        log.append(f"Error retrieving context: {str(e)}")
        return "\n".join(log)
# --- Gradio UI: upload & process a PDF, then query it for relevant chunks ---
with gr.Blocks() as demo:
    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")

    with gr.Row():
        # 'binary' delivers the uploaded file's raw bytes to process_pdf
        uploaded_pdf = gr.File(label="Upload PDF", type="binary")
        run_process = gr.Button("Process PDF")

    status_box = gr.Textbox(label="Processing Status")
    log_box = gr.Textbox(label="Log Output", interactive=False)

    # Processing emits two values: a status message and the run log
    run_process.click(
        fn=process_pdf,
        inputs=uploaded_pdf,
        outputs=[status_box, log_box],
    )

    question_box = gr.Textbox(label="Enter your query")
    run_retrieve = gr.Button("Retrieve Context")
    answer_box = gr.Textbox(label="Retrieved Context")

    run_retrieve.click(
        fn=retrieve_context,
        inputs=question_box,
        outputs=answer_box,
    )

demo.launch()