import gradio as gr
import chromadb
import os
import tempfile
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

# Create the in-memory Chroma client and the embedding model once, at module
# level. Both handlers must share the same client: if retrieve_context built a
# fresh ephemeral client on every call, it would not reliably see the chunks
# stored by process_pdf.
chroma_client = chromadb.Client()  # in-memory, no local storage
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


def process_pdf(file_binary):
    log = []
    status_message = ""
    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."
    try:
        log.append("Starting PDF upload and processing...")

        # Write the uploaded PDF bytes to a temporary file so PyPDFLoader
        # can read it from disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_binary)
            temp_path = temp_file.name
        log.append(f"Temporary PDF path: {temp_path}")

        # Load and extract text from the PDF.
        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}")
        finally:
            os.remove(temp_path)  # clean up the temp file in all cases

        # Split the extracted text into overlapping chunks.
        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}")

        # Embed the chunks and store them in the shared in-memory ChromaDB.
        try:
            log.append("Storing chunks in in-memory ChromaDB...")
            Chroma.from_documents(splits, embeddings, client=chroma_client)
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")

        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)
    except Exception as e:
        status_message = "Error"
        log.append(f"Exception occurred: {e}")
    return status_message, "\n".join(log)


def retrieve_context(query):
    log = []
    if not query:
        return "Error: No query provided."
    try:
        log.append("Retrieving context from in-memory ChromaDB...")

        # Reconnect to the shared in-memory client; this sees the chunks
        # stored by process_pdf.
        vectorstore = Chroma(embedding_function=embeddings, client=chroma_client)

        # Perform a similarity search for the top 3 matching chunks.
        results = vectorstore.similarity_search(query, k=3)
        if results:
            log.append(f"Found {len(results)} matching chunk(s).")
            return "\n\n".join(doc.page_content for doc in results)
        log.append("No matching context found in the current in-memory DB.")
        return "No relevant context found. Have you processed a PDF yet?"
    except Exception as e:
        log.append(f"Error retrieving context: {e}")
        return "\n".join(log)


with gr.Blocks() as demo:
    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")
    with gr.Row():
        # type="binary" delivers the uploaded file as raw bytes.
        pdf_upload = gr.File(label="Upload PDF", type="binary")
        process_button = gr.Button("Process PDF")
    output_text = gr.Textbox(label="Processing Status")
    log_output = gr.Textbox(label="Log Output", interactive=False)
    # Outputs map to [status_message, log_output].
    process_button.click(
        fn=process_pdf,
        inputs=pdf_upload,
        outputs=[output_text, log_output],
    )

    query_input = gr.Textbox(label="Enter your query")
    retrieve_button = gr.Button("Retrieve Context")
    context_output = gr.Textbox(label="Retrieved Context")
    retrieve_button.click(
        fn=retrieve_context,
        inputs=query_input,
        outputs=context_output,
    )

demo.launch()
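
# Optional variant (a sketch, not part of the demo above): if the index should
# survive app restarts, ChromaDB's persistent client can replace the ephemeral
# one. This assumes chromadb >= 0.4 and a writable local directory; the
# "./chroma_db" path is a hypothetical example.
#
#     chroma_client = chromadb.PersistentClient(path="./chroma_db")
#     vectorstore = Chroma(embedding_function=embeddings, client=chroma_client)
#     results = vectorstore.similarity_search("example query", k=3)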