Spaces:

Omarrran
/

Context_Retriever_with_ChromaDB_In-Memory

Sleeping

App Files Files Community

Omarrran committed 21 days ago

Commit

f491b53

verified ·

1 Parent(s): 6e11324

Create app.py

Browse files

Files changed (1) hide show

app.py +125 -0

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import gradio as gr
+import chromadb
+import os
+import tempfile
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.document_loaders import PyPDFLoader
+def process_pdf(file_binary):
+    log = []
+    status_message = ""
+    if not file_binary:
+        return "No file uploaded.", "Error: No file was provided."
+    try:
+        log.append("Starting PDF upload and processing...")
+        # Write uploaded PDF bytes to a temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            temp_file.write(file_binary)
+            temp_path = temp_file.name
+        log.append(f"Temporary PDF path: {temp_path}")
+        # Load and extract text from the PDF
+        try:
+            loader = PyPDFLoader(temp_path)
+            documents = loader.load()
+            log.append(f"Loaded {len(documents)} page(s) from PDF.")
+        except Exception as e:
+            raise RuntimeError(f"Error loading PDF: {e}")
+        # Split text into chunks
+        try:
+            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+            splits = text_splitter.split_documents(documents)
+            log.append(f"Text split into {len(splits)} chunk(s).")
+        except Exception as e:
+            raise RuntimeError(f"Error splitting text: {e}")
+        # Create an in-memory Chroma client (ephemeral)
+        try:
+            log.append("Initializing in-memory ChromaDB...")
+            chroma_client = chromadb.Client()  # in-memory, no local storage
+            embeddings = HuggingFaceEmbeddings(
+                model_name="sentence-transformers/all-MiniLM-L6-v2"
+            )
+            Chroma.from_documents(
+                splits,
+                embeddings,
+                client=chroma_client
+            )
+            log.append("Successfully stored PDF chunks in ChromaDB.")
+        except Exception as e:
+            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")
+        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
+        log.append(status_message)
+    except Exception as e:
+        status_message = "Error"
+        log.append(f"Exception occurred: {str(e)}")
+    return status_message, "\n".join(log)
+def retrieve_context(query):
+    log = []
+    if not query:
+        return "Error: No query provided."
+    try:
+        log.append("Retrieving context from in-memory ChromaDB...")
+        # Re-initialize the in-memory Chroma client each time
+        chroma_client = chromadb.Client()  # ephemeral
+        embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2"
+        )
+        vectorstore = Chroma(embedding_function=embeddings, client=chroma_client)
+        # Perform similarity search
+        results = vectorstore.similarity_search(query, k=3)
+        if results:
+            log.append(f"Found {len(results)} matching chunk(s).")
+            return "\n\n".join([doc.page_content for doc in results])
+        else:
+            log.append("No matching context found in the current in-memory DB.")
+            return "No relevant context found. Have you processed a PDF yet?"
+    except Exception as e:
+        log.append(f"Error retrieving context: {str(e)}")
+        return "\n".join(log)
+with gr.Blocks() as demo:  # UI definition: components appear in the order they are created below
+    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")
+    with gr.Row():  # upload widget and its action button share one row
+        # type="binary" hands process_pdf the uploaded file's raw bytes,
+        # which it writes to a temporary .pdf before loading.
+        pdf_upload = gr.File(label="Upload PDF", type="binary")
+        process_button = gr.Button("Process PDF")
+    output_text = gr.Textbox(label="Processing Status")
+    log_output = gr.Textbox(label="Log Output", interactive=False)
+    # process_pdf returns (status_message, log_text); they fill
+    # output_text and log_output in that order.
+    process_button.click(
+        fn=process_pdf,
+        inputs=pdf_upload,
+        outputs=[output_text, log_output]
+    )
+    query_input = gr.Textbox(label="Enter your query")
+    retrieve_button = gr.Button("Retrieve Context")
+    context_output = gr.Textbox(label="Retrieved Context")  # also shows retrieve_context's error text
+    retrieve_button.click(
+        fn=retrieve_context,
+        inputs=query_input,
+        outputs=context_output
+    )
+demo.launch()  # runs whenever this module executes; there is no __main__ guard