File size: 4,297 Bytes
f491b53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import gradio as gr
import chromadb
import os
import tempfile
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

# Shared module-level vector store. The original code created a brand-new
# ephemeral chromadb.Client() inside BOTH handlers, so the store built by
# process_pdf was discarded before retrieve_context ever ran — retrieval
# always searched an empty DB. Both handlers now use this single store.
_vectorstore = None


def process_pdf(file_binary):
    """Load an uploaded PDF (raw bytes), split it into chunks, embed them,
    and store the chunks in a shared in-memory Chroma vector store.

    Args:
        file_binary: Raw PDF bytes from the Gradio File component
            (type="binary"), or a falsy value when nothing was uploaded.

    Returns:
        tuple[str, str]: (status_message, newline-joined processing log).
    """
    global _vectorstore
    log = []
    status_message = ""

    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."

    temp_path = None
    try:
        log.append("Starting PDF upload and processing...")

        # PyPDFLoader needs a filesystem path, so spool the bytes to a
        # temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_binary)
            temp_path = temp_file.name
        log.append(f"Temporary PDF path: {temp_path}")

        # Load and extract text from the PDF (one Document per page).
        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}")

        # Split the text into overlapping chunks suitable for embedding.
        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}")

        # Embed and store the chunks, keeping the resulting vector store in
        # the module-level global so retrieve_context() can query it later.
        try:
            log.append("Initializing in-memory ChromaDB...")
            chroma_client = chromadb.Client()  # in-memory, no local storage
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            _vectorstore = Chroma.from_documents(
                splits,
                embeddings,
                client=chroma_client,
            )
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")

        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)

    except Exception as e:
        status_message = "Error"
        log.append(f"Exception occurred: {str(e)}")
    finally:
        # delete=False above means the temp file is NOT auto-removed;
        # unlink it here so repeated uploads don't leak files.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)

    return status_message, "\n".join(log)


def retrieve_context(query):
    """Similarity-search the shared vector store for chunks matching *query*.

    Args:
        query: Free-text search string from the UI.

    Returns:
        str: Top-3 matching chunks joined by blank lines, a "no context"
        message when nothing matches (or no PDF was processed yet), or the
        error log text on failure.
    """
    log = []
    if not query:
        return "Error: No query provided."

    try:
        # Query the SAME store process_pdf() populated. (The original code
        # re-created an empty ephemeral client here, so this always missed.)
        if _vectorstore is None:
            return "No relevant context found. Have you processed a PDF yet?"

        log.append("Retrieving context from in-memory ChromaDB...")
        results = _vectorstore.similarity_search(query, k=3)
        if results:
            log.append(f"Found {len(results)} matching chunk(s).")
            return "\n\n".join(doc.page_content for doc in results)
        else:
            log.append("No matching context found in the current in-memory DB.")
            return "No relevant context found. Have you processed a PDF yet?"

    except Exception as e:
        log.append(f"Error retrieving context: {str(e)}")
        return "\n".join(log)


# --- Gradio UI wiring ------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")

    with gr.Row():
        # type="binary" hands the click handler the raw PDF bytes.
        uploaded_pdf = gr.File(label="Upload PDF", type="binary")
        btn_process = gr.Button("Process PDF")

    status_box = gr.Textbox(label="Processing Status")
    log_box = gr.Textbox(label="Log Output", interactive=False)

    # process_pdf returns (status, log), mapped onto the two textboxes.
    btn_process.click(
        fn=process_pdf,
        inputs=uploaded_pdf,
        outputs=[status_box, log_box],
    )

    query_box = gr.Textbox(label="Enter your query")
    btn_retrieve = gr.Button("Retrieve Context")
    result_box = gr.Textbox(label="Retrieved Context")

    # retrieve_context returns a single string of matched chunks.
    btn_retrieve.click(
        fn=retrieve_context,
        inputs=query_box,
        outputs=result_box,
    )

demo.launch()