File size: 4,297 Bytes
f491b53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import gradio as gr
import chromadb
import os
import tempfile
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

# Shared module-level vector store. The original code created a brand-new
# ephemeral chromadb.Client() inside BOTH handlers, so the store built by
# process_pdf was discarded before retrieve_context ever ran — retrieval
# always searched an empty DB. Both handlers now use this single store.
_vectorstore = None


def process_pdf(file_binary):
    """Load an uploaded PDF (raw bytes), split it into chunks, embed them,
    and store the chunks in a shared in-memory Chroma vector store.

    Args:
        file_binary: Raw PDF bytes from the Gradio File component
            (type="binary"), or a falsy value when nothing was uploaded.

    Returns:
        tuple[str, str]: (status_message, newline-joined processing log).
    """
    global _vectorstore
    log = []
    status_message = ""

    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."

    temp_path = None
    try:
        log.append("Starting PDF upload and processing...")

        # PyPDFLoader needs a filesystem path, so spool the bytes to a
        # temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_binary)
            temp_path = temp_file.name
        log.append(f"Temporary PDF path: {temp_path}")

        # Load and extract text from the PDF (one Document per page).
        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}")

        # Split the text into overlapping chunks suitable for embedding.
        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}")

        # Embed and store the chunks, keeping the resulting vector store in
        # the module-level global so retrieve_context() can query it later.
        try:
            log.append("Initializing in-memory ChromaDB...")
            chroma_client = chromadb.Client()  # in-memory, no local storage
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            _vectorstore = Chroma.from_documents(
                splits,
                embeddings,
                client=chroma_client,
            )
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")

        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)

    except Exception as e:
        status_message = "Error"
        log.append(f"Exception occurred: {str(e)}")
    finally:
        # delete=False above means the temp file is NOT auto-removed;
        # unlink it here so repeated uploads don't leak files.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)

    return status_message, "\n".join(log)


def retrieve_context(query):
    """Similarity-search the shared vector store for chunks matching *query*.

    Args:
        query: Free-text search string from the UI.

    Returns:
        str: Top-3 matching chunks joined by blank lines, a "no context"
        message when nothing matches (or no PDF was processed yet), or the
        error log text on failure.
    """
    log = []
    if not query:
        return "Error: No query provided."

    try:
        # Query the SAME store process_pdf() populated. (The original code
        # re-created an empty ephemeral client here, so this always missed.)
        if _vectorstore is None:
            return "No relevant context found. Have you processed a PDF yet?"

        log.append("Retrieving context from in-memory ChromaDB...")
        results = _vectorstore.similarity_search(query, k=3)
        if results:
            log.append(f"Found {len(results)} matching chunk(s).")
            return "\n\n".join(doc.page_content for doc in results)
        else:
            log.append("No matching context found in the current in-memory DB.")
            return "No relevant context found. Have you processed a PDF yet?"

    except Exception as e:
        log.append(f"Error retrieving context: {str(e)}")
        return "\n".join(log)


# --- Gradio UI wiring ------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")

    with gr.Row():
        # type="binary" hands the click handler the raw PDF bytes.
        uploaded_pdf = gr.File(label="Upload PDF", type="binary")
        btn_process = gr.Button("Process PDF")

    status_box = gr.Textbox(label="Processing Status")
    log_box = gr.Textbox(label="Log Output", interactive=False)

    # process_pdf returns (status, log), mapped onto the two textboxes.
    btn_process.click(
        fn=process_pdf,
        inputs=uploaded_pdf,
        outputs=[status_box, log_box],
    )

    query_box = gr.Textbox(label="Enter your query")
    btn_retrieve = gr.Button("Retrieve Context")
    result_box = gr.Textbox(label="Retrieved Context")

    # retrieve_context returns a single string of matched chunks.
    btn_retrieve.click(
        fn=retrieve_context,
        inputs=query_box,
        outputs=result_box,
    )

demo.launch()