Omarrran committed on
Commit
f491b53
·
verified ·
1 Parent(s): 6e11324

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -0
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import chromadb
3
+ import os
4
+ import tempfile
5
+ from langchain.embeddings import HuggingFaceEmbeddings
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ from langchain.document_loaders import PyPDFLoader
9
+
10
def process_pdf(file_binary):
    """Load an uploaded PDF, chunk it, and store the chunks in ChromaDB.

    The bytes are spilled to a temporary ``.pdf`` file, loaded with
    ``PyPDFLoader``, split with ``CharacterTextSplitter`` (chunk size 500,
    overlap 50), embedded with ``sentence-transformers/all-MiniLM-L6-v2``
    and handed to ``Chroma.from_documents`` on a ``chromadb.Client()``.

    Args:
        file_binary: Raw bytes of the PDF. A falsy value (``None`` or
            ``b""``) is treated as "no file uploaded".

    Returns:
        A ``(status_message, log_text)`` tuple. ``status_message`` is the
        success message or the literal ``"Error"``; ``log_text`` is the
        newline-joined step log, including the exception text on failure.

    This function does not raise: every failure is caught and reported
    through the returned tuple.
    """
    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."

    log = []
    status_message = ""
    temp_path = None  # set as soon as the temp file exists, for cleanup

    try:
        log.append("Starting PDF upload and processing...")

        # The loader is given a path, so the bytes have to hit disk first.
        # delete=False keeps the file around after the handle is closed;
        # it is removed explicitly in the ``finally`` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Record the path before writing so a failed write is still
            # cleaned up.
            temp_path = temp_file.name
            temp_file.write(file_binary)
        log.append(f"Temporary PDF path: {temp_path}")

        # Load and extract text from the PDF
        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}") from e

        # Split text into chunks
        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}") from e

        # Embed the chunks and store them via a chromadb.Client().
        # NOTE(review): a later query builds its own chromadb.Client();
        # finding these chunks relies on in-process clients sharing one
        # in-memory store - confirm for the installed chromadb version.
        try:
            log.append("Initializing in-memory ChromaDB...")
            chroma_client = chromadb.Client()
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            Chroma.from_documents(
                splits,
                embeddings,
                client=chroma_client,
            )
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}") from e

        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)

    except Exception as e:
        # Top-level boundary for the UI callback: report, never propagate.
        status_message = "Error"
        log.append(f"Exception occurred: {str(e)}")
    finally:
        # The temp file is only needed while loading; remove it on every
        # path so repeated uploads do not pile up files on disk.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup; the file may already be gone

    return status_message, "\n".join(log)
66
+
67
+
68
def retrieve_context(query):
    """Return the stored chunks most similar to ``query``.

    Builds a ``Chroma`` vector store over a ``chromadb.Client()`` using the
    ``sentence-transformers/all-MiniLM-L6-v2`` embeddings and runs a
    similarity search for the top 3 matches.

    Args:
        query: The search text. Empty, ``None`` or whitespace-only input
            is rejected without touching the vector store.

    Returns:
        A single string: the matching chunks' ``page_content`` joined by
        blank lines, a "no relevant context" hint when the search returns
        nothing, or an error description. This function does not raise.
    """
    # A whitespace-only query carries no search intent; treat it like an
    # empty one instead of loading the model and searching for it.
    if not query or (isinstance(query, str) and not query.strip()):
        return "Error: No query provided."

    # Only surfaced on failure, where it gives the error some context.
    log = ["Retrieving context from in-memory ChromaDB..."]

    try:
        # NOTE(review): this builds a new chromadb.Client() per call;
        # finding chunks stored earlier relies on in-process clients
        # sharing one in-memory store - confirm for the installed
        # chromadb version.
        chroma_client = chromadb.Client()
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        vectorstore = Chroma(embedding_function=embeddings, client=chroma_client)

        results = vectorstore.similarity_search(query, k=3)
        if not results:
            return "No relevant context found. Have you processed a PDF yet?"
        return "\n\n".join(doc.page_content for doc in results)

    except Exception as e:
        # Top-level boundary for the UI callback: report, never propagate.
        log.append(f"Error retrieving context: {str(e)}")
        return "\n".join(log)
95
+
96
+
97
# Gradio UI: one section to upload and index a PDF, one to query it.
with gr.Blocks() as demo:
    gr.Markdown(value="## PDF Context Retriever with ChromaDB (In-Memory)")

    # --- Upload / indexing section ---
    with gr.Row():
        # type="binary" makes the callback receive the file's raw bytes.
        pdf_upload = gr.File(label="Upload PDF", type="binary")
        process_button = gr.Button(value="Process PDF")

    output_text = gr.Textbox(label="Processing Status")
    log_output = gr.Textbox(label="Log Output", interactive=False)

    # process_pdf returns (status_message, log_text), in that order.
    process_button.click(
        process_pdf,
        pdf_upload,
        [output_text, log_output],
    )

    # --- Query section ---
    query_input = gr.Textbox(label="Enter your query")
    retrieve_button = gr.Button(value="Retrieve Context")
    context_output = gr.Textbox(label="Retrieved Context")

    # retrieve_context returns one string for the context box.
    retrieve_button.click(
        retrieve_context,
        query_input,
        context_output,
    )

demo.launch()