Update app.py

app.py CHANGED
@@ -149,6 +149,7 @@ def clear_inputs():
 uploaded_documents = []
 
 
+
 # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
 def process_pdf(pdf_file, uploaded_documents):
     if pdf_file is None:
@@ -156,24 +157,22 @@ def process_pdf(pdf_file, uploaded_documents):
 
     # Open the PDF file and extract text page by page
     with pdfplumber.open(pdf_file.name) as pdf:
-
+        chunks = []
         for page_num, page in enumerate(pdf.pages, start=1):
             text = page.extract_text()
             if text:
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Update the upload history with metadata including page number
+                # Split the text into chunks and attach page number metadata to each chunk
+                text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+                page_chunks = text_splitter.split_text(text)
+                for chunk in page_chunks:
+                    # Create a Document with the page number as metadata
+                    document = Document(page_content=chunk, metadata={"page_number": page_num})
+                    chunks.append(document)
+
+    # Embed and upload the chunks into the vector database
+    chunk_ids = vectorstore.add_documents(chunks)
+
+    # Update the upload history
     document_record = {
         "Document Name": pdf_file.name,
         "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
@@ -187,7 +186,8 @@ def process_pdf(pdf_file, uploaded_documents):
     # Convert the list of dictionaries into a list of lists for the dataframe
     table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
 
-    return table_data, f"Uploaded {len(chunks)} chunks to the vector database
+    return table_data, f"Uploaded {len(chunks)} chunks to the vector database with page numbers included as metadata."
+
 
 
 # Gradio Interface
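For readers following along, the added lines assemble into the function below. This is a minimal sketch reconstructed from the diff, not the full file: it assumes `vectorstore` is an already-initialized LangChain vector store backed by the Space's Pinecone index, and `index_name` (used for the "Pinecone Index" column) is likewise an assumed variable, since both are defined outside these hunks. The message for the `pdf_file is None` branch and the middle keys of `document_record` are also elided by the diff, so the versions here are placeholders inferred from the table columns.

# Minimal sketch reconstructed from the "+" lines above. Assumed to be defined
# elsewhere in app.py: `vectorstore`, an initialized LangChain vector store
# backed by the Pinecone index, and `index_name`, that index's name.
from datetime import datetime

import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

uploaded_documents = []

def process_pdf(pdf_file, uploaded_documents):
    if pdf_file is None:
        return [], "Please upload a PDF file."  # placeholder; the diff elides this branch

    # Open the PDF file and extract text page by page
    with pdfplumber.open(pdf_file.name) as pdf:
        chunks = []
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if text:
                # Split the text into chunks and attach page number metadata to each chunk
                text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
                page_chunks = text_splitter.split_text(text)
                for chunk in page_chunks:
                    # Create a Document with the page number as metadata
                    document = Document(page_content=chunk, metadata={"page_number": page_num})
                    chunks.append(document)

    # Embed and upload the chunks into the vector database
    chunk_ids = vectorstore.add_documents(chunks)

    # Update the upload history (the middle keys are assumptions based on the table columns)
    document_record = {
        "Document Name": pdf_file.name,
        "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Chunks": len(chunks),
        "Pinecone Index": index_name,
    }
    uploaded_documents.append(document_record)

    # Convert the list of dictionaries into a list of lists for the dataframe
    table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]

    return table_data, f"Uploaded {len(chunks)} chunks to the vector database with page numbers included as metadata."

Once chunks carry page_number metadata, retrieval results can cite their source page, e.g. doc.metadata["page_number"] for each doc returned by vectorstore.similarity_search(query). Two small cleanups the commit leaves open: chunk_ids is currently unused, and the RecursiveCharacterTextSplitter could be constructed once outside the page loop, since the splitter is stateless.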