Update app.py
app.py
CHANGED
@@ -148,27 +148,32 @@ def clear_inputs():
 # Create a global list to store uploaded document records
 uploaded_documents = []
 
+
 # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
 def process_pdf(pdf_file, uploaded_documents):
     if pdf_file is None:
         return uploaded_documents, "No PDF file uploaded."
+
+    # Open the PDF file and extract text page by page
     with pdfplumber.open(pdf_file.name) as pdf:
         all_text = ""
-        for page in pdf.pages:
-            all_text += page.extract_text()
+        for page_num, page in enumerate(pdf.pages, start=1):
+            text = page.extract_text()
+            if text:
+                all_text += text
 
     # Split the text into chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     chunks = text_splitter.split_text(all_text)
 
-    # Embed and upload the chunks into the vector database
+    # Embed and upload the chunks into the vector database with page number metadata
     chunk_ids = []
     for chunk in chunks:
-        document = Document(page_content=chunk)
+        document = Document(page_content=chunk, metadata={"page_number": page_num})
         chunk_id = vectorstore.add_documents([document])
         chunk_ids.append(chunk_id)
 
-    # Update the upload history
+    # Update the upload history with metadata including page number
     document_record = {
         "Document Name": pdf_file.name,
         "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
@@ -182,7 +187,8 @@ def process_pdf(pdf_file, uploaded_documents):
     # Convert the list of dictionaries into a list of lists for the dataframe
     table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
 
-    return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
+    return table_data, f"Uploaded {len(chunks)} chunks to the vector database, with page numbers included as metadata."
+
 
 # Gradio Interface
 with gr.Blocks() as demo:
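A note on how the added metadata behaves: `page_num` is the loop variable of the extraction loop, and the chunk loop only runs after the `with pdfplumber.open(...)` block has finished, so at that point `page_num` holds the last page's number and every chunk is tagged with that same value. A minimal sketch of one way to keep per-page numbers accurate is to split each page's text separately. The function name `process_pdf_per_page` is hypothetical, the import paths are assumptions (app.py's imports are not shown in this diff), and `vectorstore` stands in for the same pre-configured Pinecone-backed store the committed code already references:

import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def process_pdf_per_page(pdf_path, vectorstore):
    # Same splitter settings as the committed code.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue  # skip pages with no extractable text (e.g. scanned images)
            # Split this page's text on its own, so each resulting chunk
            # carries the page number it actually came from.
            for chunk in splitter.split_text(text):
                documents.append(Document(page_content=chunk, metadata={"page_number": page_num}))
    # One batched call returns one ID per document, already flat.
    return vectorstore.add_documents(documents)

Batching everything into a single `add_documents` call also avoids the committed code's pattern of appending one returned list per chunk, which leaves `chunk_ids` as a list of single-element lists.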
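The payoff of per-chunk page metadata comes at query time. Assuming `vectorstore` is a LangChain Pinecone vector store, a metadata filter can restrict retrieval to a single page; the query text and page value below are placeholders:

results = vectorstore.similarity_search(
    "example query about the uploaded PDF",
    k=4,
    filter={"page_number": 3},  # only match chunks tagged with page 3
)
for doc in results:
    print(doc.metadata.get("page_number"), doc.page_content[:80])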