Pijush2023 committed
Commit b8feedd · verified · 1 Parent(s): f6db95f

Update app.py

Files changed (1)
  1. app.py +12 -6
app.py CHANGED
@@ -148,27 +148,32 @@ def clear_inputs():
 # Create a global list to store uploaded document records
 uploaded_documents = []
 
+
 # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
 def process_pdf(pdf_file, uploaded_documents):
     if pdf_file is None:
         return uploaded_documents, "No PDF file uploaded."
+
+    # Open the PDF file and extract text page by page
     with pdfplumber.open(pdf_file.name) as pdf:
         all_text = ""
-        for page in pdf.pages:
-            all_text += page.extract_text()
+        for page_num, page in enumerate(pdf.pages, start=1):
+            text = page.extract_text()
+            if text:
+                all_text += text
 
     # Split the text into chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     chunks = text_splitter.split_text(all_text)
 
-    # Embed and upload the chunks into the vector database
+    # Embed and upload the chunks into the vector database with page number metadata
     chunk_ids = []
     for chunk in chunks:
-        document = Document(page_content=chunk)
+        document = Document(page_content=chunk, metadata={"page_number": page_num})
         chunk_id = vectorstore.add_documents([document])
         chunk_ids.append(chunk_id)
 
-    # Update the upload history
+    # Update the upload history with metadata including page number
     document_record = {
         "Document Name": pdf_file.name,
         "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
@@ -182,7 +187,8 @@ def process_pdf(pdf_file, uploaded_documents):
     # Convert the list of dictionaries into a list of lists for the dataframe
     table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
 
-    return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
+    return table_data, f"Uploaded {len(chunks)} chunks to the vector database, with page numbers included as metadata."
+
 
 # Gradio Interface
 with gr.Blocks() as demo:
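One detail worth noting in the new code: `page_num` is only assigned inside the page loop, and the chunk loop runs after the `with` block has finished, so every chunk ends up tagged with the number of the last page rather than the page its text actually came from. A minimal sketch of per-page chunking that keeps the true page number on each chunk, assuming the same pdfplumber/LangChain pieces app.py already imports (exact import paths vary by LangChain version):

```python
import pdfplumber
# Classic import paths; newer LangChain releases move these to
# langchain_text_splitters and langchain_core.documents.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

def pdf_to_documents(pdf_path):
    """Split each page's text into chunks tagged with that page's number."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue  # skip pages with no extractable text (e.g. scanned images)
            # Chunk page by page so each chunk's metadata matches its source page
            for chunk in text_splitter.split_text(text):
                documents.append(
                    Document(page_content=chunk, metadata={"page_number": page_num})
                )
    return documents
```

The resulting list can then go to the vector store in a single call, `vectorstore.add_documents(documents)`, which also sidesteps a small quirk in the committed loop: `add_documents` returns a list of inserted IDs, so appending its return value once per chunk builds a list of one-element lists.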