Pijush2023 committed
Commit 7a7da87 · verified · 1 Parent(s): b8feedd

Update app.py

Files changed (1):
  1. app.py +16 -16
app.py CHANGED
@@ -149,6 +149,7 @@ def clear_inputs():
 uploaded_documents = []
 
 
+
 # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
 def process_pdf(pdf_file, uploaded_documents):
     if pdf_file is None:
@@ -156,24 +157,22 @@ def process_pdf(pdf_file, uploaded_documents):
 
     # Open the PDF file and extract text page by page
     with pdfplumber.open(pdf_file.name) as pdf:
-        all_text = ""
+        chunks = []
         for page_num, page in enumerate(pdf.pages, start=1):
             text = page.extract_text()
             if text:
-                all_text += text
-
-    # Split the text into chunks
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    chunks = text_splitter.split_text(all_text)
-
-    # Embed and upload the chunks into the vector database with page number metadata
-    chunk_ids = []
-    for chunk in chunks:
-        document = Document(page_content=chunk, metadata={"page_number": page_num})
-        chunk_id = vectorstore.add_documents([document])
-        chunk_ids.append(chunk_id)
-
-    # Update the upload history with metadata including page number
+                # Split the text into chunks and attach page number metadata to each chunk
+                text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+                page_chunks = text_splitter.split_text(text)
+                for chunk in page_chunks:
+                    # Create a Document with the page number as metadata
+                    document = Document(page_content=chunk, metadata={"page_number": page_num})
+                    chunks.append(document)
+
+    # Embed and upload the chunks into the vector database
+    chunk_ids = vectorstore.add_documents(chunks)
+
+    # Update the upload history
     document_record = {
         "Document Name": pdf_file.name,
         "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
@@ -187,7 +186,8 @@ def process_pdf(pdf_file, uploaded_documents):
     # Convert the list of dictionaries into a list of lists for the dataframe
     table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
 
-    return table_data, f"Uploaded {len(chunks)} chunks to the vector database, with page numbers included as metadata."
+    return table_data, f"Uploaded {len(chunks)} chunks to the vector database with page numbers included as metadata."
+
 
 
 # Gradio Interface
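Why the change matters: the old code concatenated every page into all_text and only split and tagged chunks after the loop, so metadata={"page_number": page_num} stamped every chunk with the loop's final value, i.e. the last page's number. The new code splits each page's text inside the loop, so each chunk carries the page it actually came from, and the per-chunk vectorstore.add_documents([document]) calls collapse into a single batched upload. Below is a minimal standalone sketch of the same pattern, assuming pdfplumber and LangChain are installed; the chunk_pdf_by_page helper and the report.pdf path are illustrative, not part of the commit:

    import pdfplumber
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.docstore.document import Document

    def chunk_pdf_by_page(pdf_path, chunk_size=1000):
        # Hypothetical helper illustrating the commit's per-page chunking pattern.
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
        chunks = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if not text:
                    continue  # skip pages with no extractable text (e.g. scanned images)
                for piece in splitter.split_text(text):
                    # Each chunk records the page it came from, so retrieval can cite pages.
                    chunks.append(Document(page_content=piece, metadata={"page_number": page_num}))
        return chunks

    # Usage with any LangChain-compatible vector store (e.g. the Pinecone index in app.py):
    # chunks = chunk_pdf_by_page("report.pdf")
    # chunk_ids = vectorstore.add_documents(chunks)  # one batched upload, as in the new code

Batching through a single add_documents call also sidesteps the old per-chunk calls, each of which returned a one-element ID list that was appended to chunk_ids.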