Pijush2023 committed
Commit b8feedd · verified · 1 Parent(s): f6db95f

Update app.py

Files changed (1)
  1. app.py +12 -6
app.py CHANGED
@@ -148,27 +148,32 @@ def clear_inputs():
 # Create a global list to store uploaded document records
 uploaded_documents = []
 
+
 # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
 def process_pdf(pdf_file, uploaded_documents):
     if pdf_file is None:
         return uploaded_documents, "No PDF file uploaded."
+
+    # Open the PDF file and extract text page by page
     with pdfplumber.open(pdf_file.name) as pdf:
         all_text = ""
-        for page in pdf.pages:
-            all_text += page.extract_text()
+        for page_num, page in enumerate(pdf.pages, start=1):
+            text = page.extract_text()
+            if text:
+                all_text += text
 
     # Split the text into chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     chunks = text_splitter.split_text(all_text)
 
-    # Embed and upload the chunks into the vector database
+    # Embed and upload the chunks into the vector database with page number metadata
     chunk_ids = []
     for chunk in chunks:
-        document = Document(page_content=chunk)
+        document = Document(page_content=chunk, metadata={"page_number": page_num})
         chunk_id = vectorstore.add_documents([document])
         chunk_ids.append(chunk_id)
 
-    # Update the upload history
+    # Update the upload history with metadata including page number
     document_record = {
         "Document Name": pdf_file.name,
         "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
@@ -182,7 +187,8 @@ def process_pdf(pdf_file, uploaded_documents):
     # Convert the list of dictionaries into a list of lists for the dataframe
     table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
 
-    return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
+    return table_data, f"Uploaded {len(chunks)} chunks to the vector database, with page numbers included as metadata."
+
 
 # Gradio Interface
 with gr.Blocks() as demo:
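One detail worth noting in the new code: `page_num` is only assigned inside the page loop, and the chunk loop runs after the `with` block has finished, so every chunk ends up tagged with the number of the last page rather than the page its text actually came from. A minimal sketch of per-page chunking that keeps the true page number on each chunk, assuming the same pdfplumber/LangChain pieces app.py already imports (exact import paths vary by LangChain version):

```python
import pdfplumber
# Classic import paths; newer LangChain releases move these to
# langchain_text_splitters and langchain_core.documents.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

def pdf_to_documents(pdf_path):
    """Split each page's text into chunks tagged with that page's number."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue  # skip pages with no extractable text (e.g. scanned images)
            # Chunk page by page so each chunk's metadata matches its source page
            for chunk in text_splitter.split_text(text):
                documents.append(
                    Document(page_content=chunk, metadata={"page_number": page_num})
                )
    return documents
```

The resulting list can then go to the vector store in a single call, `vectorstore.add_documents(documents)`, which also sidesteps a small quirk in the committed loop: `add_documents` returns a list of inserted IDs, so appending its return value once per chunk builds a list of one-element lists.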