Update app.py

app.py CHANGED
@@ -149,6 +149,7 @@ def clear_inputs():
 uploaded_documents = []
 
 
+
 # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
 def process_pdf(pdf_file, uploaded_documents):
     if pdf_file is None:
@@ -156,24 +157,22 @@ def process_pdf(pdf_file, uploaded_documents):
 
     # Open the PDF file and extract text page by page
     with pdfplumber.open(pdf_file.name) as pdf:
-
+        chunks = []
         for page_num, page in enumerate(pdf.pages, start=1):
             text = page.extract_text()
             if text:
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Update the upload history with metadata including page number
+                # Split the text into chunks and attach page number metadata to each chunk
+                text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+                page_chunks = text_splitter.split_text(text)
+                for chunk in page_chunks:
+                    # Create a Document with the page number as metadata
+                    document = Document(page_content=chunk, metadata={"page_number": page_num})
+                    chunks.append(document)
+
+    # Embed and upload the chunks into the vector database
+    chunk_ids = vectorstore.add_documents(chunks)
+
+    # Update the upload history
     document_record = {
         "Document Name": pdf_file.name,
         "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
@@ -187,7 +186,8 @@ def process_pdf(pdf_file, uploaded_documents):
     # Convert the list of dictionaries into a list of lists for the dataframe
     table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
 
-    return table_data, f"Uploaded {len(chunks)} chunks to the vector database
+    return table_data, f"Uploaded {len(chunks)} chunks to the vector database with page numbers included as metadata."
+
 
 
 # Gradio Interface
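For readers following along, the added lines assemble into the function below. This is a minimal sketch reconstructed from the diff, not the full file: it assumes `vectorstore` is an already-initialized LangChain vector store backed by the Space's Pinecone index, and `index_name` (used for the "Pinecone Index" column) is likewise an assumed variable, since both are defined outside these hunks. The message for the `pdf_file is None` branch and the middle keys of `document_record` are also elided by the diff, so the versions here are placeholders inferred from the table columns.

# Minimal sketch reconstructed from the "+" lines above. Assumed to be defined
# elsewhere in app.py: `vectorstore`, an initialized LangChain vector store
# backed by the Pinecone index, and `index_name`, that index's name.
from datetime import datetime

import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

uploaded_documents = []

def process_pdf(pdf_file, uploaded_documents):
    if pdf_file is None:
        return [], "Please upload a PDF file."  # placeholder; the diff elides this branch

    # Open the PDF file and extract text page by page
    with pdfplumber.open(pdf_file.name) as pdf:
        chunks = []
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if text:
                # Split the text into chunks and attach page number metadata to each chunk
                text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
                page_chunks = text_splitter.split_text(text)
                for chunk in page_chunks:
                    # Create a Document with the page number as metadata
                    document = Document(page_content=chunk, metadata={"page_number": page_num})
                    chunks.append(document)

    # Embed and upload the chunks into the vector database
    chunk_ids = vectorstore.add_documents(chunks)

    # Update the upload history (the middle keys are assumptions based on the table columns)
    document_record = {
        "Document Name": pdf_file.name,
        "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Chunks": len(chunks),
        "Pinecone Index": index_name,
    }
    uploaded_documents.append(document_record)

    # Convert the list of dictionaries into a list of lists for the dataframe
    table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]

    return table_data, f"Uploaded {len(chunks)} chunks to the vector database with page numbers included as metadata."

Once chunks carry page_number metadata, retrieval results can cite their source page, e.g. doc.metadata["page_number"] for each doc returned by vectorstore.similarity_search(query). Two small cleanups the commit leaves open: chunk_ids is currently unused, and the RecursiveCharacterTextSplitter could be constructed once outside the page loop, since the splitter is stateless.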