radarbackend11262024v11

Runtime error

App Files Files Community

Pijush2023 committed on Oct 14, 2024

Commit

08b4bf1

verified ·

1 Parent(s): 3a9b7db

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -14

app.py CHANGED Viewed

@@ -14,19 +14,63 @@ from langchain_pinecone import PineconeVectorStore
 # OpenAI API key
 openai_api_key = os.getenv("OPENAI_API_KEY")
 # Initialize Pinecone with PineconeGRPC
 from pinecone import Pinecone
 pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
 # Define index name and parameters
 index_name = "italy-kg"
-# Embedding using OpenAI
-embeddings = OpenAIEmbeddings(api_key=openai_api_key)
-vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
 # Gradio Blocks app with PDF uploader and table for logs
 def process_pdf(file):
     # Extract text from PDF using pdfplumber
@@ -61,17 +105,23 @@ def process_pdf(file):
 with gr.Blocks() as demo:
     gr.Markdown("# PDF Uploader to Pinecone with Logs")
-    with gr.Row():
         with gr.Column():
-            pdf_input = gr.File(label="Upload PDF", type="filepath")
-            process_button = gr.Button("Process PDF")
-        with gr.Column():
-            output_text = gr.Textbox(label="Status", interactive=False)
-            log_table = gr.DataFrame(label="Logs", interactive=False)
-    # Define action on button click
-    process_button.click(process_pdf, inputs=pdf_input, outputs=[output_text, log_table])
-# Launch the Gradio app
-demo.launch()

 # OpenAI API key
 openai_api_key = os.getenv("OPENAI_API_KEY")
+# Embedding using OpenAI
+embeddings = OpenAIEmbeddings(api_key=openai_api_key)
 # Initialize Pinecone with PineconeGRPC
 from pinecone import Pinecone
 pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
 # Define index name and parameters
 index_name = "italy-kg"
+vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
+# Create a global list to store uploaded document records
+uploaded_documents = []
+from datetime import datetime
+from langchain_core.documents import Document
+# Function to process PDF, extract text, split it into chunks, and upload to the vector DB
+def process_pdf(pdf_file,uploaded_documents):
+    if pdf_file is None:
+        return uploaded_documents, "No PDF file uploaded."
+    with pdfplumber.open(pdf_file) as pdf:
+        all_text = ""
+        for page in pdf.pages:
+            all_text += page.extract_text()
+    # Split the text into chunks
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
+    chunks = text_splitter.split_text(all_text)
+    # Embed and upload the chunks into the vector database
+    chunk_ids = []
+    for chunk in chunks:
+        document = Document(page_content=chunk)
+        chunk_id = vectorstore.add_documents([document])
+        chunk_ids.append(chunk_id)
+    # Update the upload history
+    document_record = {
+        "Document Name": pdf_file.name,
+        "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "Chunks": len(chunks),
+        "Pinecone Index": index_name
+    }
+    # Add the record to the global list
+    uploaded_documents.append(document_record)
+    # Convert the list of dictionaries into a list of lists for the dataframe
+    table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
+    return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
 # Gradio Blocks app with PDF uploader and table for logs
 def process_pdf(file):
     # Extract text from PDF using pdfplumber
 with gr.Blocks() as demo:
     gr.Markdown("# PDF Uploader to Pinecone with Logs")
+# File upload component
         with gr.Column():
+                file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+                # Button to trigger processing
+                process_button = gr.Button("Process PDF and Upload")
+                # Dataframe to display uploaded document records
+                document_table = gr.Dataframe(headers=["Document Name", "Upload Time", "Chunks", "Pinecone Index"], interactive=False)
+                # Output textbox for results
+                output_textbox = gr.Textbox(label="Result")
+                # Define button click action
+                # process_button.click(fn=process_pdf, inputs=file_input, outputs=output_textbox)
+                process_button.click(fn=process_pdf, inputs=[file_input, gr.State([])], outputs=[document_table, output_textbox])
+demo.queue()
+demo.launch(show_error=True)