import os
from datetime import datetime

import gradio as gr
import pdfplumber
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

# OpenAI API key (read from the environment)
openai_api_key = os.getenv("OPENAI_API_KEY")

# Embedding model via OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Initialize the Pinecone client (PINECONE_API_KEY must be set in the environment)
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Index name and the LangChain vector store wrapping it
index_name = "italy-kg"
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)


def history_as_table(documents):
    """Convert the upload-history dicts into rows for the Gradio dataframe."""
    return [
        [doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]]
        for doc in documents
    ]


# Process a PDF: extract its text, split it into chunks, and upload the chunks
# to the vector database. Returns the updated upload history, the table rows
# to display, and a status message.
def process_pdf(pdf_file, uploaded_documents):
    if pdf_file is None:
        return uploaded_documents, history_as_table(uploaded_documents), "No PDF file uploaded."

    # gr.File may hand back a tempfile-like object (older Gradio) or a plain
    # path string (newer Gradio); normalize to a path
    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

    # Extract the text page by page; extract_text() returns None for
    # image-only pages, so fall back to an empty string
    with pdfplumber.open(pdf_path) as pdf:
        all_text = ""
        for page in pdf.pages:
            all_text += page.extract_text() or ""

    # Split the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    chunks = text_splitter.split_text(all_text)

    # Embed and upload the chunks into the vector database in a single batch
    documents = [Document(page_content=chunk) for chunk in chunks]
    vectorstore.add_documents(documents)

    # Record the upload in the session history
    uploaded_documents.append({
        "Document Name": pdf_path,
        "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Chunks": len(chunks),
        "Pinecone Index": index_name,
    })

    return uploaded_documents, history_as_table(uploaded_documents), f"Uploaded {len(chunks)} chunks to the vector database."


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")

    # Per-session upload history; returned as an output so updates persist
    # across button clicks
    documents_state = gr.State([])

    with gr.Column():
        # File upload component
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])

        # Button to trigger processing
        process_button = gr.Button("Process PDF and Upload")

        # Dataframe to display uploaded document records
        document_table = gr.Dataframe(
            headers=["Document Name", "Upload Time", "Chunks", "Pinecone Index"],
            interactive=False,
        )

        # Output textbox for results
        output_textbox = gr.Textbox(label="Result")

    # Define the button click action
    process_button.click(
        fn=process_pdf,
        inputs=[file_input, documents_state],
        outputs=[documents_state, document_table, output_textbox],
    )

demo.queue()
demo.launch(show_error=True)
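
# Note: the script assumes the "italy-kg" index already exists in Pinecone;
# constructing PineconeVectorStore fails at startup otherwise. Below is a
# minimal sketch of creating it, assuming a serverless index on AWS us-east-1
# and 1536-dimensional vectors (the output size of OpenAI's default
# text-embedding-ada-002 model); adjust cloud, region, and dimension to your
# setup. It would go after the Pinecone client is created and before the
# PineconeVectorStore is constructed:
#
#     from pinecone import ServerlessSpec
#
#     if index_name not in pc.list_indexes().names():
#         pc.create_index(
#             name=index_name,
#             dimension=1536,  # must match the embedding model's vector size
#             metric="cosine",
#             spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # assumed cloud/region
#         )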