import gradio as gr
import pdfplumber
import os
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
import pandas as pd
import time
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
# NOTE: `Pinecone` is bound three times in this file; this last import is the
# one in effect, so `Pinecone` below refers to the Pinecone client class, not
# the LangChain vector store wrapper.
from pinecone import Pinecone

# OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the Pinecone client
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# Define index name and parameters
index_name = "italy-kg"

# Embedding using OpenAI
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Column order of the log table shown in the UI.
_LOG_COLUMNS = ["File Name", "File Size (KB)", "Number of Chunks", "Timestamp"]


# Gradio Blocks app with PDF uploader and table for logs
def process_pdf(file):
    """Extract text from an uploaded PDF, chunk it and add it to the vector store.

    Args:
        file: The value Gradio passes for the upload component. Either a
            filesystem path string or an object exposing the path as
            ``.name``; ``None`` when nothing was uploaded.

    Returns:
        A ``(status_message, logs_dataframe)`` tuple. The DataFrame has one
        row describing the processed file, or no rows if no file was given.
    """
    if file is None:
        # Button clicked without an upload: report it instead of crashing.
        return "No file uploaded.", pd.DataFrame(columns=_LOG_COLUMNS)

    # Accept both a plain path and a tempfile-like wrapper.
    path = file if isinstance(file, str) else file.name

    # Extract text from PDF using pdfplumber. extract_text() may return None
    # for pages without a text layer, so substitute an empty string.
    with pdfplumber.open(path) as pdf:
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    # Split text using RecursiveCharacterTextSplitter
    documents = [Document(page_content=text)]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    docs = text_splitter.split_documents(documents)

    # Add documents to the Pinecone vector store created at module level.
    # (`Pinecone` here is the client class and has no `from_existing_index`.)
    if docs:
        vectorstore.add_documents(docs)
        status = "PDF processed successfully!"
    else:
        # Nothing to embed (e.g. a scanned PDF); skip the upload call.
        status = "No extractable text found in PDF."

    # Prepare log data
    log_data = {
        "File Name": [path],
        "File Size (KB)": [os.path.getsize(path) / 1024],
        "Number of Chunks": [len(docs)],
        "Timestamp": [time.strftime("%Y-%m-%d %H:%M:%S")],
    }

    # Create a DataFrame for logs
    df_logs = pd.DataFrame(log_data, columns=_LOG_COLUMNS)

    return status, df_logs


# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")

    with gr.Row():
        with gr.Column():
            # `type` is left at the library default: Gradio 4.x rejects
            # type="file", and process_pdf handles both a path string and a
            # file object with `.name`.
            pdf_input = gr.File(label="Upload PDF")
            process_button = gr.Button("Process PDF")

        with gr.Column():
            output_text = gr.Textbox(label="Status", interactive=False)
            log_table = gr.DataFrame(label="Logs", interactive=False)

    # Define action on button click
    process_button.click(process_pdf, inputs=pdf_input, outputs=[output_text, log_table])

# Launch the Gradio app
demo.launch()