"""Gradio app: upload a PDF, chunk its text, and store the chunks in Pinecone.

The script creates the Pinecone index on start-up if it is missing, then
serves a small UI with a file uploader, a status box and a log table.
"""

import os
import time

import gradio as gr
import pandas as pd
import pdfplumber
import pinecone
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Aliased so it is not shadowed by the Pinecone gRPC client imported below.
from langchain.vectorstores import Pinecone as PineconeVectorStore
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone

# OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize Pinecone with PineconeGRPC
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Define index name and parameters
index_name = "italy-kg"

# Column order of the log table shown in the UI.
LOG_COLUMNS = ["File Name", "File Size (KB)", "Number of Chunks", "Timestamp"]

# Create index if it doesn't exist. The lookup goes through the client
# instance `pc`, which carries the API key.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
        deletion_protection="disabled",
    )

# Embedding using OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)


def _extract_pdf_text(path):
    """Return the text of every page of the PDF at *path*, newline-joined."""
    with pdfplumber.open(path) as pdf:
        # extract_text() may return None for pages without a text layer, so
        # fall back to "". Joining with a newline keeps the last word of one
        # page from fusing with the first word of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def process_pdf(file):
    """Index an uploaded PDF in Pinecone and report what was stored.

    Args:
        file: The value delivered by the ``gr.File`` input. Either a str
            path or an object exposing the path as ``.name``; ``None`` when
            nothing has been uploaded.

    Returns:
        A ``(status_message, logs)`` pair, where ``logs`` is a pandas
        DataFrame with the columns in ``LOG_COLUMNS`` (empty when nothing
        was indexed).
    """
    if file is None:
        return "Please upload a PDF first.", pd.DataFrame(columns=LOG_COLUMNS)

    path = file if isinstance(file, str) else file.name

    # Extract text from PDF using pdfplumber
    text = _extract_pdf_text(path)

    # Split text using RecursiveCharacterTextSplitter
    documents = [Document(page_content=text)]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    docs = text_splitter.split_documents(documents)

    if not docs:
        # Nothing to embed (e.g. a scanned PDF without a text layer); skip
        # the upload instead of sending an empty batch.
        return (
            "No extractable text found in the PDF.",
            pd.DataFrame(columns=LOG_COLUMNS),
        )

    # Add documents to Pinecone Vector Store
    # NOTE(review): the LangChain wrapper must be compatible with the
    # installed pinecone client version - confirm in the deployment env.
    vectorstore = PineconeVectorStore.from_existing_index(index_name, embeddings)
    vectorstore.add_documents(docs)

    # Prepare log data
    log_data = {
        "File Name": [path],
        "File Size (KB)": [os.path.getsize(path) / 1024],
        "Number of Chunks": [len(docs)],
        "Timestamp": [time.strftime("%Y-%m-%d %H:%M:%S")],
    }

    # Create a DataFrame for logs
    df_logs = pd.DataFrame(log_data, columns=LOG_COLUMNS)

    return "PDF processed successfully!", df_logs


# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")

    with gr.Row():
        with gr.Column():
            # NOTE(review): type="file" looks like the Gradio 3.x spelling;
            # newer releases may expect "filepath" - confirm against the
            # installed version. process_pdf accepts either form.
            pdf_input = gr.File(label="Upload PDF", type="file")
            process_button = gr.Button("Process PDF")

        with gr.Column():
            output_text = gr.Textbox(label="Status", interactive=False)
            log_table = gr.DataFrame(label="Logs", interactive=False)

    # Define action on button click
    process_button.click(process_pdf, inputs=pdf_input, outputs=[output_text, log_table])

# Launch the Gradio app
demo.launch()