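"""Gradio app: upload a PDF, split its text into chunks, embed the chunks with
OpenAI, and index them in a Pinecone vector store, logging each upload.

Expects OPENAI_API_KEY and PINECONE_API_KEY to be set in the environment.
"""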
import gradio as gr
import pdfplumber
import os
from datetime import datetime

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
# OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")
# Embedding using OpenAI
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
# Initialize the Pinecone client
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# Define index name and parameters
index_name = "italy-kg"
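# Create the index if it does not exist yet. This guard is a sketch added
# here, not part of the original app: dimension 1536 matches OpenAI's default
# text-embedding-ada-002 vectors, and the aws/us-east-1 serverless spec is an
# assumed placement; adjust both to your account's settings.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must match the embedding model's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )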
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
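# PineconeVectorStore binds the embedding model to the index: its
# add_documents() call embeds each Document and upserts the resulting vector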
# Per-session upload history is supplied to process_pdf via gr.State (see the
# click handler below), so no module-level list is needed
# Process a PDF: extract its text, split it into chunks, and upload them to the vector DB
def process_pdf(pdf_file, uploaded_documents):
    if pdf_file is None:
        return uploaded_documents, "No PDF file uploaded."

    # Extract text page by page; extract_text() returns None for pages
    # without extractable text, so guard before concatenating
    with pdfplumber.open(pdf_file.name) as pdf:
        all_text = ""
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                all_text += page_text

    # Split the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    chunks = text_splitter.split_text(all_text)
    # Embed and upload the chunks into the vector database
    chunk_ids = []
    for chunk in chunks:
        document = Document(page_content=chunk)
        # add_documents returns a list of IDs, so extend rather than append
        chunk_ids.extend(vectorstore.add_documents([document]))
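    # Note: one add_documents() call per chunk mirrors the original flow but
    # costs a network round-trip per chunk. A batched alternative (a sketch
    # using the same vectorstore, no new names) would be:
    #     chunk_ids = vectorstore.add_documents(
    #         [Document(page_content=c) for c in chunks]
    #     )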
    # Update the upload history
    document_record = {
        "Document Name": pdf_file.name,
        "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Chunks": len(chunks),
        "Pinecone Index": index_name,
    }
    # Add the record to the session's history list
    uploaded_documents.append(document_record)

    # Convert the list of dictionaries into a list of lists for the dataframe
    table_data = [
        [doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]]
        for doc in uploaded_documents
    ]
    return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")

    with gr.Column():
        # File upload component
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # Button to trigger processing
        process_button = gr.Button("Process PDF and Upload")
        # Dataframe to display uploaded document records
        document_table = gr.Dataframe(
            headers=["Document Name", "Upload Time", "Chunks", "Pinecone Index"],
            interactive=False,
        )
        # Output textbox for results
        output_textbox = gr.Textbox(label="Result")

    # Define button click action; gr.State([]) holds the per-session history
    process_button.click(
        fn=process_pdf,
        inputs=[file_input, gr.State([])],
        outputs=[document_table, output_textbox],
    )
demo.queue()
demo.launch(show_error=True)