import gradio as gr
import pdfplumber
import os
from datetime import datetime

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
# OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Embeddings via OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
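
# Note: the default OpenAIEmbeddings model (text-embedding-ada-002) produces
# 1536-dimensional vectors, so the Pinecone index must use dimension=1536.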
# Initialize the Pinecone client
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# Pinecone index name
index_name = "italy-kg"
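
# The code below assumes the "italy-kg" index already exists. A minimal sketch
# for creating it if it might not (cloud and region here are assumptions;
# adjust them to your Pinecone project):
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # matches the default OpenAI embedding dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )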
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
# The per-session upload history is held in a gr.State list (see the click
# handler below); process_pdf appends to it in place, so records accumulate
# across uploads within a session.
# Process a PDF: extract its text, split it into chunks, and upload the
# chunks to the vector database
def process_pdf(pdf_file, uploaded_documents):
    if pdf_file is None:
        # Rebuild the table from existing records so the display is preserved
        table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]]
                      for doc in uploaded_documents]
        return table_data, "No PDF file uploaded."

    with pdfplumber.open(pdf_file.name) as pdf:
        all_text = ""
        for page in pdf.pages:
            # extract_text() returns None for image-only pages
            all_text += page.extract_text() or ""
    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    chunks = text_splitter.split_text(all_text)
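    # 300-character chunks with 50 characters of overlap keep each embedded
    # passage small while preserving some context across chunk boundaries.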
    # Embed and upload the chunks into the vector database
    chunk_ids = []
    for chunk in chunks:
        document = Document(page_content=chunk)
        # add_documents() returns a list of IDs, one per document
        chunk_ids.extend(vectorstore.add_documents([document]))
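    # Note: add_documents() also accepts the whole list in a single call,
    # which saves round trips; the loop above mirrors the original
    # per-chunk flow:
    #     chunk_ids = vectorstore.add_documents(
    #         [Document(page_content=c) for c in chunks])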
    # Record this upload in the history
    document_record = {
        "Document Name": os.path.basename(pdf_file.name),
        "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Chunks": len(chunks),
        "Pinecone Index": index_name,
    }
    uploaded_documents.append(document_record)

    # Convert the records into rows for the Dataframe component
    table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]]
                  for doc in uploaded_documents]
    return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")

    with gr.Column():
        # File upload component
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # Button to trigger processing
        process_button = gr.Button("Process PDF and Upload")
        # Dataframe to display uploaded document records
        document_table = gr.Dataframe(
            headers=["Document Name", "Upload Time", "Chunks", "Pinecone Index"],
            interactive=False,
        )
        # Output textbox for results
        output_textbox = gr.Textbox(label="Result")

    # gr.State([]) holds the per-session history list passed to process_pdf
    process_button.click(
        fn=process_pdf,
        inputs=[file_input, gr.State([])],
        outputs=[document_table, output_textbox],
    )

demo.queue()
demo.launch(show_error=True)
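
# Once documents are uploaded, the same vectorstore can serve retrieval, e.g.
# from a separate script or REPL (the query text is illustrative only):
#     results = vectorstore.similarity_search("travel tips for Rome", k=3)
#     for doc in results:
#         print(doc.page_content)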