Spaces:
Build error
Build error
import gradio as gr | |
import pdfplumber | |
import os | |
from langchain.schema import Document | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Pinecone | |
import pinecone | |
import pandas as pd | |
import time | |
from pinecone.grpc import PineconeGRPC as Pinecone | |
from pinecone import ServerlessSpec | |
# --- Service configuration -------------------------------------------------
# Both keys are read from the environment; os.getenv returns None when a
# variable is unset, in which case the clients below receive None.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# NOTE: at this point `Pinecone` is the gRPC client (PineconeGRPC), because the
# `from pinecone.grpc import PineconeGRPC as Pinecone` import rebinds the name
# previously imported from langchain.vectorstores.
pc = Pinecone(api_key=pinecone_api_key)

# Name of the index that uploaded PDF chunks are written to.
index_name = "italy-kg"

# Create the index if it doesn't exist.
# The existence check goes through the client instance `pc` (the module-level
# `pinecone.list_indexes()` is not available in client versions that expose the
# `Pinecone(...)` class), and `.names()` is used because the listing itself
# contains index descriptions rather than bare name strings.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector size; 1536 presumably matches the
        # default OpenAI embedding model used below -- confirm if the model
        # is ever changed.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
        deletion_protection="disabled",
    )

# Embedding model used to vectorise text chunks before they are upserted.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
# Click handler for the Gradio app: ingests one uploaded PDF into Pinecone
# and reports a one-row log table.
def process_pdf(file):
    """Extract text from an uploaded PDF, chunk it, and store it in Pinecone.

    Args:
        file: The value delivered by the Gradio file input. May be ``None``
            (button clicked with nothing uploaded), a path string, or an
            object exposing the path as ``.name``.

    Returns:
        A ``(status, logs)`` tuple: ``status`` is a human-readable message and
        ``logs`` is a pandas DataFrame with the columns "File Name",
        "File Size (KB)", "Number of Chunks" and "Timestamp" (empty when no
        file was supplied, otherwise a single row).
    """
    log_columns = ["File Name", "File Size (KB)", "Number of Chunks", "Timestamp"]

    # Nothing uploaded: report it instead of failing on attribute access.
    if file is None:
        return "No file uploaded.", pd.DataFrame(columns=log_columns)

    # Accept both a plain path string and a file-like wrapper with `.name`.
    path = file if isinstance(file, str) else file.name

    # Extract text from PDF using pdfplumber. extract_text() can yield None
    # for pages without a text layer (e.g. scanned images), so fall back to "".
    with pdfplumber.open(path) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with newlines so the last word of one page and the first word of
    # the next are not fused together.
    text = "\n".join(page_texts)

    # Split text using RecursiveCharacterTextSplitter
    documents = [Document(page_content=text)]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    docs = text_splitter.split_documents(documents)

    if docs:
        # The module-level name `Pinecone` is rebound to the gRPC client by a
        # later import, so the LangChain vector store is imported here under
        # an unambiguous alias.
        from langchain.vectorstores import Pinecone as PineconeVectorStore

        # Add documents to Pinecone Vector Store
        vectorstore = PineconeVectorStore.from_existing_index(index_name, embeddings)
        vectorstore.add_documents(docs)
        status = "PDF processed successfully!"
    else:
        # No text was extracted, so there is nothing to embed or upload.
        status = "No extractable text found in the PDF; nothing was uploaded."

    # Prepare log data (one row describing this upload)
    log_data = {
        "File Name": [path],
        "File Size (KB)": [os.path.getsize(path) / 1024],
        "Number of Chunks": [len(docs)],
        "Timestamp": [time.strftime("%Y-%m-%d %H:%M:%S")],
    }
    df_logs = pd.DataFrame(log_data, columns=log_columns)
    return status, df_logs
# Gradio interface: a PDF upload + button on the left, status text and the
# log table on the right.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")
    with gr.Row():
        with gr.Column():
            # No explicit `type`: the value "file" is not accepted by every
            # Gradio major version, whereas each version's default delivers
            # something the handler can resolve to a path.
            # NOTE(review): confirm against the Gradio version pinned for
            # this Space.
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            process_button = gr.Button("Process PDF")
        with gr.Column():
            output_text = gr.Textbox(label="Status", interactive=False)
            log_table = gr.DataFrame(label="Logs", interactive=False)
    # Define action on button click: the handler returns (status, logs),
    # mapped in order onto the two output components.
    process_button.click(process_pdf, inputs=pdf_input, outputs=[output_text, log_table])

# Launch the Gradio app
demo.launch()