Pijush2023's picture
Update app.py
7ee8ac6 verified
raw
history blame
2.69 kB
import gradio as gr
import pdfplumber
import os
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
import pandas as pd
import time
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
# --- Service credentials -----------------------------------------------------
# Both keys are read from the environment; os.getenv yields None when unset.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Pinecone client (``Pinecone`` is bound to PineconeGRPC by the import above).
pc = Pinecone(api_key=pinecone_api_key)

# Name of the index that receives the PDF chunks.
index_name = "italy-kg"

# Create the index only when it is missing.
# The check goes through the authenticated client ``pc`` rather than the
# module-level ``pinecone.list_indexes()``, and compares against the plain
# index names so the ``in`` test is a string-to-string comparison.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): 1536 presumably matches the output size of the OpenAI
        # embedding model configured below -- confirm if the model changes.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
        deletion_protection="disabled",
    )

# Embedding function used when writing chunks to the index.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
# Gradio Blocks app with PDF uploader and table for logs
def process_pdf(file):
    """Extract text from an uploaded PDF, index it in Pinecone, and log the run.

    Args:
        file: The upload handed over by the Gradio file component. Either a
            filesystem path (``str``) or an object exposing the path as
            ``.name``. ``None`` means nothing was uploaded.

    Returns:
        A ``(status, logs)`` tuple: a human-readable status string and a
        pandas DataFrame with the columns "File Name", "File Size (KB)",
        "Number of Chunks" and "Timestamp" (one row per processed file,
        no rows when no file was supplied).
    """
    log_columns = ["File Name", "File Size (KB)", "Number of Chunks", "Timestamp"]

    # Clicking the button without an upload passes None; report it instead of
    # failing on attribute access.
    if file is None:
        return "Please upload a PDF file first.", pd.DataFrame(columns=log_columns)

    # Accept both a plain path and a tempfile-like wrapper with ``.name``.
    pdf_path = file if isinstance(file, str) else file.name

    # extract_text() may yield None for pages without a text layer, so fall
    # back to "" per page. Pages are joined with a newline so the last word of
    # one page is not fused with the first word of the next.
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    text = "\n".join(page_texts)

    # Split into overlapping chunks; skip splitting when there is no text.
    if text.strip():
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
        docs = text_splitter.split_documents([Document(page_content=text)])
    else:
        docs = []

    if docs:
        # The module-level name ``Pinecone`` is rebound to the PineconeGRPC
        # client by a later import, which has no ``from_existing_index``.
        # Import the LangChain vector store under an unambiguous local alias.
        from langchain.vectorstores import Pinecone as PineconeVectorStore

        vectorstore = PineconeVectorStore.from_existing_index(index_name, embeddings)
        vectorstore.add_documents(docs)
        status = "PDF processed successfully!"
    else:
        # Nothing to embed (e.g. a scanned, image-only PDF): do not call the
        # vector store with an empty batch.
        status = "No extractable text found in the PDF; nothing was uploaded."

    # One-row log describing this run.
    df_logs = pd.DataFrame(
        {
            "File Name": [pdf_path],
            "File Size (KB)": [os.path.getsize(pdf_path) / 1024],
            "Number of Chunks": [len(docs)],
            "Timestamp": [time.strftime("%Y-%m-%d %H:%M:%S")],
        },
        columns=log_columns,
    )
    return status, df_logs
# Gradio interface: upload column on the left, status and log table on the right.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")
    with gr.Row():
        with gr.Column():
            # No explicit ``type``: the value "file" is rejected by newer
            # Gradio releases, while the component default yields a value
            # whose path is available to process_pdf on old and new releases.
            # ``file_types`` limits the picker to PDFs.
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            process_button = gr.Button("Process PDF")
        with gr.Column():
            output_text = gr.Textbox(label="Status", interactive=False)
            log_table = gr.DataFrame(label="Logs", interactive=False)

    # Wire the button to the processing function; the handler returns the
    # status string and the log table, in that order.
    process_button.click(
        process_pdf,
        inputs=pdf_input,
        outputs=[output_text, log_table],
    )

# Launch the Gradio app
demo.launch()