Spaces:
Build error
Build error
File size: 2,693 Bytes
7f3430b 3ce55e9 7f3430b 2acce8f b43948a 0839454 7ee8ac6 b43948a 7f3430b 1568ea2 b43948a 563a689 b43948a f694fcb b43948a f694fcb b43948a b773d17 b43948a 7f3430b 7ee5252 b43948a 7ee5252 b43948a 7f3430b b43948a 0f10bbc dd4345a b43948a 98a314b 3ce55e9 b43948a dd4345a b43948a dd4345a b43948a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import gradio as gr
import pdfplumber
import os
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
import pandas as pd
import time
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
# API credentials are read from the environment.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Pinecone client. At this point `Pinecone` is the PineconeGRPC alias, because
# the `pinecone.grpc` import is the last one to bind that name.
pc = Pinecone(api_key=pinecone_api_key)

# Name of the Pinecone index that receives the PDF chunks.
index_name = "italy-kg"

# Create the index if it doesn't exist. The lookup goes through the client
# instance: the module-level `pinecone.list_indexes()` helper is not available
# in the client generation that provides `PineconeGRPC`/`ServerlessSpec`, and
# `.names()` gives the plain index names needed for a membership test.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): presumably the vector size of the default OpenAI
        # embedding model -- confirm if the embedding model is changed.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
        deletion_protection="disabled",
    )

# Embedding function (OpenAI) shared by every upload.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
# Gradio Blocks app with PDF uploader and table for logs
def process_pdf(file):
    """Extract text from an uploaded PDF, store its chunks in Pinecone, and log the run.

    Args:
        file: Value delivered by the upload component. Either an object whose
            ``.name`` attribute is the path of the uploaded file, or a plain
            path string. ``None`` when nothing has been uploaded.

    Returns:
        A ``(status_message, logs)`` tuple. ``logs`` is a one-row
        ``pandas.DataFrame`` with the file name, size in KB, number of chunks
        and timestamp; it is an empty frame with the same columns when no
        file was provided.
    """
    log_columns = ["File Name", "File Size (KB)", "Number of Chunks", "Timestamp"]

    # Clicking the button before uploading anything passes None.
    if file is None:
        return "Please upload a PDF file first.", pd.DataFrame(columns=log_columns)

    # Accept both a file-like object (path in `.name`) and a bare path string.
    path = getattr(file, "name", file)

    # Extract text from PDF using pdfplumber.
    with pdfplumber.open(path) as pdf:
        # extract_text() may return None for pages without a text layer, so
        # fall back to ""; joining with a newline keeps the last word of one
        # page from fusing with the first word of the next.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    # Split text using RecursiveCharacterTextSplitter.
    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    docs = splitter.split_documents([Document(page_content=text)])

    # Imported locally under a distinct name: at module level `Pinecone` is
    # rebound to the gRPC client, which is not the LangChain vector store and
    # has no `from_existing_index` constructor.
    from langchain.vectorstores import Pinecone as PineconeVectorStore

    # Add documents to the Pinecone vector store.
    vectorstore = PineconeVectorStore.from_existing_index(index_name, embeddings)
    vectorstore.add_documents(docs)

    # One log row describing this upload.
    df_logs = pd.DataFrame(
        {
            "File Name": [path],
            "File Size (KB)": [os.path.getsize(path) / 1024],
            "Number of Chunks": [len(docs)],
            "Timestamp": [time.strftime("%Y-%m-%d %H:%M:%S")],
        },
        columns=log_columns,
    )
    return "PDF processed successfully!", df_logs
# Gradio interface: one row with two columns -- upload controls in the first,
# status text and the log table in the second.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")
    with gr.Row():
        with gr.Column():
            # No explicit `type`: "file" is only accepted by Gradio 3.x and is
            # rejected by 4.x. The component default works on both and still
            # hands the callback a value whose `.name` is the uploaded path.
            pdf_input = gr.File(label="Upload PDF")
            process_button = gr.Button("Process PDF")
        with gr.Column():
            output_text = gr.Textbox(label="Status", interactive=False)
            log_table = gr.DataFrame(label="Logs", interactive=False)

    # Define action on button click (registered inside the Blocks context).
    process_button.click(
        process_pdf,
        inputs=pdf_input,
        outputs=[output_text, log_table],
    )

# Launch the Gradio app
demo.launch()
|