Pijush2023 committed on
Commit
08b4bf1
·
verified ·
1 Parent(s): 3a9b7db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -14
app.py CHANGED
@@ -14,19 +14,63 @@ from langchain_pinecone import PineconeVectorStore
14
 
15
  # OpenAI API key
16
  openai_api_key = os.getenv("OPENAI_API_KEY")
 
 
17
 
18
  # Initialize Pinecone with PineconeGRPC
19
  from pinecone import Pinecone
20
  pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
21
  # Define index name and parameters
22
  index_name = "italy-kg"
 
23
 
24
 
25
- # Embedding using OpenAI
26
- embeddings = OpenAIEmbeddings(api_key=openai_api_key)
27
 
28
- vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # Gradio Blocks app with PDF uploader and table for logs
31
  def process_pdf(file):
32
  # Extract text from PDF using pdfplumber
@@ -61,17 +105,23 @@ def process_pdf(file):
61
  with gr.Blocks() as demo:
62
  gr.Markdown("# PDF Uploader to Pinecone with Logs")
63
 
64
- with gr.Row():
65
  with gr.Column():
66
- pdf_input = gr.File(label="Upload PDF", type="filepath")
67
- process_button = gr.Button("Process PDF")
68
-
69
- with gr.Column():
70
- output_text = gr.Textbox(label="Status", interactive=False)
71
- log_table = gr.DataFrame(label="Logs", interactive=False)
72
 
73
- # Define action on button click
74
- process_button.click(process_pdf, inputs=pdf_input, outputs=[output_text, log_table])
 
 
75
 
76
- # Launch the Gradio app
77
- demo.launch()
 
 
 
 
 
14
 
15
  # OpenAI API key
16
  openai_api_key = os.getenv("OPENAI_API_KEY")
17
+ # Embedding using OpenAI
18
+ embeddings = OpenAIEmbeddings(api_key=openai_api_key)
19
 
20
  # Initialize Pinecone with PineconeGRPC
21
  from pinecone import Pinecone
22
  pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
23
  # Define index name and parameters
24
  index_name = "italy-kg"
25
+ vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
26
 
27
 
 
 
28
 
 
29
 
30
+
31
+
32
+
33
+ # Create a global list to store uploaded document records
34
+ uploaded_documents = []
35
+ from datetime import datetime
36
+
37
+
38
+ from langchain_core.documents import Document
39
+ # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
40
def process_pdf(pdf_file, uploaded_documents):
    """Extract a PDF's text, upload it to the vector store in chunks, and log it.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path or an object
            whose ``name`` attribute holds the path. ``None`` means no file
            was provided.
        uploaded_documents: List of upload-record dicts. The record for this
            upload is appended to it in place.

    Returns:
        A ``(table_rows, status_message)`` tuple. ``table_rows`` is a list of
        ``[document name, upload time, chunk count, index name]`` rows, one
        per record in ``uploaded_documents``; the same shape is returned on
        every path so the table component always receives rows.
    """
    # NOTE(review): the file still defines an older `def process_pdf(file)`
    # further down; being defined later, it replaces this function at import
    # time. Remove or rename it so the UI actually calls this version.
    if uploaded_documents is None:
        uploaded_documents = []

    if pdf_file is None:
        return _upload_records_to_rows(uploaded_documents), "No PDF file uploaded."

    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() can yield None for pages without a text layer;
        # treat those as empty instead of failing on string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Separate pages with a newline so words at page boundaries do not fuse.
    all_text = "\n".join(page_texts)

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    chunks = text_splitter.split_text(all_text)

    # Embed and upload all chunks in a single batched call rather than one
    # request per chunk; skip the call entirely when there is nothing to add.
    if chunks:
        vectorstore.add_documents([Document(page_content=chunk) for chunk in chunks])

    # Update the upload history. The file may arrive as a plain path string
    # or as an object exposing the path via `.name`.
    uploaded_documents.append(
        {
            "Document Name": str(getattr(pdf_file, "name", pdf_file)),
            "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "Chunks": len(chunks),
            "Pinecone Index": index_name,
        }
    )

    return (
        _upload_records_to_rows(uploaded_documents),
        f"Uploaded {len(chunks)} chunks to the vector database.",
    )


def _upload_records_to_rows(records):
    """Convert upload-record dicts into row lists in table column order."""
    return [
        [rec["Document Name"], rec["Upload Time"], rec["Chunks"], rec["Pinecone Index"]]
        for rec in records
    ]
74
  # Gradio Blocks app with PDF uploader and table for logs
75
  def process_pdf(file):
76
  # Extract text from PDF using pdfplumber
 
105
# Gradio UI: upload a PDF, push it to Pinecone, and show the upload history.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Uploader to Pinecone with Logs")

    # Upload control and the button that starts processing.
    with gr.Column():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        process_button = gr.Button("Process PDF and Upload")

    # Table listing one row per uploaded document.
    history_columns = ["Document Name", "Upload Time", "Chunks", "Pinecone Index"]
    document_table = gr.Dataframe(headers=history_columns, interactive=False)

    # Status message returned by the processing function.
    output_textbox = gr.Textbox(label="Result")

    # Per-session upload history handed to process_pdf as its second argument.
    upload_history = gr.State([])

    process_button.click(
        fn=process_pdf,
        inputs=[file_input, upload_history],
        outputs=[document_table, output_textbox],
    )

demo.queue()
demo.launch(show_error=True)