Pijush2023 committed on
Commit
5e64098
·
verified ·
1 Parent(s): 92489c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -52
app.py CHANGED
@@ -32,15 +32,12 @@ vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
32
 
33
  # Create a global list to store uploaded document records
34
  uploaded_documents = []
35
- from datetime import datetime
36
 
37
-
38
- from langchain_core.documents import Document
39
  # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
40
- def process_pdf(pdf_file,uploaded_documents):
41
  if pdf_file is None:
42
  return uploaded_documents, "No PDF file uploaded."
43
- with pdfplumber.open(pdf_file) as pdf:
44
  all_text = ""
45
  for page in pdf.pages:
46
  all_text += page.extract_text()
@@ -65,63 +62,31 @@ def process_pdf(pdf_file,uploaded_documents):
65
  }
66
 
67
  # Add the record to the global list
68
- uploaded_documents.append(document_record)
69
 
70
  # Convert the list of dictionaries into a list of lists for the dataframe
71
  table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
72
 
73
  return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
74
- # Gradio Blocks app with PDF uploader and table for logs
75
- def process_pdf(file):
76
- # Extract text from PDF using pdfplumber
77
- with pdfplumber.open(file.name) as pdf:
78
- text = ""
79
- for page in pdf.pages:
80
- text += page.extract_text()
81
-
82
- # Split text using RecursiveCharacterTextSplitter
83
- documents = [Document(page_content=text)]
84
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
85
- docs = text_splitter.split_documents(documents)
86
-
87
- # Add documents to Pinecone Vector Store
88
- vectorstore = PineconeVectorStore(index_name, embeddings)
89
- vectorstore.add_documents(docs)
90
-
91
- # Prepare log data
92
- log_data = {
93
- "File Name": [file.name],
94
- "File Size (KB)": [os.path.getsize(file.name) / 1024],
95
- "Number of Chunks": [len(docs)],
96
- "Timestamp": [time.strftime("%Y-%m-%d %H:%M:%S")]
97
- }
98
-
99
- # Create a DataFrame for logs
100
- df_logs = pd.DataFrame(log_data)
101
-
102
- return "PDF processed successfully!", df_logs
103
 
104
  # Gradio Interface
105
  with gr.Blocks() as demo:
106
  gr.Markdown("# PDF Uploader to Pinecone with Logs")
107
 
108
- # File upload component
109
  with gr.Column():
110
- file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
111
- # Button to trigger processing
112
- process_button = gr.Button("Process PDF and Upload")
113
-
114
- # Dataframe to display uploaded document records
115
- document_table = gr.Dataframe(headers=["Document Name", "Upload Time", "Chunks", "Pinecone Index"], interactive=False)
116
-
117
-
118
-
119
- # Output textbox for results
120
- output_textbox = gr.Textbox(label="Result")
121
-
122
- # Define button click action
123
- # process_button.click(fn=process_pdf, inputs=file_input, outputs=output_textbox)
124
- process_button.click(fn=process_pdf, inputs=[file_input, gr.State([])], outputs=[document_table, output_textbox])
125
-
126
  demo.queue()
127
  demo.launch(show_error=True)
 
32
 
33
  # Create a global list to store uploaded document records
34
  uploaded_documents = []
 
35
 
 
 
36
  # Function to process PDF, extract text, split it into chunks, and upload to the vector DB
37
+ def process_pdf(pdf_file, uploaded_documents):
38
  if pdf_file is None:
39
  return uploaded_documents, "No PDF file uploaded."
40
+ with pdfplumber.open(pdf_file.name) as pdf:
41
  all_text = ""
42
  for page in pdf.pages:
43
  all_text += page.extract_text()
 
62
  }
63
 
64
  # Add the record to the global list
65
+ uploaded_documents.append(document_record)
66
 
67
  # Convert the list of dictionaries into a list of lists for the dataframe
68
  table_data = [[doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]] for doc in uploaded_documents]
69
 
70
  return table_data, f"Uploaded {len(chunks)} chunks to the vector database."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  # Gradio Interface
73
  with gr.Blocks() as demo:
74
  gr.Markdown("# PDF Uploader to Pinecone with Logs")
75
 
76
+ # File upload component
77
  with gr.Column():
78
+ file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
79
+ # Button to trigger processing
80
+ process_button = gr.Button("Process PDF and Upload")
81
+
82
+ # Dataframe to display uploaded document records
83
+ document_table = gr.Dataframe(headers=["Document Name", "Upload Time", "Chunks", "Pinecone Index"], interactive=False)
84
+
85
+ # Output textbox for results
86
+ output_textbox = gr.Textbox(label="Result")
87
+
88
+ # Define button click action
89
+ process_button.click(fn=process_pdf, inputs=[file_input, gr.State([])], outputs=[document_table, output_textbox])
90
+
 
 
 
91
  demo.queue()
92
  demo.launch(show_error=True)