Shreyas094 committed on
Commit
cca553c
1 Parent(s): ca1c10f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -14
app.py CHANGED
@@ -124,10 +124,21 @@ def update_vectors(files, parser):
124
  for file in files:
125
  logging.info(f"Processing file: {file.name}")
126
  try:
127
- if file.name.lower().endswith(('.xlsx', '.xls', '.docx')):
 
 
 
 
128
  data = load_office_document(file)
 
 
 
 
 
 
129
  else:
130
- data = load_document(file, parser)
 
131
 
132
  if not data:
133
  logging.warning(f"No chunks loaded from {file.name}")
@@ -150,19 +161,30 @@ def update_vectors(files, parser):
150
  return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
151
 
152
  try:
153
- # Create or update the office documents vector store
154
- if os.path.exists("office_faiss_database"):
155
- logging.info("Updating existing office FAISS database")
156
- office_database = FAISS.load_local("office_faiss_database", embed, allow_dangerous_deserialization=True)
157
- office_database.add_documents(all_data)
158
- else:
159
- logging.info("Creating new office FAISS database")
160
- office_database = FAISS.from_documents(all_data, embed)
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- office_database.save_local("office_faiss_database")
163
- logging.info("Office FAISS database saved")
164
  except Exception as e:
165
- logging.error(f"Error updating office FAISS database: {str(e)}")
166
  return f"Error updating vector store: {str(e)}", display_documents()
167
 
168
  # Save the updated list of documents
@@ -667,7 +689,7 @@ with demo:
667
  gr.Markdown("## Upload and Manage PDF Documents")
668
  with gr.Row():
669
  file_input = gr.Files(label="Upload your documents", file_types=[".pdf", ".docx", ".xlsx", ".xls"])
670
- parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse"], label="Select PDF Parser", value="llamaparse")
671
  update_button = gr.Button("Upload Document")
672
  refresh_button = gr.Button("Refresh Document List")
673
 
 
124
  for file in files:
125
  logging.info(f"Processing file: {file.name}")
126
  try:
127
+ file_extension = os.path.splitext(file.name)[1].lower()
128
+
129
+ if file_extension in ['.xlsx', '.xls', '.docx']:
130
+ if parser != "office":
131
+ logging.warning(f"Using office parser for {file.name} regardless of selected parser")
132
  data = load_office_document(file)
133
+ elif file_extension == '.pdf':
134
+ if parser == "office":
135
+ logging.warning(f"Cannot use office parser for PDF file {file.name}. Using llamaparse.")
136
+ data = load_document(file, "llamaparse")
137
+ else:
138
+ data = load_document(file, parser)
139
  else:
140
+ logging.warning(f"Unsupported file type: {file_extension}")
141
+ continue
142
 
143
  if not data:
144
  logging.warning(f"No chunks loaded from {file.name}")
 
161
  return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
162
 
163
  try:
164
+ # Update the appropriate vector store based on file type
165
+ pdf_data = [doc for doc in all_data if doc.metadata["source"].lower().endswith('.pdf')]
166
+ office_data = [doc for doc in all_data if not doc.metadata["source"].lower().endswith('.pdf')]
167
+
168
+ if pdf_data:
169
+ if os.path.exists("faiss_database"):
170
+ pdf_database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
171
+ pdf_database.add_documents(pdf_data)
172
+ else:
173
+ pdf_database = FAISS.from_documents(pdf_data, embed)
174
+ pdf_database.save_local("faiss_database")
175
+ logging.info("PDF FAISS database updated and saved")
176
+
177
+ if office_data:
178
+ if os.path.exists("office_faiss_database"):
179
+ office_database = FAISS.load_local("office_faiss_database", embed, allow_dangerous_deserialization=True)
180
+ office_database.add_documents(office_data)
181
+ else:
182
+ office_database = FAISS.from_documents(office_data, embed)
183
+ office_database.save_local("office_faiss_database")
184
+ logging.info("Office FAISS database updated and saved")
185
 
 
 
186
  except Exception as e:
187
+ logging.error(f"Error updating FAISS database: {str(e)}")
188
  return f"Error updating vector store: {str(e)}", display_documents()
189
 
190
  # Save the updated list of documents
 
689
  gr.Markdown("## Upload and Manage PDF Documents")
690
  with gr.Row():
691
  file_input = gr.Files(label="Upload your documents", file_types=[".pdf", ".docx", ".xlsx", ".xls"])
692
+ parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse", "office"], label="Select PDF Parser", value="llamaparse")
693
  update_button = gr.Button("Upload Document")
694
  refresh_button = gr.Button("Refresh Document List")
695