AreejMehboob committed on
Commit
49d41a8
·
verified ·
1 Parent(s): 0f51e34

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +183 -134
src/streamlit_app.py CHANGED
@@ -220,11 +220,31 @@ if 'demo_results' not in st.session_state:
220
  if 'demo_selected_methods' not in st.session_state:
221
  st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
222
 
223
- # Tesla demo document path (adjust as needed)
224
- TESLA_DOC_PATH = r"C:\Users\Areej\Desktop\get-tables-fastapi\tesla_docs_28-41 (1)-9-14.pdf"
225
- OUTPUT_BASE_PATH = r"C:\Users\Areej\Desktop\get-tables-fastapi\output"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  def show_home_page():
 
 
 
228
  # Header
229
  st.markdown("""
230
  <div class="main-header">
@@ -235,19 +255,44 @@ def show_home_page():
235
  </div>
236
  """, unsafe_allow_html=True)
237
 
 
 
 
 
238
  # Main buttons
239
  col1, col2, col3 = st.columns([1, 2, 1])
240
  with col2:
241
- col_btn1, col_btn2 = st.columns(2)
242
- with col_btn1:
243
- if st.button("πŸ“„ Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
244
- st.session_state.page = 'upload'
245
- st.rerun()
246
-
247
- with col_btn2:
248
- if st.button("⚑ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
249
- st.session_state.page = 'demo_setup'
250
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  # Features section
253
  st.markdown("---")
@@ -291,14 +336,14 @@ def show_upload_page():
291
  st.markdown("**Or specify file path:**")
292
  input_file_path = st.text_input(
293
  "Input File Path",
294
- placeholder="C:\\path\\to\\your\\document.pdf",
295
- help="Enter the full path to your PDF file"
296
  )
297
 
298
  # Output directory with show/hide functionality
299
  output_dir = st.text_input(
300
  "Output Directory",
301
- placeholder="C:\\path\\to\\output\\folder",
302
  help="Directory where extracted tables will be saved",
303
  type="password" if not st.session_state.show_output_dir else "default"
304
  )
@@ -338,25 +383,32 @@ def show_demo_setup_page():
338
  st.markdown("## ⚑ Tesla 10K Demo Setup")
339
  st.markdown("*Configure extraction methods for Tesla's 10K document processing*")
340
 
 
 
 
341
  # Document info
342
  st.markdown("### πŸ“„ Document Information")
343
- st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")
 
 
 
 
344
 
345
- # Extraction method selection (removed output directory section completely)
346
  st.markdown("### πŸ”§ Select Extraction Methods")
347
  col1, col2, col3 = st.columns(3)
348
 
349
  with col1:
350
  docling = st.checkbox("Docling",
351
- value=st.session_state.demo_selected_methods['docling'],
352
  help="Advanced document processing")
353
  with col2:
354
  llamaparse = st.checkbox("LlamaParse",
355
- value=st.session_state.demo_selected_methods['llamaparse'],
356
  help="AI-powered parsing")
357
  with col3:
358
  unstructured = st.checkbox("Unstructured",
359
- value=st.session_state.demo_selected_methods['unstructured'],
360
  help="General purpose extraction")
361
 
362
  # Update session state
@@ -369,13 +421,33 @@ def show_demo_setup_page():
369
  # Process button
370
  col1, col2 = st.columns([2, 1])
371
  with col1:
372
- if st.button("πŸš€ Process Tesla Document", type="primary"):
373
- if docling or llamaparse or unstructured:
374
- st.session_state.page = 'demo'
375
- st.session_state.processing = True
376
- st.rerun()
377
- else:
378
- st.error("Please select at least one extraction method.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
  with col2:
381
  if st.button("← Back to Home"):
@@ -428,7 +500,7 @@ def show_processing_demo():
428
 
429
  method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
430
 
431
- time.sleep(0.33)
432
 
433
  # Show completion
434
  st.markdown("""
@@ -442,37 +514,18 @@ def show_processing_demo():
442
  process_tesla_demo()
443
 
444
  st.session_state.processing = False
445
- time.sleep(2)
446
  st.rerun()
447
 
448
  def process_tesla_demo():
449
  """Process Tesla demo document using selected extraction methods"""
450
  try:
451
- # Create output directory for demo (using the base path)
452
- demo_output_dir = os.path.join(OUTPUT_BASE_PATH, "tesla_demo")
453
-
454
- # Prepare the request data for selected methods only
455
- data = {
456
- 'input_file_path': TESLA_DOC_PATH,
457
- 'output_dir': demo_output_dir,
458
- 'docling': st.session_state.demo_selected_methods['docling'],
459
- 'llamaparse': st.session_state.demo_selected_methods['llamaparse'],
460
- 'unstructured': st.session_state.demo_selected_methods['unstructured']
461
- }
462
-
463
- # Make request to FastAPI endpoint (uncomment when ready)
464
- # response = requests.post('http://localhost:8000/extract', data=data)
465
- # if response.status_code == 200:
466
- # st.session_state.demo_results = response.json()
467
-
468
  # For demo purposes, simulate successful processing for selected methods only
469
  results = {}
470
- if st.session_state.demo_selected_methods['docling']:
471
- results['docling'] = {'status': 'success', 'total_tables': 5}
472
- if st.session_state.demo_selected_methods['llamaparse']:
473
- results['llamaparse'] = {'status': 'success', 'total_tables': 3}
474
- if st.session_state.demo_selected_methods['unstructured']:
475
- results['unstructured'] = {'status': 'success', 'total_tables': 4}
476
 
477
  st.session_state.demo_results = {'results': results}
478
 
@@ -481,31 +534,29 @@ def process_tesla_demo():
481
 
482
  def count_html_files(directory):
483
  """Count only HTML files in directory"""
484
- if not os.path.exists(directory):
485
  return 0
486
 
487
- html_files = glob.glob(os.path.join(directory, "*.html"))
488
- html_files.extend(glob.glob(os.path.join(directory, "**", "*.html"), recursive=True))
489
  return len(html_files)
490
 
491
  def get_excel_files(directory):
492
  """Get all Excel files from directory"""
493
- if not os.path.exists(directory):
494
  return []
495
 
496
- excel_files = glob.glob(os.path.join(directory, "*.xlsx"))
497
- excel_files.extend(glob.glob(os.path.join(directory, "*.xls")))
498
- excel_files.extend(glob.glob(os.path.join(directory, "*.csv")))
499
- excel_files.extend(glob.glob(os.path.join(directory, "**", "*.xlsx"), recursive=True))
500
- excel_files.extend(glob.glob(os.path.join(directory, "**", "*.xls"), recursive=True))
501
  return excel_files
502
 
503
  def get_file_info(file_path):
504
  """Get file information including size and modification time"""
505
- if not os.path.exists(file_path):
506
  return {"size": 0, "modified": "Unknown"}
507
 
508
- stat = os.stat(file_path)
509
  size_kb = stat.st_size / 1024
510
  modified = datetime.fromtimestamp(stat.st_mtime)
511
 
@@ -517,13 +568,18 @@ def get_file_info(file_path):
517
  def show_demo_results():
518
  st.markdown("## πŸ“Š Tesla 10K Processing Results")
519
 
 
 
 
520
  # Document info
521
  col1, col2 = st.columns([2, 1])
522
  with col1:
523
- st.markdown("### πŸ“„ tesla_docs_28-41 (1)-9-14.pdf")
524
  st.markdown("**Status:** βœ… Complete")
525
- processed_methods = [method.title() for method, selected in st.session_state.demo_selected_methods.items() if selected]
526
- st.markdown(f"**Processed with:** {', '.join(processed_methods)}")
 
 
527
 
528
  with col2:
529
  if st.button("πŸ”„ Reset"):
@@ -535,42 +591,41 @@ def show_demo_results():
535
  st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
536
  st.rerun()
537
 
538
- # Method selection tabs - only show selected methods
539
- available_methods = [method for method, selected in st.session_state.demo_selected_methods.items() if selected]
540
 
541
- if len(available_methods) > 1:
542
- st.markdown("### πŸ”§ Select Extraction Method to View")
543
-
544
- method_labels = {
545
- 'docling': 'πŸ”§ Docling',
546
- 'llamaparse': 'πŸ¦™ LlamaParse',
547
- 'unstructured': 'πŸ“Š Unstructured'
548
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
 
550
- # Create columns based on number of available methods
551
- cols = st.columns(len(available_methods))
 
552
 
553
- for i, method in enumerate(available_methods):
554
- with cols[i]:
555
- # Show HTML file count for each method using the same logic as show_html_tables
556
- method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)
557
- html_files = []
558
- if os.path.exists(method_output_dir):
559
- html_files = glob.glob(os.path.join(method_output_dir, "**", "*.html"), recursive=True)
560
- html_files = list(set(html_files))
561
- html_count = len(html_files)
562
- button_label = f"{method_labels[method]} ({html_count} HTML files)"
563
-
564
- if st.button(button_label, key=f"tab_{method}", use_container_width=True):
565
- st.session_state.selected_method = method
566
-
567
- # Default to first available method if no method selected
568
- if st.session_state.selected_method is None or st.session_state.selected_method not in available_methods:
569
- st.session_state.selected_method = available_methods[0] if available_methods else None
570
-
571
- # Show results for selected method
572
- if st.session_state.selected_method:
573
- show_method_results(st.session_state.selected_method)
574
 
575
  def show_method_results(method):
576
  st.markdown(f"### πŸ“‹ Results from {method.title()}")
@@ -588,24 +643,22 @@ def show_method_results(method):
588
 
589
  def show_html_tables(method):
590
  """Display HTML tables from the method's output directory"""
591
- method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)
592
 
593
  # Get actual HTML files from directory
594
  html_files = []
595
- if os.path.exists(method_output_dir):
596
- # Use only the recursive glob, which includes the top-level directory
597
- html_files = glob.glob(os.path.join(method_output_dir, "**", "*.html"), recursive=True)
598
- # Remove duplicates just in case
599
- html_files = list(set(html_files))
600
 
601
- # Sort files by table number if possible (e.g., table_1, table_2, ...)
602
  import re
603
  def extract_table_number(filename):
604
- match = re.search(r"table[_-](\d+)", filename, re.IGNORECASE)
605
  if match:
606
  return int(match.group(1))
607
- return float('inf') # Put files without a number at the end
608
- html_files.sort(key=lambda f: extract_table_number(os.path.basename(f)))
 
609
 
610
  if html_files:
611
  st.markdown(f"**Found {len(html_files)} HTML table(s):**")
@@ -617,7 +670,7 @@ def show_html_tables(method):
617
  st.markdown(f"""
618
  <div class="table-header">
619
  <h4 style="color: #495057;">πŸ“‹ Table {i+1}</h4>
620
- <small style="color: #6c757d;">File: {os.path.basename(html_file)}</small>
621
  </div>
622
  """, unsafe_allow_html=True)
623
 
@@ -657,7 +710,7 @@ def show_html_tables(method):
657
 
658
  def show_excel_files(method):
659
  """Display Excel files from the method's output directory"""
660
- method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)
661
 
662
  # Get actual Excel files from directory
663
  excel_files = get_excel_files(method_output_dir)
@@ -668,7 +721,7 @@ def show_excel_files(method):
668
  for i, excel_file in enumerate(excel_files):
669
  # Get file info
670
  file_info = get_file_info(excel_file)
671
- file_name = os.path.basename(excel_file)
672
 
673
  # File info card
674
  st.markdown(f"""
@@ -683,40 +736,35 @@ def show_excel_files(method):
683
 
684
  # Try to read and display Excel file preview
685
  try:
686
- df = pd.read_excel(excel_file)
 
 
 
 
687
  if not df.empty:
688
  st.markdown(f"**Preview (first 5 rows):**")
689
  st.dataframe(df.head(), use_container_width=True)
690
  st.markdown(f"**Dimensions:** {df.shape[0]} Γ— {df.shape[1]}")
691
  else:
692
- st.info("Excel file is empty")
 
693
  except Exception as e:
694
- # Try reading as CSV if Excel reading fails
695
- try:
696
- df = pd.read_csv(excel_file)
697
- if not df.empty:
698
- st.markdown(f"**Preview (first 5 rows, read as CSV):**")
699
- st.dataframe(df.head(), use_container_width=True)
700
- st.markdown(f"**Dimensions:** {df.shape[0]} Γ— {df.shape[1]}")
701
- else:
702
- st.info("CSV file is empty")
703
- except Exception as e2:
704
- st.warning(f"Could not preview file as Excel or CSV: {e2}")
705
 
706
  # Download button for Excel file
707
  try:
708
  with open(excel_file, 'rb') as f:
709
- excel_data = f.read()
710
  st.download_button(
711
  label=f"⬇️ Download",
712
- data=excel_data,
713
  file_name=file_name,
714
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
715
  key=f"download_excel_{method}_{i}",
716
  use_container_width=True
717
  )
718
  except Exception as e:
719
- st.error(f"Error reading Excel file for download: {e}")
720
 
721
  if i < len(excel_files) - 1:
722
  st.markdown("---")
@@ -765,11 +813,10 @@ def process_document(file_path, output_dir, docling, llamaparse, unstructured):
765
  st.json(method_result)
766
 
767
  # List files in output directory
768
- method_dir = os.path.join(output_dir, selected_method)
769
 
770
  # HTML files
771
- html_files = glob.glob(os.path.join(method_dir, "*.html"))
772
- html_files.extend(glob.glob(os.path.join(method_dir, "**", "*.html"), recursive=True))
773
 
774
  # Excel files
775
  excel_files = get_excel_files(method_dir)
@@ -780,12 +827,12 @@ def process_document(file_path, output_dir, docling, llamaparse, unstructured):
780
  if html_files:
781
  st.markdown("**HTML Files:**")
782
  for html_file in html_files:
783
- st.markdown(f"- {os.path.basename(html_file)}")
784
 
785
  if excel_files:
786
  st.markdown("**Excel Files:**")
787
  for excel_file in excel_files:
788
- st.markdown(f"- {os.path.basename(excel_file)}")
789
  else:
790
  st.warning("No successful extractions found.")
791
 
@@ -811,7 +858,9 @@ def main():
811
  st.rerun()
812
  with nav_col2:
813
  st.button("History", use_container_width=True)
 
814
  st.markdown("---")
 
815
  # Route to appropriate page
816
  if st.session_state.page == 'home':
817
  show_home_page()
 
220
  if 'demo_selected_methods' not in st.session_state:
221
  st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
222
 
223
+ # Get the current directory (src) and set output path
224
+ CURRENT_DIR = Path(__file__).parent
225
+ OUTPUT_BASE_PATH = CURRENT_DIR / "output"
226
+
227
+ # Create output directory if it doesn't exist
228
+ OUTPUT_BASE_PATH.mkdir(exist_ok=True)
229
+
230
+ def check_existing_results():
231
+ """Check if there are existing results in the output directory"""
232
+ existing_methods = []
233
+
234
+ for method in ['docling', 'llamaparse', 'unstructured']:
235
+ method_dir = OUTPUT_BASE_PATH / method
236
+ if method_dir.exists():
237
+ # Check for HTML files
238
+ html_files = list(method_dir.glob("**/*.html"))
239
+ if html_files:
240
+ existing_methods.append(method)
241
+
242
+ return existing_methods
243
 
244
  def show_home_page():
245
+ # Check for existing results
246
+ existing_methods = check_existing_results()
247
+
248
  # Header
249
  st.markdown("""
250
  <div class="main-header">
 
255
  </div>
256
  """, unsafe_allow_html=True)
257
 
258
+ # Show existing results notification if any
259
+ if existing_methods:
260
+ st.info(f"📁 Found existing results from: {', '.join([m.title() for m in existing_methods])}. Click 'View Results' to see them.")
261
+
262
  # Main buttons
263
  col1, col2, col3 = st.columns([1, 2, 1])
264
  with col2:
265
+ if existing_methods:
266
+ # Show three buttons if results exist
267
+ col_btn1, col_btn2, col_btn3 = st.columns(3)
268
+ with col_btn1:
269
+ if st.button("📄 Upload PDF", key="upload_btn", help="Upload your own PDF document"):
270
+ st.session_state.page = 'upload'
271
+ st.rerun()
272
+
273
+ with col_btn2:
274
+ if st.button("⚡ Try Demo", key="demo_btn", help="Try with Tesla's 10K form"):
275
+ st.session_state.page = 'demo_setup'
276
+ st.rerun()
277
+
278
+ with col_btn3:
279
+ if st.button("👁️ View Results", key="view_results_btn", help="View existing results"):
280
+ st.session_state.page = 'demo'
281
+ st.session_state.processing = False
282
+ st.session_state.demo_selected_methods = {method: method in existing_methods for method in ['docling', 'llamaparse', 'unstructured']}
283
+ st.rerun()
284
+ else:
285
+ # Show two buttons if no results exist
286
+ col_btn1, col_btn2 = st.columns(2)
287
+ with col_btn1:
288
+ if st.button("📄 Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
289
+ st.session_state.page = 'upload'
290
+ st.rerun()
291
+
292
+ with col_btn2:
293
+ if st.button("⚡ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
294
+ st.session_state.page = 'demo_setup'
295
+ st.rerun()
296
 
297
  # Features section
298
  st.markdown("---")
 
336
  st.markdown("**Or specify file path:**")
337
  input_file_path = st.text_input(
338
  "Input File Path",
339
+ placeholder="path/to/your/document.pdf",
340
+ help="Enter the path to your PDF file"
341
  )
342
 
343
  # Output directory with show/hide functionality
344
  output_dir = st.text_input(
345
  "Output Directory",
346
+ value=str(OUTPUT_BASE_PATH),
347
  help="Directory where extracted tables will be saved",
348
  type="password" if not st.session_state.show_output_dir else "default"
349
  )
 
383
  st.markdown("## ⚡ Tesla 10K Demo Setup")
384
  st.markdown("*Configure extraction methods for Tesla's 10K document processing*")
385
 
386
+ # Check for existing results
387
+ existing_methods = check_existing_results()
388
+
389
  # Document info
390
  st.markdown("### 📄 Document Information")
391
+ if existing_methods:
392
+ st.success(f"**Found existing results from:** {', '.join([m.title() for m in existing_methods])}")
393
+ st.info("**Note:** You can view existing results or process with different methods")
394
+ else:
395
+ st.info("**Document:** Tesla 10K form - Financial tables extraction demo")
396
 
397
+ # Extraction method selection
398
  st.markdown("### 🔧 Select Extraction Methods")
399
  col1, col2, col3 = st.columns(3)
400
 
401
  with col1:
402
  docling = st.checkbox("Docling",
403
+ value=st.session_state.demo_selected_methods.get('docling', True),
404
  help="Advanced document processing")
405
  with col2:
406
  llamaparse = st.checkbox("LlamaParse",
407
+ value=st.session_state.demo_selected_methods.get('llamaparse', False),
408
  help="AI-powered parsing")
409
  with col3:
410
  unstructured = st.checkbox("Unstructured",
411
+ value=st.session_state.demo_selected_methods.get('unstructured', False),
412
  help="General purpose extraction")
413
 
414
  # Update session state
 
421
  # Process button
422
  col1, col2 = st.columns([2, 1])
423
  with col1:
424
+ if existing_methods:
425
+ # Show two buttons if results exist
426
+ col_btn1, col_btn2 = st.columns(2)
427
+ with col_btn1:
428
+ if st.button("👁️ View Existing Results", type="secondary"):
429
+ st.session_state.page = 'demo'
430
+ st.session_state.processing = False
431
+ st.session_state.demo_selected_methods = {method: method in existing_methods for method in ['docling', 'llamaparse', 'unstructured']}
432
+ st.rerun()
433
+
434
+ with col_btn2:
435
+ if st.button("🚀 Process New", type="primary"):
436
+ if docling or llamaparse or unstructured:
437
+ st.session_state.page = 'demo'
438
+ st.session_state.processing = True
439
+ st.rerun()
440
+ else:
441
+ st.error("Please select at least one extraction method.")
442
+ else:
443
+ # Show single process button if no results exist
444
+ if st.button("🚀 Process Tesla Document", type="primary"):
445
+ if docling or llamaparse or unstructured:
446
+ st.session_state.page = 'demo'
447
+ st.session_state.processing = True
448
+ st.rerun()
449
+ else:
450
+ st.error("Please select at least one extraction method.")
451
 
452
  with col2:
453
  if st.button("← Back to Home"):
 
500
 
501
  method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
502
 
503
+ time.sleep(0.1) # Reduced sleep time for faster demo
504
 
505
  # Show completion
506
  st.markdown("""
 
514
  process_tesla_demo()
515
 
516
  st.session_state.processing = False
517
+ time.sleep(1)
518
  st.rerun()
519
 
520
  def process_tesla_demo():
521
  """Process Tesla demo document using selected extraction methods"""
522
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  # For demo purposes, simulate successful processing for selected methods only
524
  results = {}
525
+ selected_methods = [method for method, selected in st.session_state.demo_selected_methods.items() if selected]
526
+
527
+ for method in selected_methods:
528
+ results[method] = {'status': 'success', 'total_tables': 3 + hash(method) % 3} # Simulate different table counts
 
 
529
 
530
  st.session_state.demo_results = {'results': results}
531
 
 
534
 
535
  def count_html_files(directory):
536
  """Count only HTML files in directory"""
537
+ if not directory.exists():
538
  return 0
539
 
540
+ html_files = list(directory.glob("**/*.html"))
 
541
  return len(html_files)
542
 
543
  def get_excel_files(directory):
544
  """Get all Excel files from directory"""
545
+ if not directory.exists():
546
  return []
547
 
548
+ excel_files = []
549
+ for ext in ['*.xlsx', '*.xls', '*.csv']:
550
+ excel_files.extend(directory.glob(f"**/{ext}"))
551
+
 
552
  return excel_files
553
 
554
  def get_file_info(file_path):
555
  """Get file information including size and modification time"""
556
+ if not file_path.exists():
557
  return {"size": 0, "modified": "Unknown"}
558
 
559
+ stat = file_path.stat()
560
  size_kb = stat.st_size / 1024
561
  modified = datetime.fromtimestamp(stat.st_mtime)
562
 
 
568
  def show_demo_results():
569
  st.markdown("## 📊 Tesla 10K Processing Results")
570
 
571
+ # Check for existing results
572
+ existing_methods = check_existing_results()
573
+
574
  # Document info
575
  col1, col2 = st.columns([2, 1])
576
  with col1:
577
+ st.markdown("### 📄 Tesla 10K Document")
578
  st.markdown("**Status:** ✅ Complete")
579
+ if existing_methods:
580
+ st.markdown(f"**Available results:** {', '.join([m.title() for m in existing_methods])}")
581
+ else:
582
+ st.warning("No results found in output directory")
583
 
584
  with col2:
585
  if st.button("🔄 Reset"):
 
591
  st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
592
  st.rerun()
593
 
594
+ # Method selection tabs - only show available methods
595
+ available_methods = existing_methods
596
 
597
+ if available_methods:
598
+ if len(available_methods) > 1:
599
+ st.markdown("### 🔧 Select Extraction Method to View")
600
+
601
+ method_labels = {
602
+ 'docling': '🔧 Docling',
603
+ 'llamaparse': '🦙 LlamaParse',
604
+ 'unstructured': '📊 Unstructured'
605
+ }
606
+
607
+ # Create columns based on number of available methods
608
+ cols = st.columns(len(available_methods))
609
+
610
+ for i, method in enumerate(available_methods):
611
+ with cols[i]:
612
+ # Show HTML file count for each method
613
+ method_output_dir = OUTPUT_BASE_PATH / method
614
+ html_count = count_html_files(method_output_dir)
615
+ button_label = f"{method_labels[method]} ({html_count} HTML files)"
616
+
617
+ if st.button(button_label, key=f"tab_{method}", use_container_width=True):
618
+ st.session_state.selected_method = method
619
 
620
+ # Default to first available method if no method selected
621
+ if st.session_state.selected_method is None or st.session_state.selected_method not in available_methods:
622
+ st.session_state.selected_method = available_methods[0]
623
 
624
+ # Show results for selected method
625
+ if st.session_state.selected_method:
626
+ show_method_results(st.session_state.selected_method)
627
+ else:
628
+ st.info("No results found. Please process a document first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
 
630
  def show_method_results(method):
631
  st.markdown(f"### 📋 Results from {method.title()}")
 
643
 
644
  def show_html_tables(method):
645
  """Display HTML tables from the method's output directory"""
646
+ method_output_dir = OUTPUT_BASE_PATH / method
647
 
648
  # Get actual HTML files from directory
649
  html_files = []
650
+ if method_output_dir.exists():
651
+ html_files = list(method_output_dir.glob("**/*.html"))
 
 
 
652
 
653
+ # Sort files by table number if possible
654
  import re
655
  def extract_table_number(filename):
656
+ match = re.search(r"table[_-](\d+)", filename.name, re.IGNORECASE)
657
  if match:
658
  return int(match.group(1))
659
+ return float('inf')
660
+
661
+ html_files.sort(key=extract_table_number)
662
 
663
  if html_files:
664
  st.markdown(f"**Found {len(html_files)} HTML table(s):**")
 
670
  st.markdown(f"""
671
  <div class="table-header">
672
  <h4 style="color: #495057;">πŸ“‹ Table {i+1}</h4>
673
+ <small style="color: #6c757d;">File: {html_file.name}</small>
674
  </div>
675
  """, unsafe_allow_html=True)
676
 
 
710
 
711
  def show_excel_files(method):
712
  """Display Excel files from the method's output directory"""
713
+ method_output_dir = OUTPUT_BASE_PATH / method
714
 
715
  # Get actual Excel files from directory
716
  excel_files = get_excel_files(method_output_dir)
 
721
  for i, excel_file in enumerate(excel_files):
722
  # Get file info
723
  file_info = get_file_info(excel_file)
724
+ file_name = excel_file.name
725
 
726
  # File info card
727
  st.markdown(f"""
 
736
 
737
  # Try to read and display Excel file preview
738
  try:
739
+ if excel_file.suffix.lower() in ['.xlsx', '.xls']:
740
+ df = pd.read_excel(excel_file)
741
+ else:
742
+ df = pd.read_csv(excel_file)
743
+
744
  if not df.empty:
745
  st.markdown(f"**Preview (first 5 rows):**")
746
  st.dataframe(df.head(), use_container_width=True)
747
  st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")
748
  else:
749
+ st.info("File is empty")
750
+
751
  except Exception as e:
752
+ st.warning(f"Could not preview file: {e}")
 
 
 
 
 
 
 
 
 
 
753
 
754
  # Download button for Excel file
755
  try:
756
  with open(excel_file, 'rb') as f:
757
+ file_data = f.read()
758
  st.download_button(
759
  label=f"⬇️ Download",
760
+ data=file_data,
761
  file_name=file_name,
762
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
763
  key=f"download_excel_{method}_{i}",
764
  use_container_width=True
765
  )
766
  except Exception as e:
767
+ st.error(f"Error reading file for download: {e}")
768
 
769
  if i < len(excel_files) - 1:
770
  st.markdown("---")
 
813
  st.json(method_result)
814
 
815
  # List files in output directory
816
+ method_dir = Path(output_dir) / selected_method
817
 
818
  # HTML files
819
+ html_files = list(method_dir.glob("**/*.html"))
 
820
 
821
  # Excel files
822
  excel_files = get_excel_files(method_dir)
 
827
  if html_files:
828
  st.markdown("**HTML Files:**")
829
  for html_file in html_files:
830
+ st.markdown(f"- {html_file.name}")
831
 
832
  if excel_files:
833
  st.markdown("**Excel Files:**")
834
  for excel_file in excel_files:
835
+ st.markdown(f"- {excel_file.name}")
836
  else:
837
  st.warning("No successful extractions found.")
838
 
 
858
  st.rerun()
859
  with nav_col2:
860
  st.button("History", use_container_width=True)
861
+
862
  st.markdown("---")
863
+
864
  # Route to appropriate page
865
  if st.session_state.page == 'home':
866
  show_home_page()