AreejMehboob committed on
Commit
b9e9522
·
verified ·
1 Parent(s): 49d41a8

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +135 -179
src/streamlit_app.py CHANGED
@@ -220,31 +220,16 @@ if 'demo_results' not in st.session_state:
220
  if 'demo_selected_methods' not in st.session_state:
221
  st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
222
 
223
- # Get the current directory (src) and set output path
224
- CURRENT_DIR = Path(__file__).parent
225
- OUTPUT_BASE_PATH = CURRENT_DIR / "output"
226
 
227
- # Create output directory if it doesn't exist
228
- OUTPUT_BASE_PATH.mkdir(exist_ok=True)
229
 
230
- def check_existing_results():
231
- """Check if there are existing results in the output directory"""
232
- existing_methods = []
233
-
234
- for method in ['docling', 'llamaparse', 'unstructured']:
235
- method_dir = OUTPUT_BASE_PATH / method
236
- if method_dir.exists():
237
- # Check for HTML files
238
- html_files = list(method_dir.glob("**/*.html"))
239
- if html_files:
240
- existing_methods.append(method)
241
-
242
- return existing_methods
243
 
244
  def show_home_page():
245
- # Check for existing results
246
- existing_methods = check_existing_results()
247
-
248
  # Header
249
  st.markdown("""
250
  <div class="main-header">
@@ -255,44 +240,19 @@ def show_home_page():
255
  </div>
256
  """, unsafe_allow_html=True)
257
 
258
- # Show existing results notification if any
259
- if existing_methods:
260
- st.info(f"πŸ“ Found existing results from: {', '.join([m.title() for m in existing_methods])}. Click 'View Results' to see them.")
261
-
262
  # Main buttons
263
  col1, col2, col3 = st.columns([1, 2, 1])
264
  with col2:
265
- if existing_methods:
266
- # Show three buttons if results exist
267
- col_btn1, col_btn2, col_btn3 = st.columns(3)
268
- with col_btn1:
269
- if st.button("πŸ“„ Upload PDF", key="upload_btn", help="Upload your own PDF document"):
270
- st.session_state.page = 'upload'
271
- st.rerun()
272
-
273
- with col_btn2:
274
- if st.button("⚑ Try Demo", key="demo_btn", help="Try with Tesla's 10K form"):
275
- st.session_state.page = 'demo_setup'
276
- st.rerun()
277
-
278
- with col_btn3:
279
- if st.button("πŸ‘οΈ View Results", key="view_results_btn", help="View existing results"):
280
- st.session_state.page = 'demo'
281
- st.session_state.processing = False
282
- st.session_state.demo_selected_methods = {method: method in existing_methods for method in ['docling', 'llamaparse', 'unstructured']}
283
- st.rerun()
284
- else:
285
- # Show two buttons if no results exist
286
- col_btn1, col_btn2 = st.columns(2)
287
- with col_btn1:
288
- if st.button("πŸ“„ Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
289
- st.session_state.page = 'upload'
290
- st.rerun()
291
-
292
- with col_btn2:
293
- if st.button("⚑ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
294
- st.session_state.page = 'demo_setup'
295
- st.rerun()
296
 
297
  # Features section
298
  st.markdown("---")
@@ -336,14 +296,14 @@ def show_upload_page():
336
  st.markdown("**Or specify file path:**")
337
  input_file_path = st.text_input(
338
  "Input File Path",
339
- placeholder="path/to/your/document.pdf",
340
- help="Enter the path to your PDF file"
341
  )
342
 
343
  # Output directory with show/hide functionality
344
  output_dir = st.text_input(
345
  "Output Directory",
346
- value=str(OUTPUT_BASE_PATH),
347
  help="Directory where extracted tables will be saved",
348
  type="password" if not st.session_state.show_output_dir else "default"
349
  )
@@ -383,32 +343,25 @@ def show_demo_setup_page():
383
  st.markdown("## ⚑ Tesla 10K Demo Setup")
384
  st.markdown("*Configure extraction methods for Tesla's 10K document processing*")
385
 
386
- # Check for existing results
387
- existing_methods = check_existing_results()
388
-
389
  # Document info
390
  st.markdown("### πŸ“„ Document Information")
391
- if existing_methods:
392
- st.success(f"**Found existing results from:** {', '.join([m.title() for m in existing_methods])}")
393
- st.info("**Note:** You can view existing results or process with different methods")
394
- else:
395
- st.info("**Document:** Tesla 10K form - Financial tables extraction demo")
396
 
397
- # Extraction method selection
398
  st.markdown("### πŸ”§ Select Extraction Methods")
399
  col1, col2, col3 = st.columns(3)
400
 
401
  with col1:
402
  docling = st.checkbox("Docling",
403
- value=st.session_state.demo_selected_methods.get('docling', True),
404
  help="Advanced document processing")
405
  with col2:
406
  llamaparse = st.checkbox("LlamaParse",
407
- value=st.session_state.demo_selected_methods.get('llamaparse', False),
408
  help="AI-powered parsing")
409
  with col3:
410
  unstructured = st.checkbox("Unstructured",
411
- value=st.session_state.demo_selected_methods.get('unstructured', False),
412
  help="General purpose extraction")
413
 
414
  # Update session state
@@ -421,33 +374,13 @@ def show_demo_setup_page():
421
  # Process button
422
  col1, col2 = st.columns([2, 1])
423
  with col1:
424
- if existing_methods:
425
- # Show two buttons if results exist
426
- col_btn1, col_btn2 = st.columns(2)
427
- with col_btn1:
428
- if st.button("πŸ‘οΈ View Existing Results", type="secondary"):
429
- st.session_state.page = 'demo'
430
- st.session_state.processing = False
431
- st.session_state.demo_selected_methods = {method: method in existing_methods for method in ['docling', 'llamaparse', 'unstructured']}
432
- st.rerun()
433
-
434
- with col_btn2:
435
- if st.button("πŸš€ Process New", type="primary"):
436
- if docling or llamaparse or unstructured:
437
- st.session_state.page = 'demo'
438
- st.session_state.processing = True
439
- st.rerun()
440
- else:
441
- st.error("Please select at least one extraction method.")
442
- else:
443
- # Show single process button if no results exist
444
- if st.button("πŸš€ Process Tesla Document", type="primary"):
445
- if docling or llamaparse or unstructured:
446
- st.session_state.page = 'demo'
447
- st.session_state.processing = True
448
- st.rerun()
449
- else:
450
- st.error("Please select at least one extraction method.")
451
 
452
  with col2:
453
  if st.button("← Back to Home"):
@@ -500,7 +433,7 @@ def show_processing_demo():
500
 
501
  method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
502
 
503
- time.sleep(0.1) # Reduced sleep time for faster demo
504
 
505
  # Show completion
506
  st.markdown("""
@@ -514,18 +447,37 @@ def show_processing_demo():
514
  process_tesla_demo()
515
 
516
  st.session_state.processing = False
517
- time.sleep(1)
518
  st.rerun()
519
 
520
  def process_tesla_demo():
521
  """Process Tesla demo document using selected extraction methods"""
522
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  # For demo purposes, simulate successful processing for selected methods only
524
  results = {}
525
- selected_methods = [method for method, selected in st.session_state.demo_selected_methods.items() if selected]
526
-
527
- for method in selected_methods:
528
- results[method] = {'status': 'success', 'total_tables': 3 + hash(method) % 3} # Simulate different table counts
 
 
529
 
530
  st.session_state.demo_results = {'results': results}
531
 
@@ -534,29 +486,31 @@ def process_tesla_demo():
534
 
535
  def count_html_files(directory):
536
  """Count only HTML files in directory"""
537
- if not directory.exists():
538
  return 0
539
 
540
- html_files = list(directory.glob("**/*.html"))
 
541
  return len(html_files)
542
 
543
  def get_excel_files(directory):
544
  """Get all Excel files from directory"""
545
- if not directory.exists():
546
  return []
547
 
548
- excel_files = []
549
- for ext in ['*.xlsx', '*.xls', '*.csv']:
550
- excel_files.extend(directory.glob(f"**/{ext}"))
551
-
 
552
  return excel_files
553
 
554
  def get_file_info(file_path):
555
  """Get file information including size and modification time"""
556
- if not file_path.exists():
557
  return {"size": 0, "modified": "Unknown"}
558
 
559
- stat = file_path.stat()
560
  size_kb = stat.st_size / 1024
561
  modified = datetime.fromtimestamp(stat.st_mtime)
562
 
@@ -568,18 +522,13 @@ def get_file_info(file_path):
568
  def show_demo_results():
569
  st.markdown("## πŸ“Š Tesla 10K Processing Results")
570
 
571
- # Check for existing results
572
- existing_methods = check_existing_results()
573
-
574
  # Document info
575
  col1, col2 = st.columns([2, 1])
576
  with col1:
577
- st.markdown("### πŸ“„ Tesla 10K Document")
578
  st.markdown("**Status:** βœ… Complete")
579
- if existing_methods:
580
- st.markdown(f"**Available results:** {', '.join([m.title() for m in existing_methods])}")
581
- else:
582
- st.warning("No results found in output directory")
583
 
584
  with col2:
585
  if st.button("πŸ”„ Reset"):
@@ -591,41 +540,42 @@ def show_demo_results():
591
  st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
592
  st.rerun()
593
 
594
- # Method selection tabs - only show available methods
595
- available_methods = existing_methods
596
 
597
- if available_methods:
598
- if len(available_methods) > 1:
599
- st.markdown("### πŸ”§ Select Extraction Method to View")
600
-
601
- method_labels = {
602
- 'docling': 'πŸ”§ Docling',
603
- 'llamaparse': 'πŸ¦™ LlamaParse',
604
- 'unstructured': 'πŸ“Š Unstructured'
605
- }
606
-
607
- # Create columns based on number of available methods
608
- cols = st.columns(len(available_methods))
609
-
610
- for i, method in enumerate(available_methods):
611
- with cols[i]:
612
- # Show HTML file count for each method
613
- method_output_dir = OUTPUT_BASE_PATH / method
614
- html_count = count_html_files(method_output_dir)
615
- button_label = f"{method_labels[method]} ({html_count} HTML files)"
616
-
617
- if st.button(button_label, key=f"tab_{method}", use_container_width=True):
618
- st.session_state.selected_method = method
619
 
620
- # Default to first available method if no method selected
621
- if st.session_state.selected_method is None or st.session_state.selected_method not in available_methods:
622
- st.session_state.selected_method = available_methods[0]
 
 
623
 
624
- # Show results for selected method
625
- if st.session_state.selected_method:
626
- show_method_results(st.session_state.selected_method)
627
- else:
628
- st.info("No results found. Please process a document first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
 
630
  def show_method_results(method):
631
  st.markdown(f"### πŸ“‹ Results from {method.title()}")
@@ -647,18 +597,20 @@ def show_html_tables(method):
647
 
648
  # Get actual HTML files from directory
649
  html_files = []
650
- if method_output_dir.exists():
651
- html_files = list(method_output_dir.glob("**/*.html"))
 
 
 
652
 
653
- # Sort files by table number if possible
654
  import re
655
  def extract_table_number(filename):
656
- match = re.search(r"table[_-](\d+)", filename.name, re.IGNORECASE)
657
  if match:
658
  return int(match.group(1))
659
- return float('inf')
660
-
661
- html_files.sort(key=extract_table_number)
662
 
663
  if html_files:
664
  st.markdown(f"**Found {len(html_files)} HTML table(s):**")
@@ -670,7 +622,7 @@ def show_html_tables(method):
670
  st.markdown(f"""
671
  <div class="table-header">
672
  <h4 style="color: #495057;">πŸ“‹ Table {i+1}</h4>
673
- <small style="color: #6c757d;">File: {html_file.name}</small>
674
  </div>
675
  """, unsafe_allow_html=True)
676
 
@@ -721,7 +673,7 @@ def show_excel_files(method):
721
  for i, excel_file in enumerate(excel_files):
722
  # Get file info
723
  file_info = get_file_info(excel_file)
724
- file_name = excel_file.name
725
 
726
  # File info card
727
  st.markdown(f"""
@@ -736,35 +688,40 @@ def show_excel_files(method):
736
 
737
  # Try to read and display Excel file preview
738
  try:
739
- if excel_file.suffix.lower() in ['.xlsx', '.xls']:
740
- df = pd.read_excel(excel_file)
741
- else:
742
- df = pd.read_csv(excel_file)
743
-
744
  if not df.empty:
745
  st.markdown(f"**Preview (first 5 rows):**")
746
  st.dataframe(df.head(), use_container_width=True)
747
  st.markdown(f"**Dimensions:** {df.shape[0]} Γ— {df.shape[1]}")
748
  else:
749
- st.info("File is empty")
750
-
751
  except Exception as e:
752
- st.warning(f"Could not preview file: {e}")
 
 
 
 
 
 
 
 
 
 
753
 
754
  # Download button for Excel file
755
  try:
756
  with open(excel_file, 'rb') as f:
757
- file_data = f.read()
758
  st.download_button(
759
  label=f"⬇️ Download",
760
- data=file_data,
761
  file_name=file_name,
762
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
763
  key=f"download_excel_{method}_{i}",
764
  use_container_width=True
765
  )
766
  except Exception as e:
767
- st.error(f"Error reading file for download: {e}")
768
 
769
  if i < len(excel_files) - 1:
770
  st.markdown("---")
@@ -813,10 +770,11 @@ def process_document(file_path, output_dir, docling, llamaparse, unstructured):
813
  st.json(method_result)
814
 
815
  # List files in output directory
816
- method_dir = Path(output_dir) / selected_method
817
 
818
  # HTML files
819
- html_files = list(method_dir.glob("**/*.html"))
 
820
 
821
  # Excel files
822
  excel_files = get_excel_files(method_dir)
@@ -827,12 +785,12 @@ def process_document(file_path, output_dir, docling, llamaparse, unstructured):
827
  if html_files:
828
  st.markdown("**HTML Files:**")
829
  for html_file in html_files:
830
- st.markdown(f"- {html_file.name}")
831
 
832
  if excel_files:
833
  st.markdown("**Excel Files:**")
834
  for excel_file in excel_files:
835
- st.markdown(f"- {excel_file.name}")
836
  else:
837
  st.warning("No successful extractions found.")
838
 
@@ -858,9 +816,7 @@ def main():
858
  st.rerun()
859
  with nav_col2:
860
  st.button("History", use_container_width=True)
861
-
862
  st.markdown("---")
863
-
864
  # Route to appropriate page
865
  if st.session_state.page == 'home':
866
  show_home_page()
 
220
  if 'demo_selected_methods' not in st.session_state:
221
  st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
222
 
223
+ # Get the directory where the script is located (src)
224
+ SCRIPT_DIR = Path(__file__).parent
 
225
 
226
+ # Tesla demo document path (assuming it's in the src directory or adjust as needed)
227
+ TESLA_DOC_PATH = SCRIPT_DIR / "tesla_docs_28-41 (1)-9-14.pdf"
228
 
229
+ # Output directory is src/output
230
+ OUTPUT_BASE_PATH = SCRIPT_DIR / "output"
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  def show_home_page():
 
 
 
233
  # Header
234
  st.markdown("""
235
  <div class="main-header">
 
240
  </div>
241
  """, unsafe_allow_html=True)
242
 
 
 
 
 
243
  # Main buttons
244
  col1, col2, col3 = st.columns([1, 2, 1])
245
  with col2:
246
+ col_btn1, col_btn2 = st.columns(2)
247
+ with col_btn1:
248
+ if st.button("πŸ“„ Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
249
+ st.session_state.page = 'upload'
250
+ st.rerun()
251
+
252
+ with col_btn2:
253
+ if st.button("⚑ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
254
+ st.session_state.page = 'demo_setup'
255
+ st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  # Features section
258
  st.markdown("---")
 
296
  st.markdown("**Or specify file path:**")
297
  input_file_path = st.text_input(
298
  "Input File Path",
299
+ placeholder="C:\\path\\to\\your\\document.pdf",
300
+ help="Enter the full path to your PDF file"
301
  )
302
 
303
  # Output directory with show/hide functionality
304
  output_dir = st.text_input(
305
  "Output Directory",
306
+ placeholder="C:\\path\\to\\output\\folder",
307
  help="Directory where extracted tables will be saved",
308
  type="password" if not st.session_state.show_output_dir else "default"
309
  )
 
343
  st.markdown("## ⚑ Tesla 10K Demo Setup")
344
  st.markdown("*Configure extraction methods for Tesla's 10K document processing*")
345
 
 
 
 
346
  # Document info
347
  st.markdown("### πŸ“„ Document Information")
348
+ st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")
 
 
 
 
349
 
350
+ # Extraction method selection (removed output directory section completely)
351
  st.markdown("### πŸ”§ Select Extraction Methods")
352
  col1, col2, col3 = st.columns(3)
353
 
354
  with col1:
355
  docling = st.checkbox("Docling",
356
+ value=st.session_state.demo_selected_methods['docling'],
357
  help="Advanced document processing")
358
  with col2:
359
  llamaparse = st.checkbox("LlamaParse",
360
+ value=st.session_state.demo_selected_methods['llamaparse'],
361
  help="AI-powered parsing")
362
  with col3:
363
  unstructured = st.checkbox("Unstructured",
364
+ value=st.session_state.demo_selected_methods['unstructured'],
365
  help="General purpose extraction")
366
 
367
  # Update session state
 
374
  # Process button
375
  col1, col2 = st.columns([2, 1])
376
  with col1:
377
+ if st.button("πŸš€ Process Tesla Document", type="primary"):
378
+ if docling or llamaparse or unstructured:
379
+ st.session_state.page = 'demo'
380
+ st.session_state.processing = True
381
+ st.rerun()
382
+ else:
383
+ st.error("Please select at least one extraction method.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  with col2:
386
  if st.button("← Back to Home"):
 
433
 
434
  method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
435
 
436
+ time.sleep(0.33)
437
 
438
  # Show completion
439
  st.markdown("""
 
447
  process_tesla_demo()
448
 
449
  st.session_state.processing = False
450
+ time.sleep(2)
451
  st.rerun()
452
 
453
  def process_tesla_demo():
454
  """Process Tesla demo document using selected extraction methods"""
455
  try:
456
+ # Create output directory for demo (using the base path)
457
+ demo_output_dir = OUTPUT_BASE_PATH / "tesla_demo"
458
+
459
+ # Prepare the request data for selected methods only
460
+ data = {
461
+ 'input_file_path': str(TESLA_DOC_PATH),
462
+ 'output_dir': str(demo_output_dir),
463
+ 'docling': st.session_state.demo_selected_methods['docling'],
464
+ 'llamaparse': st.session_state.demo_selected_methods['llamaparse'],
465
+ 'unstructured': st.session_state.demo_selected_methods['unstructured']
466
+ }
467
+
468
+ # Make request to FastAPI endpoint (uncomment when ready)
469
+ # response = requests.post('http://localhost:8000/extract', data=data)
470
+ # if response.status_code == 200:
471
+ # st.session_state.demo_results = response.json()
472
+
473
  # For demo purposes, simulate successful processing for selected methods only
474
  results = {}
475
+ if st.session_state.demo_selected_methods['docling']:
476
+ results['docling'] = {'status': 'success', 'total_tables': 5}
477
+ if st.session_state.demo_selected_methods['llamaparse']:
478
+ results['llamaparse'] = {'status': 'success', 'total_tables': 3}
479
+ if st.session_state.demo_selected_methods['unstructured']:
480
+ results['unstructured'] = {'status': 'success', 'total_tables': 4}
481
 
482
  st.session_state.demo_results = {'results': results}
483
 
 
486
 
487
  def count_html_files(directory):
488
  """Count only HTML files in directory"""
489
+ if not os.path.exists(directory):
490
  return 0
491
 
492
+ html_files = glob.glob(os.path.join(str(directory), "*.html"))
493
+ html_files.extend(glob.glob(os.path.join(str(directory), "**", "*.html"), recursive=True))
494
  return len(html_files)
495
 
496
  def get_excel_files(directory):
497
  """Get all Excel files from directory"""
498
+ if not os.path.exists(directory):
499
  return []
500
 
501
+ excel_files = glob.glob(os.path.join(str(directory), "*.xlsx"))
502
+ excel_files.extend(glob.glob(os.path.join(str(directory), "*.xls")))
503
+ excel_files.extend(glob.glob(os.path.join(str(directory), "*.csv")))
504
+ excel_files.extend(glob.glob(os.path.join(str(directory), "**", "*.xlsx"), recursive=True))
505
+ excel_files.extend(glob.glob(os.path.join(str(directory), "**", "*.xls"), recursive=True))
506
  return excel_files
507
 
508
  def get_file_info(file_path):
509
  """Get file information including size and modification time"""
510
+ if not os.path.exists(file_path):
511
  return {"size": 0, "modified": "Unknown"}
512
 
513
+ stat = os.stat(file_path)
514
  size_kb = stat.st_size / 1024
515
  modified = datetime.fromtimestamp(stat.st_mtime)
516
 
 
522
  def show_demo_results():
523
  st.markdown("## πŸ“Š Tesla 10K Processing Results")
524
 
 
 
 
525
  # Document info
526
  col1, col2 = st.columns([2, 1])
527
  with col1:
528
+ st.markdown("### πŸ“„ tesla_docs_28-41 (1)-9-14.pdf")
529
  st.markdown("**Status:** βœ… Complete")
530
+ processed_methods = [method.title() for method, selected in st.session_state.demo_selected_methods.items() if selected]
531
+ st.markdown(f"**Processed with:** {', '.join(processed_methods)}")
 
 
532
 
533
  with col2:
534
  if st.button("πŸ”„ Reset"):
 
540
  st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
541
  st.rerun()
542
 
543
+ # Method selection tabs - only show selected methods
544
+ available_methods = [method for method, selected in st.session_state.demo_selected_methods.items() if selected]
545
 
546
+ if len(available_methods) > 1:
547
+ st.markdown("### πŸ”§ Select Extraction Method to View")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
 
549
+ method_labels = {
550
+ 'docling': 'πŸ”§ Docling',
551
+ 'llamaparse': 'πŸ¦™ LlamaParse',
552
+ 'unstructured': 'πŸ“Š Unstructured'
553
+ }
554
 
555
+ # Create columns based on number of available methods
556
+ cols = st.columns(len(available_methods))
557
+
558
+ for i, method in enumerate(available_methods):
559
+ with cols[i]:
560
+ # Show HTML file count for each method using the same logic as show_html_tables
561
+ method_output_dir = OUTPUT_BASE_PATH / method
562
+ html_files = []
563
+ if os.path.exists(method_output_dir):
564
+ html_files = glob.glob(os.path.join(str(method_output_dir), "**", "*.html"), recursive=True)
565
+ html_files = list(set(html_files))
566
+ html_count = len(html_files)
567
+ button_label = f"{method_labels[method]} ({html_count} HTML files)"
568
+
569
+ if st.button(button_label, key=f"tab_{method}", use_container_width=True):
570
+ st.session_state.selected_method = method
571
+
572
+ # Default to first available method if no method selected
573
+ if st.session_state.selected_method is None or st.session_state.selected_method not in available_methods:
574
+ st.session_state.selected_method = available_methods[0] if available_methods else None
575
+
576
+ # Show results for selected method
577
+ if st.session_state.selected_method:
578
+ show_method_results(st.session_state.selected_method)
579
 
580
  def show_method_results(method):
581
  st.markdown(f"### πŸ“‹ Results from {method.title()}")
 
597
 
598
  # Get actual HTML files from directory
599
  html_files = []
600
+ if os.path.exists(method_output_dir):
601
+ # Use only the recursive glob, which includes the top-level directory
602
+ html_files = glob.glob(os.path.join(str(method_output_dir), "**", "*.html"), recursive=True)
603
+ # Remove duplicates just in case
604
+ html_files = list(set(html_files))
605
 
606
+ # Sort files by table number if possible (e.g., table_1, table_2, ...)
607
  import re
608
  def extract_table_number(filename):
609
+ match = re.search(r"table[_-](\d+)", filename, re.IGNORECASE)
610
  if match:
611
  return int(match.group(1))
612
+ return float('inf') # Put files without a number at the end
613
+ html_files.sort(key=lambda f: extract_table_number(os.path.basename(f)))
 
614
 
615
  if html_files:
616
  st.markdown(f"**Found {len(html_files)} HTML table(s):**")
 
622
  st.markdown(f"""
623
  <div class="table-header">
624
  <h4 style="color: #495057;">πŸ“‹ Table {i+1}</h4>
625
+ <small style="color: #6c757d;">File: {os.path.basename(html_file)}</small>
626
  </div>
627
  """, unsafe_allow_html=True)
628
 
 
673
  for i, excel_file in enumerate(excel_files):
674
  # Get file info
675
  file_info = get_file_info(excel_file)
676
+ file_name = os.path.basename(excel_file)
677
 
678
  # File info card
679
  st.markdown(f"""
 
688
 
689
  # Try to read and display Excel file preview
690
  try:
691
+ df = pd.read_excel(excel_file)
 
 
 
 
692
  if not df.empty:
693
  st.markdown(f"**Preview (first 5 rows):**")
694
  st.dataframe(df.head(), use_container_width=True)
695
  st.markdown(f"**Dimensions:** {df.shape[0]} Γ— {df.shape[1]}")
696
  else:
697
+ st.info("Excel file is empty")
 
698
  except Exception as e:
699
+ # Try reading as CSV if Excel reading fails
700
+ try:
701
+ df = pd.read_csv(excel_file)
702
+ if not df.empty:
703
+ st.markdown(f"**Preview (first 5 rows, read as CSV):**")
704
+ st.dataframe(df.head(), use_container_width=True)
705
+ st.markdown(f"**Dimensions:** {df.shape[0]} Γ— {df.shape[1]}")
706
+ else:
707
+ st.info("CSV file is empty")
708
+ except Exception as e2:
709
+ st.warning(f"Could not preview file as Excel or CSV: {e2}")
710
 
711
  # Download button for Excel file
712
  try:
713
  with open(excel_file, 'rb') as f:
714
+ excel_data = f.read()
715
  st.download_button(
716
  label=f"⬇️ Download",
717
+ data=excel_data,
718
  file_name=file_name,
719
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
720
  key=f"download_excel_{method}_{i}",
721
  use_container_width=True
722
  )
723
  except Exception as e:
724
+ st.error(f"Error reading Excel file for download: {e}")
725
 
726
  if i < len(excel_files) - 1:
727
  st.markdown("---")
 
770
  st.json(method_result)
771
 
772
  # List files in output directory
773
+ method_dir = os.path.join(output_dir, selected_method)
774
 
775
  # HTML files
776
+ html_files = glob.glob(os.path.join(method_dir, "*.html"))
777
+ html_files.extend(glob.glob(os.path.join(method_dir, "**", "*.html"), recursive=True))
778
 
779
  # Excel files
780
  excel_files = get_excel_files(method_dir)
 
785
  if html_files:
786
  st.markdown("**HTML Files:**")
787
  for html_file in html_files:
788
+ st.markdown(f"- {os.path.basename(html_file)}")
789
 
790
  if excel_files:
791
  st.markdown("**Excel Files:**")
792
  for excel_file in excel_files:
793
+ st.markdown(f"- {os.path.basename(excel_file)}")
794
  else:
795
  st.warning("No successful extractions found.")
796
 
 
816
  st.rerun()
817
  with nav_col2:
818
  st.button("History", use_container_width=True)
 
819
  st.markdown("---")
 
820
  # Route to appropriate page
821
  if st.session_state.page == 'home':
822
  show_home_page()