kambris committed on
Commit d1c8665 · verified · 1 Parent(s): db177e2

Update app.py

Files changed (1):
  1. app.py +667 -349

app.py CHANGED
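This revision replaces the requests-only scraper with a Selenium-driven one, so the Space now needs a headless Chrome binary plus the selenium and webdriver_manager packages at runtime. A minimal smoke test for that environment is sketched below; the package names and the browse URL come from the diff, while the option set and entry point are illustrative assumptions, not part of the commit.

# Hypothetical environment check for the headless-Chrome setup this commit relies on.
# Only the imports and the URL mirror app.py; everything else is an illustrative sketch.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def chrome_smoke_test() -> bool:
    """Return True if a headless Chrome session can start and load the Manus site."""
    opts = Options()
    opts.add_argument("--headless")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
        driver.get("https://manus.iccu.sbn.it/en/copisti2")
        ok = bool(driver.title)
        driver.quit()
        return ok
    except Exception as exc:
        print(f"Headless Chrome unavailable: {exc}")
        return False

if __name__ == "__main__":
    print("Chrome OK" if chrome_smoke_test() else "Chrome missing")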
@@ -9,7 +9,450 @@ from typing import Dict, List, Optional
9
  import json
10
  import io
11
  from datetime import datetime
12
- import re
13
 
14
  class ManusCopistaMetadataScraper:
15
  def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
@@ -68,39 +511,45 @@ class ManusCopistaMetadataScraper:
68
  progress_callback(f"Found {len(all_ids)} copyist IDs.")
69
 
70
  return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)
71
-
72
 
73
  def extract_copyist_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
74
- """Extract copyist IDs from a single page"""
75
  ids = set()
76
-
77
  if not soup:
78
  return []
79
-
80
- # Look for ALL links on the page
81
- links = soup.find_all('a', href=True)
82
-
 
 
83
  for link in links:
84
- href = link.get('href', '')
85
- # Check if this is a copyist detail link
86
- if 'manus-authorities/detail/' in href or '/detail/' in href:
87
- copyist_id = self.extract_copyist_id_from_url(href)
88
- if copyist_id:
89
- ids.add(copyist_id)
90
-
91
- # Also check for JavaScript-generated content or data attributes
92
- # Look for script tags that might contain copyist IDs
93
- scripts = soup.find_all('script')
94
- for script in scripts:
95
- script_text = script.string if script.string else ''
96
- # Look for ID patterns in JavaScript
97
- id_matches = re.findall(r'\b\d{5,7}\b', script_text)
98
- for match in id_matches:
99
- if self.is_potential_copyist_id(match):
100
- ids.add(match)
101
-
102
  return list(ids)
104
  def is_potential_copyist_id(self, id_str: str) -> bool:
105
  """Check if a string looks like a copyist ID"""
106
  if not id_str or not id_str.isdigit():
@@ -116,28 +565,6 @@ class ManusCopistaMetadataScraper:
116
  return 100000 <= id_num <= 999999
117
  except ValueError:
118
  return False
119
-
120
- def extract_copyist_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
121
- """Extract copyist IDs from the table with id 'authorities-results-content'"""
122
- ids = set()
123
-
124
- if not soup:
125
- return []
126
-
127
- table_body = soup.find('tbody', id='authorities-results-content')
128
- if not table_body:
129
- return []
130
-
131
- links = table_body.find_all('a', href=True)
132
- for link in links:
133
- href = link['href']
134
- if 'detail/' in href:
135
- match = re.search(r'detail/(\d+)', href)
136
- if match:
137
- ids.add(match.group(1))
138
-
139
- return list(ids)
140
-
141
 
142
  def is_valid_copyist_id(self, id_str: str) -> bool:
143
  """Check if an ID corresponds to a valid copyist page"""
@@ -313,20 +740,20 @@ class ManusCopistaMetadataScraper:
313
  if not title_cell:
314
  continue
315
 
316
- # Get the field name
317
  title_div = title_cell.find('div', class_='table-title-item')
318
  if not title_div:
319
  continue
320
 
321
  field_name = title_div.get_text(strip=True)
322
 
323
- # Get the data cell
324
  data_cells = row.find_all('td')
325
- data_cell = data_cells[1] if len(data_cells) > 1 else None
326
- if not data_cell:
327
  continue
328
 
329
- # Extract data based on the cell content
 
 
330
  self.extract_cell_data(field_name, data_cell, metadata)
331
 
332
  except (AttributeError, IndexError):
@@ -335,16 +762,18 @@ class ManusCopistaMetadataScraper:
335
  return metadata
336
 
337
  def extract_cell_data(self, field_name: str, data_cell, metadata: Dict):
338
- """Extract data from a table cell based on its class and content - Updated for single-row CSV"""
339
  try:
340
  cell_classes = data_cell.get('class', [])
341
 
 
342
  if 'table-text' in cell_classes:
343
  text_item = data_cell.find('div', class_='table-text-item')
344
  if text_item:
345
  value = text_item.get_text(strip=True)
346
  self.map_field_value(field_name, value, metadata)
347
 
 
348
  elif 'table-link' in cell_classes:
349
  text_item = data_cell.find('div', class_='table-text-item')
350
  if text_item:
@@ -354,16 +783,18 @@ class ManusCopistaMetadataScraper:
354
  link_url = link.get('href', '')
355
  self.map_field_link(field_name, link_text, link_url, metadata)
356
  else:
 
357
  value = text_item.get_text(strip=True)
358
  self.map_field_value(field_name, value, metadata)
359
 
 
360
  elif 'table-list' in cell_classes:
361
  values = []
362
- # Look for table-list-item containers first
 
363
  list_containers = data_cell.find_all('div', class_='table-list-item')
364
 
365
  if list_containers:
366
- # Process each list container
367
  for container in list_containers:
368
  text_items = container.find_all('div', class_='table-text-item')
369
  for item in text_items:
@@ -383,7 +814,7 @@ class ManusCopistaMetadataScraper:
383
  except AttributeError:
384
  continue
385
  else:
386
- # Fallback: look for direct table-text-item elements
387
  text_items = data_cell.find_all('div', class_='table-text-item')
388
  for item in text_items:
389
  try:
@@ -402,14 +833,13 @@ class ManusCopistaMetadataScraper:
402
  except AttributeError:
403
  continue
404
 
405
- # Join all values with semicolon separator for single-row CSV
406
  self.map_field_list(field_name, values, metadata)
407
 
 
408
  elif 'table-text-html' in cell_classes:
409
  text_item = data_cell.find('div', class_='table-text-item')
410
  if text_item:
411
- # For HTML content, get text but preserve some formatting
412
- # Clean up the text and remove extra whitespace
413
  value = ' '.join(text_item.get_text(strip=True).split())
414
  self.map_field_value(field_name, value, metadata)
415
 
@@ -417,7 +847,7 @@ class ManusCopistaMetadataScraper:
417
  pass
418
 
419
  def map_field_value(self, field_name: str, value: str, metadata: Dict):
420
- """Map field names to metadata dictionary keys"""
421
  field_mapping = {
422
  'CNMN code': 'cnmn_code',
423
  'Date of creation': 'date_of_creation',
@@ -431,7 +861,7 @@ class ManusCopistaMetadataScraper:
431
  metadata[mapped_key] = value
432
 
433
  def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
434
- """Map field names with links to metadata dictionary"""
435
  if field_name == 'VID SBN':
436
  metadata['vid_sbn'] = link_text
437
  metadata['vid_sbn_url'] = link_url
@@ -440,8 +870,7 @@ class ManusCopistaMetadataScraper:
440
  metadata['isni_url'] = link_url
441
 
442
  def map_field_list(self, field_name: str, values: List, metadata: Dict):
443
- """Map field names with multiple values to metadata dictionary - Updated for single-row CSV"""
444
- # Join multiple values with semicolon separator
445
  joined_values = '; '.join(str(v) for v in values if v)
446
 
447
  if field_name == 'Other identifiers':
@@ -451,8 +880,50 @@ class ManusCopistaMetadataScraper:
451
  elif field_name == 'Names in manuscript':
452
  metadata['names_in_manuscript'] = joined_values
453
 
454
  def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
455
- """Scrape all available copyist metadata with progress updates"""
456
  try:
457
  # Discover all copyist IDs
458
  copyist_ids = self.discover_all_copyist_ids(progress_callback)
@@ -463,312 +934,159 @@ class ManusCopistaMetadataScraper:
463
  if progress_callback:
464
  progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
465
 
466
- # Limit entries if specified (for testing)
467
  if max_entries and max_entries > 0:
468
  copyist_ids = copyist_ids[:max_entries]
469
  if progress_callback:
470
  progress_callback(f"Limited to first {max_entries} entries for testing")
471
 
472
- # Process each copyist
473
- all_metadata = []
474
- total_ids = len(copyist_ids)
475
- successful_scrapes = 0
476
- failed_scrapes = 0
477
 
478
- for i, copyist_id in enumerate(copyist_ids, 1):
479
- if progress_callback:
480
- progress_callback(f"Processing {i}/{total_ids}: Copyist ID {copyist_id}")
481
-
482
- # Construct detail URL
483
- detail_url = f"{self.detail_base_url}{copyist_id}"
484
-
485
- # Get detailed metadata
486
- detail_soup = self.get_page_content(detail_url)
487
-
488
- if detail_soup:
489
- metadata = self.extract_metadata_from_table(detail_soup)
490
-
491
- # Combine with basic info
492
- combined_data = {
493
- 'copyist_id': copyist_id,
494
- 'detail_url': detail_url,
495
- 'scrape_order': i,
496
- 'scrape_timestamp': datetime.now().isoformat(),
497
- **metadata
498
- }
499
-
500
- all_metadata.append(combined_data)
501
- successful_scrapes += 1
502
- else:
503
- failed_scrapes += 1
504
- if progress_callback:
505
- progress_callback(f"Failed to fetch data for copyist ID {copyist_id}")
506
-
507
- # Progress update every 50 records
508
- if i % 50 == 0 and progress_callback:
509
- progress_callback(f"Progress: {i}/{total_ids} processed. Success: {successful_scrapes}, Failed: {failed_scrapes}")
510
-
511
- # Be respectful with delays
512
- if delay > 0:
513
- time.sleep(delay)
514
-
515
- df = pd.DataFrame(all_metadata)
516
- success_msg = f"Successfully scraped {successful_scrapes} copyist records. Failed: {failed_scrapes}. Total discovered: {total_ids}"
517
  return df, success_msg
518
 
519
  except Exception as e:
520
  return pd.DataFrame(), f"Error during scraping: {str(e)}"
521
 
522
 
523
- def test_url_pattern_extraction():
524
- """Test URL pattern extraction - moved outside the class"""
525
- scraper = ManusCopistaMetadataScraper()
526
-
527
- test_urls = [
528
- "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/183323",
529
- "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/154985",
530
- "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/177035"
531
- ]
532
-
533
- print("Testing URL pattern extraction:")
534
- print("=" * 50)
535
-
536
- for url in test_urls:
537
- extracted_id = scraper.extract_copyist_id_from_url(url)
538
- print(f"URL: {url}")
539
- print(f"Extracted ID: {extracted_id}")
540
- print(f"Match: {'✓' if extracted_id else '✗'}")
541
- print("-" * 30)
542
-
543
- # Test each pattern individually
544
- print("\nTesting individual patterns:")
545
- print("=" * 50)
546
-
547
- test_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/183323"
548
- patterns = [
549
- r'/manus-authorities/detail/(\d+)',
550
- r'copisti2.*?detail/(\d+)',
551
- r'/detail/(\d+)',
552
- r'authorities.*?(\d{5,7})',
553
- r'copista.*?(\d{5,7})'
554
- ]
555
-
556
- for i, pattern in enumerate(patterns, 1):
557
- match = re.search(pattern, test_url, re.IGNORECASE)
558
- if match:
559
- print(f"Pattern {i}: {pattern} → Matches: {match.group(1)}")
560
- else:
561
- print(f"Pattern {i}: {pattern} → No match")
562
-
563
-
564
- def test_discovery_interface():
565
- """Test the discovery method"""
566
- scraper = ManusCopistaMetadataScraper()
567
-
568
- progress_updates = []
569
- def progress_callback(msg):
570
- progress_updates.append(msg)
571
- print(msg)
572
 
573
- results = scraper.test_discovery_method(progress_callback)
574
- return str(results), "\n".join(progress_updates)
575
-
576
- def scrape_interface(delay, test_mode, test_entries):
577
- """Gradio interface function"""
578
- try:
579
- # Validate inputs
580
- if delay < 0.5 or delay > 10:
581
- return None, "Please enter a delay between 0.5 and 10 seconds"
582
-
583
- max_entries = None
584
- if test_mode:
585
- if test_entries < 1 or test_entries > 100:
586
- return None, "Please enter a number between 1 and 100 for test entries"
587
- max_entries = int(test_entries)
588
-
589
- scraper = ManusCopistaMetadataScraper()
590
-
591
- # Create progress updates
592
- progress_updates = []
593
- def progress_callback(msg):
594
- progress_updates.append(msg)
595
- print(msg) # Also print to console
596
-
597
- df, message = scraper.scrape_all_copyists_with_progress(
598
- delay=float(delay),
599
- max_entries=max_entries,
600
- progress_callback=progress_callback
601
- )
602
-
603
- if df.empty:
604
- return None, f"Scraping failed: {message}"
605
-
606
- # Select key columns for display
607
- display_columns = [
608
- 'copyist_id', 'copyist_name', 'cnmn_code', 'vid_sbn',
609
- 'isni_code', 'biographical_note', 'date_of_creation'
610
- ]
611
-
612
- # Only include columns that exist
613
- available_columns = [col for col in display_columns if col in df.columns]
614
- display_df = df[available_columns]
615
-
616
- return display_df, f"{message}. Data ready for download."
617
-
618
- except Exception as e:
619
- return None, f"Error: {str(e)}"
620
-
621
- def download_csv(delay, test_mode, test_entries):
622
- """Generate CSV file for download"""
623
- try:
624
- max_entries = None
625
- if test_mode:
626
- max_entries = int(test_entries)
627
-
628
- scraper = ManusCopistaMetadataScraper()
629
- df, message = scraper.scrape_all_copyists_with_progress(
630
- delay=float(delay),
631
- max_entries=max_entries
632
- )
633
-
634
- if df.empty:
635
- return None
636
-
637
- # Save to temporary file
638
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
639
- filename = f"manus_copyists_complete_{timestamp}.csv"
640
- df.to_csv(filename, index=False)
641
-
642
- return filename
643
-
644
- except Exception as e:
645
- return None
646
-
647
- # Create Gradio interface
648
- with gr.Blocks(title="Manus Copista Complete Scraper", theme=gr.themes.Soft()) as demo:
649
- gr.Markdown(
650
- """
651
- # 📜 Manus Copista Complete Metadata Scraper
652
-
653
- This tool discovers and scrapes metadata for ALL available medieval copyists from the Manus database.
654
-
655
- **Updated for Single-Row CSV Output:**
656
- - All table data is properly extracted into single CSV rows
657
- - List fields (identifiers, sources, names) are joined with semicolons
658
- - Links are preserved with format: "text (url)"
659
- - HTML content is cleaned and formatted properly
660
-
661
- **What it does:**
662
- 1. Automatically discovers all copyist IDs from the database
663
- 2. Scrapes detailed metadata from each copyist's detail page
664
- 3. Exports complete dataset as CSV with all content in single rows
665
- """
666
- )
667
-
668
- with gr.Row():
669
- with gr.Column():
670
- test_discovery_btn = gr.Button("🔍 Test Discovery Method", variant="secondary")
671
-
672
- delay_input = gr.Number(
673
- label="Delay Between Requests (seconds)",
674
- value=2.0,
675
- minimum=0.5,
676
- maximum=10.0,
677
- step=0.1,
678
- info="Delay between requests to be respectful to the server"
679
- )
680
 
681
- test_mode = gr.Checkbox(
682
- label="Test Mode (Limited Records)",
683
- value=True,
684
- info="Enable to test with limited records first"
 
685
  )
686
 
687
- test_entries_input = gr.Number(
688
- label="Test Mode: Number of Records",
689
- value=10,
690
- minimum=1,
691
- maximum=100,
692
- step=1,
693
- info="Number of records to scrape in test mode",
694
- visible=True
695
- )
696
 
697
- scrape_btn = gr.Button("🔍 Start Scraping", variant="primary", size="lg")
698
-
699
- with gr.Column():
700
- gr.Markdown(
701
- """
702
- ### Instructions:
703
- 1. **Test Discovery**: First test the ID discovery method
704
- 2. **Test First**: Start with test mode (10-20 records)
705
- 3. **Set Delay**: Use 2+ seconds to be respectful
706
- 4. **Full Scrape**: Disable test mode for complete dataset
707
- 5. **Monitor Progress**: Check status messages
708
- 6. **Download**: Get complete CSV results
709
-
710
- ### CSV Format:
711
- - Each copyist = one row
712
- - Multiple values joined with semicolons
713
- - Links preserved as "text (url)"
714
- - Clean, structured output
715
- """
716
  )
717
 
718
- # Hide/show test entries based on test mode
719
- test_mode.change(
720
- fn=lambda x: gr.update(visible=x),
721
- inputs=[test_mode],
722
- outputs=[test_entries_input]
723
- )
724
-
725
- # Output components
726
- with gr.Row():
727
- with gr.Column():
728
- status_output = gr.Textbox(
729
- label="Status Messages",
730
- lines=10,
731
- max_lines=20,
732
- info="Real-time progress updates"
733
  )
 
734
 
735
- with gr.Column():
736
- discovery_output = gr.Textbox(
737
- label="Discovery Test Results",
738
- lines=10,
739
- max_lines=20,
740
- info="Results from testing the discovery method"
741
  )
742
- # Data display and download
743
- with gr.Row():
744
- data_output = gr.DataFrame(
745
- label="Scraped Data Preview",
746
- interactive=False,
747
- wrap=True
748
- )
749
 
750
- with gr.Row():
751
- download_btn = gr.Button("📥 Download Complete CSV", variant="secondary")
752
- csv_file = gr.File(label="Download CSV File", visible=False)
753
-
754
- # Event handlers
755
- test_discovery_btn.click(
756
- fn=test_discovery_interface,
757
- inputs=[],
758
- outputs=[discovery_output, status_output]
759
- )
760
-
761
- scrape_btn.click(
762
- fn=scrape_interface,
763
- inputs=[delay_input, test_mode, test_entries_input],
764
- outputs=[data_output, status_output]
765
- )
766
-
767
- download_btn.click(
768
- fn=download_csv,
769
- inputs=[delay_input, test_mode, test_entries_input],
770
- outputs=[csv_file]
771
- )
772
 
 
773
  if __name__ == "__main__":
774
- demo.launch(share=True)
 
9
  import json
10
  import io
11
  from datetime import datetime
12
+ import os
+ import re  # still required: re.search() is used below to pull copyist IDs out of detail URLs
13
+
14
+ # Selenium imports
15
+ from selenium import webdriver
16
+ from selenium.webdriver.common.by import By
17
+ from selenium.webdriver.support.ui import WebDriverWait
18
+ from selenium.webdriver.support import expected_conditions as EC
19
+ from selenium.webdriver.chrome.options import Options
20
+ from selenium.common.exceptions import TimeoutException, WebDriverException
21
+ from webdriver_manager.chrome import ChromeDriverManager
22
+
23
+ class ManusCopistaSeleniumScraper:
24
+ def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
25
+ self.base_url = base_url
26
+ self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
27
+ self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"
28
+ self.driver = None
29
+ self.setup_driver()
30
+
31
+ def setup_driver(self):
32
+ """Setup Chrome driver with appropriate options"""
33
+ chrome_options = Options()
34
+ chrome_options.add_argument("--headless")
35
+ chrome_options.add_argument("--no-sandbox")
36
+ chrome_options.add_argument("--disable-dev-shm-usage")
37
+ chrome_options.add_argument("--disable-gpu")
38
+ chrome_options.add_argument("--window-size=1920,1080")
39
+ chrome_options.add_argument("--disable-extensions")
40
+ chrome_options.add_argument("--disable-plugins")
41
+ chrome_options.add_argument("--disable-images")
42
+ chrome_options.add_argument("--disable-javascript")
43
+ chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
44
+
45
+ try:
46
+ # Try to use ChromeDriverManager for automatic driver management
47
+ service = webdriver.ChromeService(ChromeDriverManager().install())
48
+ self.driver = webdriver.Chrome(service=service, options=chrome_options)
49
+ except Exception as e:
50
+ print(f"Error setting up ChromeDriver with manager: {e}")
51
+ try:
52
+ # Fallback to system Chrome driver
53
+ self.driver = webdriver.Chrome(options=chrome_options)
54
+ except Exception as e2:
55
+ print(f"Error setting up system ChromeDriver: {e2}")
56
+ raise Exception("Could not initialize Chrome driver")
57
+
58
+ def get_page_with_selenium(self, url: str, wait_for_element: str = None, timeout: int = 10) -> Optional[BeautifulSoup]:
59
+ """Get page content using Selenium to handle JavaScript"""
60
+ try:
61
+ self.driver.get(url)
62
+
63
+ # Wait for specific element if provided
64
+ if wait_for_element:
65
+ WebDriverWait(self.driver, timeout).until(
66
+ EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_element))
67
+ )
68
+ else:
69
+ # Default wait for page to load
70
+ time.sleep(3)
71
+
72
+ # Get page source and parse with BeautifulSoup
73
+ page_source = self.driver.page_source
74
+ return BeautifulSoup(page_source, 'html.parser')
75
+
76
+ except TimeoutException:
77
+ print(f"Timeout waiting for element {wait_for_element} on {url}")
78
+ return None
79
+ except WebDriverException as e:
80
+ print(f"WebDriver error on {url}: {e}")
81
+ return None
82
+ except Exception as e:
83
+ print(f"Error fetching {url}: {e}")
84
+ return None
85
+
86
+ def discover_all_copyist_ids(self, progress_callback=None) -> List[str]:
87
+ """Discover all copyist IDs from the browse page using Selenium"""
88
+ all_ids = set()
89
+
90
+ if progress_callback:
91
+ progress_callback(f"Fetching copyist list from: {self.browse_url}")
92
+
93
+ # Wait for the results table to load
94
+ soup = self.get_page_with_selenium(
95
+ self.browse_url,
96
+ wait_for_element="tbody#authorities-results-content",
97
+ timeout=15
98
+ )
99
+
100
+ if not soup:
101
+ if progress_callback:
102
+ progress_callback("Failed to fetch the copyist list page.")
103
+ return []
104
+
105
+ # Extract IDs from the table
106
+ page_ids = self.extract_copyist_ids_from_table(soup)
107
+ all_ids.update(page_ids)
108
+
109
+ if progress_callback:
110
+ progress_callback(f"Found {len(all_ids)} copyist IDs from main page.")
111
+
112
+ # Check for pagination and get additional pages
113
+ pagination_handled = self.handle_pagination(soup, all_ids, progress_callback)
114
+
115
+ if progress_callback:
116
+ progress_callback(f"Total copyist IDs discovered: {len(all_ids)}")
117
+
118
+ return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)
119
+
120
+ def extract_copyist_ids_from_table(self, soup: BeautifulSoup) -> List[str]:
121
+ """Extract copyist IDs from the results table"""
122
+ ids = set()
123
+
124
+ if not soup:
125
+ return []
126
+
127
+ # Look for the specific table body
128
+ table_body = soup.find('tbody', id='authorities-results-content')
129
+ if not table_body:
130
+ # Fallback: look for any table with copyist links
131
+ table_body = soup.find('tbody')
132
+
133
+ if not table_body:
134
+ return []
135
+
136
+ # Find all links in the table
137
+ links = table_body.find_all('a', href=True)
138
+ for link in links:
139
+ href = link['href']
140
+ if 'detail/' in href:
141
+ match = re.search(r'detail/(\d+)', href)
142
+ if match:
143
+ ids.add(match.group(1))
144
+
145
+ return list(ids)
146
+
147
+ def handle_pagination(self, soup: BeautifulSoup, all_ids: set, progress_callback=None) -> bool:
148
+ """Handle pagination to get all copyist IDs"""
149
+ try:
150
+ # Look for pagination controls
151
+ pagination_links = soup.find_all('a', href=True)
152
+ next_page_found = False
153
+
154
+ for link in pagination_links:
155
+ link_text = link.get_text(strip=True).lower()
156
+ href = link.get('href', '')
157
+
158
+ # Look for "next" or page numbers
159
+ if ('next' in link_text or 'seguente' in link_text or
160
+ (link_text.isdigit() and int(link_text) > 1)):
161
+
162
+ next_page_found = True
163
+ if progress_callback:
164
+ progress_callback(f"Found pagination link: {link_text}")
165
+
166
+ # Navigate to next page
167
+ full_url = urljoin(self.base_url, href)
168
+ next_soup = self.get_page_with_selenium(
169
+ full_url,
170
+ wait_for_element="tbody#authorities-results-content",
171
+ timeout=15
172
+ )
173
+
174
+ if next_soup:
175
+ new_ids = self.extract_copyist_ids_from_table(next_soup)
176
+ all_ids.update(new_ids)
177
+ if progress_callback:
178
+ progress_callback(f"Added {len(new_ids)} IDs from pagination page")
179
+
180
+ # Recursively handle more pagination
181
+ self.handle_pagination(next_soup, all_ids, progress_callback)
182
+ break
183
+
184
+ return next_page_found
185
+
186
+ except Exception as e:
187
+ if progress_callback:
188
+ progress_callback(f"Error handling pagination: {e}")
189
+ return False
190
+
191
+ def extract_metadata_from_table(self, soup: BeautifulSoup) -> Dict:
192
+ """Extract metadata from the copyist detail page"""
193
+ metadata = {
194
+ 'cnmn_code': '',
195
+ 'vid_sbn': '',
196
+ 'vid_sbn_url': '',
197
+ 'isni_code': '',
198
+ 'isni_url': '',
199
+ 'other_identifiers': '',
200
+ 'biographical_note': '',
201
+ 'bibliographical_sources': '',
202
+ 'bibliographical_notes': '',
203
+ 'names_in_manuscript': '',
204
+ 'date_of_creation': '',
205
+ 'last_modification': '',
206
+ 'page_title': '',
207
+ 'copyist_name': ''
208
+ }
209
+
210
+ if not soup:
211
+ return metadata
212
+
213
+ # Extract page title
214
+ title_tag = soup.find('title')
215
+ if title_tag:
216
+ metadata['page_title'] = title_tag.get_text(strip=True)
217
+
218
+ # Try to extract copyist name
219
+ name_selectors = [
220
+ 'h1', 'h2', '.title', '.copyist-name',
221
+ '[class*="name"]', '[class*="title"]'
222
+ ]
223
+
224
+ for selector in name_selectors:
225
+ element = soup.select_one(selector)
226
+ if element:
227
+ name_text = element.get_text(strip=True)
228
+ if name_text and len(name_text) > 2:
229
+ metadata['copyist_name'] = name_text
230
+ break
231
+
232
+ # Find the main data table
233
+ main_table = soup.find('table', class_=['table', 'table-1', 'table-sm'])
234
+ if not main_table:
235
+ main_table = soup.find('table')
236
+
237
+ if not main_table:
238
+ return metadata
239
+
240
+ # Process table rows
241
+ rows = main_table.find_all('tr')
242
+ for row in rows:
243
+ try:
244
+ title_cell = row.find('td', class_='table-title')
245
+ if not title_cell:
246
+ continue
247
+
248
+ title_div = title_cell.find('div', class_='table-title-item')
249
+ if not title_div:
250
+ continue
251
+
252
+ field_name = title_div.get_text(strip=True)
253
+
254
+ data_cells = row.find_all('td')
255
+ data_cell = data_cells[1] if len(data_cells) > 1 else None
256
+ if not data_cell:
257
+ continue
258
+
259
+ self.extract_cell_data(field_name, data_cell, metadata)
260
+
261
+ except (AttributeError, IndexError):
262
+ continue
263
+
264
+ return metadata
265
+
266
+ def extract_cell_data(self, field_name: str, data_cell, metadata: Dict):
267
+ """Extract data from table cells"""
268
+ try:
269
+ cell_classes = data_cell.get('class', [])
270
+
271
+ if 'table-text' in cell_classes:
272
+ text_item = data_cell.find('div', class_='table-text-item')
273
+ if text_item:
274
+ value = text_item.get_text(strip=True)
275
+ self.map_field_value(field_name, value, metadata)
276
+
277
+ elif 'table-link' in cell_classes:
278
+ text_item = data_cell.find('div', class_='table-text-item')
279
+ if text_item:
280
+ link = text_item.find('a')
281
+ if link:
282
+ link_text = link.get_text(strip=True)
283
+ link_url = link.get('href', '')
284
+ self.map_field_link(field_name, link_text, link_url, metadata)
285
+ else:
286
+ value = text_item.get_text(strip=True)
287
+ self.map_field_value(field_name, value, metadata)
288
+
289
+ elif 'table-list' in cell_classes:
290
+ values = []
291
+ list_containers = data_cell.find_all('div', class_='table-list-item')
292
+
293
+ if list_containers:
294
+ for container in list_containers:
295
+ text_items = container.find_all('div', class_='table-text-item')
296
+ for item in text_items:
297
+ try:
298
+ link = item.find('a')
299
+ if link:
300
+ link_text = link.get_text(strip=True)
301
+ link_url = link.get('href', '')
302
+ if link_url:
303
+ values.append(f"{link_text} ({link_url})")
304
+ else:
305
+ values.append(link_text)
306
+ else:
307
+ text = item.get_text(strip=True)
308
+ if text:
309
+ values.append(text)
310
+ except AttributeError:
311
+ continue
312
+ else:
313
+ text_items = data_cell.find_all('div', class_='table-text-item')
314
+ for item in text_items:
315
+ try:
316
+ link = item.find('a')
317
+ if link:
318
+ link_text = link.get_text(strip=True)
319
+ link_url = link.get('href', '')
320
+ if link_url:
321
+ values.append(f"{link_text} ({link_url})")
322
+ else:
323
+ values.append(link_text)
324
+ else:
325
+ text = item.get_text(strip=True)
326
+ if text:
327
+ values.append(text)
328
+ except AttributeError:
329
+ continue
330
+
331
+ self.map_field_list(field_name, values, metadata)
332
+
333
+ elif 'table-text-html' in cell_classes:
334
+ text_item = data_cell.find('div', class_='table-text-item')
335
+ if text_item:
336
+ value = ' '.join(text_item.get_text(strip=True).split())
337
+ self.map_field_value(field_name, value, metadata)
338
+
339
+ except (AttributeError, TypeError):
340
+ pass
341
+
342
+ def map_field_value(self, field_name: str, value: str, metadata: Dict):
343
+ """Map field values to metadata keys"""
344
+ field_mapping = {
345
+ 'CNMN code': 'cnmn_code',
346
+ 'Date of creation': 'date_of_creation',
347
+ 'Last modification': 'last_modification',
348
+ 'Biographical note': 'biographical_note',
349
+ 'Bibliographical notes': 'bibliographical_notes'
350
+ }
351
+
352
+ mapped_key = field_mapping.get(field_name)
353
+ if mapped_key and mapped_key in metadata:
354
+ metadata[mapped_key] = value
355
+
356
+ def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
357
+ """Map field links to metadata"""
358
+ if field_name == 'VID SBN':
359
+ metadata['vid_sbn'] = link_text
360
+ metadata['vid_sbn_url'] = link_url
361
+ elif field_name == 'Codice ISNI':
362
+ metadata['isni_code'] = link_text
363
+ metadata['isni_url'] = link_url
364
+
365
+ def map_field_list(self, field_name: str, values: List, metadata: Dict):
366
+ """Map field lists to metadata"""
367
+ joined_values = '; '.join(str(v) for v in values if v)
368
+
369
+ if field_name == 'Other identifiers':
370
+ metadata['other_identifiers'] = joined_values
371
+ elif field_name == 'Bibliographical sources':
372
+ metadata['bibliographical_sources'] = joined_values
373
+ elif field_name == 'Names in manuscript':
374
+ metadata['names_in_manuscript'] = joined_values
375
+
376
+ def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
377
+ """Scrape all copyists with Selenium"""
378
+ try:
379
+ # Discover all copyist IDs
380
+ copyist_ids = self.discover_all_copyist_ids(progress_callback)
381
+
382
+ if not copyist_ids:
383
+ return pd.DataFrame(), "No copyist IDs found"
384
+
385
+ if progress_callback:
386
+ progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
387
+
388
+ # Limit entries if specified
389
+ if max_entries and max_entries > 0:
390
+ copyist_ids = copyist_ids[:max_entries]
391
+ if progress_callback:
392
+ progress_callback(f"Limited to first {max_entries} entries for testing")
393
+
394
+ # Process each copyist
395
+ all_metadata = []
396
+ total_ids = len(copyist_ids)
397
+ successful_scrapes = 0
398
+ failed_scrapes = 0
399
+
400
+ for i, copyist_id in enumerate(copyist_ids, 1):
401
+ if progress_callback:
402
+ progress_callback(f"Processing {i}/{total_ids}: Copyist ID {copyist_id}")
403
+
404
+ detail_url = f"{self.detail_base_url}{copyist_id}"
405
+
406
+ # Get detailed metadata using Selenium
407
+ detail_soup = self.get_page_with_selenium(
408
+ detail_url,
409
+ wait_for_element="table",
410
+ timeout=10
411
+ )
412
+
413
+ if detail_soup:
414
+ metadata = self.extract_metadata_from_table(detail_soup)
415
+
416
+ combined_data = {
417
+ 'copyist_id': copyist_id,
418
+ 'detail_url': detail_url,
419
+ 'scrape_order': i,
420
+ 'scrape_timestamp': datetime.now().isoformat(),
421
+ **metadata
422
+ }
423
+
424
+ all_metadata.append(combined_data)
425
+ successful_scrapes += 1
426
+ else:
427
+ failed_scrapes += 1
428
+ if progress_callback:
429
+ progress_callback(f"Failed to fetch data for copyist ID {copyist_id}")
430
+
431
+ # Progress update
432
+ if i % 50 == 0 and progress_callback:
433
+ progress_callback(f"Progress: {i}/{total_ids} processed. Success: {successful_scrapes}, Failed: {failed_scrapes}")
434
+
435
+ # Delay between requests
436
+ if delay > 0:
437
+ time.sleep(delay)
438
+
439
+ df = pd.DataFrame(all_metadata)
440
+ success_msg = f"Successfully scraped {successful_scrapes} copyist records. Failed: {failed_scrapes}. Total discovered: {total_ids}"
441
+ return df, success_msg
442
+
443
+ except Exception as e:
444
+ return pd.DataFrame(), f"Error during scraping: {str(e)}"
445
+
446
+ def cleanup(self):
447
+ """Clean up resources"""
448
+ if self.driver:
449
+ self.driver.quit()
450
+ self.driver = None
451
+
452
+ def __del__(self):
453
+ """Destructor to ensure cleanup"""
454
+ self.cleanup()
455
+
456
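The class above can also be exercised outside the Gradio UI for a quick check. The sketch below uses only names defined in this diff (ManusCopistaSeleniumScraper, scrape_all_copyists_with_progress, cleanup) and assumes it lives next to the class in app.py; the delay, sample size, and output filename are assumptions.

# Hypothetical standalone run of the new Selenium scraper on a small sample.
if __name__ == "__main__":
    scraper = ManusCopistaSeleniumScraper()
    try:
        df, status = scraper.scrape_all_copyists_with_progress(
            delay=1.0,               # assumed polite delay between requests
            max_entries=5,           # assumed small sample for a quick check
            progress_callback=print, # stream progress messages to the console
        )
        print(status)
        if not df.empty:
            df.to_csv("copyists_sample.csv", index=False)  # assumed output path
    finally:
        scraper.cleanup()            # always release the Chrome driver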
 
457
  class ManusCopistaMetadataScraper:
458
  def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
 
511
  progress_callback(f"Found {len(all_ids)} copyist IDs.")
512
 
513
  return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)
 
514
 
515
  def extract_copyist_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
516
+ """Extract copyist IDs from the table with id 'authorities-results-content'"""
517
  ids = set()
518
+
519
  if not soup:
520
  return []
521
+
522
+ table_body = soup.find('tbody', id='authorities-results-content')
523
+ if not table_body:
524
+ return []
525
+
526
+ links = table_body.find_all('a', href=True)
527
  for link in links:
528
+ href = link['href']
529
+ if 'detail/' in href:
530
+ match = re.search(r'detail/(\d+)', href)
531
+ if match:
532
+ ids.add(match.group(1))
533
+
534
  return list(ids)
535
+
536
+ def extract_copyist_id_from_url(self, url: str) -> Optional[str]:
537
+ """Extract copyist ID from a URL"""
538
+ patterns = [
539
+ r'/manus-authorities/detail/(\d+)',
540
+ r'copisti2.*?detail/(\d+)',
541
+ r'/detail/(\d+)',
542
+ r'authorities.*?(\d{5,7})',
543
+ r'copista.*?(\d{5,7})'
544
+ ]
545
 
546
+ for pattern in patterns:
547
+ match = re.search(pattern, url, re.IGNORECASE)
548
+ if match:
549
+ return match.group(1)
550
+
551
+ return None
552
+
553
  def is_potential_copyist_id(self, id_str: str) -> bool:
554
  """Check if a string looks like a copyist ID"""
555
  if not id_str or not id_str.isdigit():
 
565
  return 100000 <= id_num <= 999999
566
  except ValueError:
567
  return False
 
568
 
569
  def is_valid_copyist_id(self, id_str: str) -> bool:
570
  """Check if an ID corresponds to a valid copyist page"""
 
740
  if not title_cell:
741
  continue
742
 
 
743
  title_div = title_cell.find('div', class_='table-title-item')
744
  if not title_div:
745
  continue
746
 
747
  field_name = title_div.get_text(strip=True)
748
 
749
+ # Get the data cell (should be the second td in the row)
750
  data_cells = row.find_all('td')
751
+ if len(data_cells) < 2:
 
752
  continue
753
 
754
+ data_cell = data_cells[1]
755
+
756
+ # Extract data based on cell type
757
  self.extract_cell_data(field_name, data_cell, metadata)
758
 
759
  except (AttributeError, IndexError):
 
762
  return metadata
763
 
764
  def extract_cell_data(self, field_name: str, data_cell, metadata: Dict):
765
+ """Extract data from table cells based on their class structure"""
766
  try:
767
  cell_classes = data_cell.get('class', [])
768
 
769
+ # Handle text cells
770
  if 'table-text' in cell_classes:
771
  text_item = data_cell.find('div', class_='table-text-item')
772
  if text_item:
773
  value = text_item.get_text(strip=True)
774
  self.map_field_value(field_name, value, metadata)
775
 
776
+ # Handle link cells
777
  elif 'table-link' in cell_classes:
778
  text_item = data_cell.find('div', class_='table-text-item')
779
  if text_item:
 
783
  link_url = link.get('href', '')
784
  self.map_field_link(field_name, link_text, link_url, metadata)
785
  else:
786
+ # No link, just text
787
  value = text_item.get_text(strip=True)
788
  self.map_field_value(field_name, value, metadata)
789
 
790
+ # Handle list cells
791
  elif 'table-list' in cell_classes:
792
  values = []
793
+
794
+ # Look for list containers
795
  list_containers = data_cell.find_all('div', class_='table-list-item')
796
 
797
  if list_containers:
 
798
  for container in list_containers:
799
  text_items = container.find_all('div', class_='table-text-item')
800
  for item in text_items:
 
814
  except AttributeError:
815
  continue
816
  else:
817
+ # Fallback: look for text items directly
818
  text_items = data_cell.find_all('div', class_='table-text-item')
819
  for item in text_items:
820
  try:
 
833
  except AttributeError:
834
  continue
835
 
 
836
  self.map_field_list(field_name, values, metadata)
837
 
838
+ # Handle HTML text cells
839
  elif 'table-text-html' in cell_classes:
840
  text_item = data_cell.find('div', class_='table-text-item')
841
  if text_item:
842
+ # Clean HTML and get text
 
843
  value = ' '.join(text_item.get_text(strip=True).split())
844
  self.map_field_value(field_name, value, metadata)
845
 
 
847
  pass
848
 
849
  def map_field_value(self, field_name: str, value: str, metadata: Dict):
850
+ """Map field values to the appropriate metadata keys"""
851
  field_mapping = {
852
  'CNMN code': 'cnmn_code',
853
  'Date of creation': 'date_of_creation',
 
861
  metadata[mapped_key] = value
862
 
863
  def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
864
+ """Map field links to metadata"""
865
  if field_name == 'VID SBN':
866
  metadata['vid_sbn'] = link_text
867
  metadata['vid_sbn_url'] = link_url
 
870
  metadata['isni_url'] = link_url
871
 
872
  def map_field_list(self, field_name: str, values: List, metadata: Dict):
873
+ """Map field lists to metadata"""
 
874
  joined_values = '; '.join(str(v) for v in values if v)
875
 
876
  if field_name == 'Other identifiers':
 
880
  elif field_name == 'Names in manuscript':
881
  metadata['names_in_manuscript'] = joined_values
882
 
883
+ def scrape_copyist_by_id(self, copyist_id: str) -> Dict:
884
+ """Scrape a single copyist by ID"""
885
+ detail_url = f"{self.detail_base_url}{copyist_id}"
886
+
887
+ # Get the detail page
888
+ detail_soup = self.get_page_content(detail_url)
889
+ if not detail_soup:
890
+ return {'error': f'Could not fetch data for copyist ID {copyist_id}'}
891
+
892
+ # Extract metadata
893
+ metadata = self.extract_metadata_from_table(detail_soup)
894
+
895
+ # Add basic info
896
+ metadata['copyist_id'] = copyist_id
897
+ metadata['detail_url'] = detail_url
898
+ metadata['scrape_timestamp'] = datetime.now().isoformat()
899
+
900
+ return metadata
901
+
902
+ def scrape_multiple_copyists(self, copyist_ids: List[str], delay: float = 1.0, progress_callback=None) -> pd.DataFrame:
903
+ """Scrape multiple copyists by their IDs"""
904
+ all_metadata = []
905
+
906
+ for i, copyist_id in enumerate(copyist_ids, 1):
907
+ if progress_callback:
908
+ progress_callback(f"Processing {i}/{len(copyist_ids)}: Copyist ID {copyist_id}")
909
+
910
+ metadata = self.scrape_copyist_by_id(copyist_id)
911
+
912
+ if 'error' not in metadata:
913
+ metadata['scrape_order'] = i
914
+ all_metadata.append(metadata)
915
+ else:
916
+ if progress_callback:
917
+ progress_callback(f"Failed to scrape copyist ID {copyist_id}: {metadata['error']}")
918
+
919
+ # Delay between requests
920
+ if delay > 0:
921
+ time.sleep(delay)
922
+
923
+ return pd.DataFrame(all_metadata)
924
+
925
  def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
926
+ """Scrape all copyists with progress updates"""
927
  try:
928
  # Discover all copyist IDs
929
  copyist_ids = self.discover_all_copyist_ids(progress_callback)
 
934
  if progress_callback:
935
  progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
936
 
937
+ # Limit entries if specified
938
  if max_entries and max_entries > 0:
939
  copyist_ids = copyist_ids[:max_entries]
940
  if progress_callback:
941
  progress_callback(f"Limited to first {max_entries} entries for testing")
942
 
943
+ # Scrape the copyists
944
+ df = self.scrape_multiple_copyists(copyist_ids, delay, progress_callback)
 
 
 
945
 
946
+ success_msg = f"Successfully scraped {len(df)} copyist records out of {len(copyist_ids)} discovered IDs"
 
947
  return df, success_msg
948
 
949
  except Exception as e:
950
  return pd.DataFrame(), f"Error during scraping: {str(e)}"
951
 
952
 
953
+ # Gradio Interface Functions
954
+ def create_gradio_interface():
955
+ """Create and return the Gradio interface"""
 
956
 
957
+ def run_scraper_selenium(delay, max_entries, progress=gr.Progress()):
958
+ """Run the Selenium scraper with progress updates"""
959
+ scraper = None
960
+ try:
961
+ def update_progress(message):
962
+ progress(message)
 
963
 
964
+ scraper = ManusCopistaSeleniumScraper()
965
+ df, status = scraper.scrape_all_copyists_with_progress(
966
+ delay=delay,
967
+ max_entries=max_entries if max_entries > 0 else None,
968
+ progress_callback=update_progress
969
  )
970
 
971
+ if df.empty:
972
+ return None, f"No data scraped. Status: {status}"
973
 
974
+ # Create CSV output
975
+ csv_output = io.StringIO()
976
+ df.to_csv(csv_output, index=False)
977
+ csv_content = csv_output.getvalue()
978
+
979
+ return csv_content, f"Success! {status}"
980
+
981
+ except Exception as e:
982
+ return None, f"Error: {str(e)}"
983
+ finally:
984
+ if scraper:
985
+ scraper.cleanup()
986
+
987
+ def run_scraper_requests(delay, max_entries, progress=gr.Progress()):
988
+ """Run the requests-based scraper with progress updates"""
989
+ try:
990
+ def update_progress(message):
991
+ progress(message)
992
+
993
+ scraper = ManusCopistaMetadataScraper()
994
+ df, status = scraper.scrape_all_copyists_with_progress(
995
+ delay=delay,
996
+ max_entries=max_entries if max_entries > 0 else None,
997
+ progress_callback=update_progress
998
  )
999
+
1000
+ if df.empty:
1001
+ return None, f"No data scraped. Status: {status}"
1002
+
1003
+ # Create CSV output
1004
+ csv_output = io.StringIO()
1005
+ df.to_csv(csv_output, index=False)
1006
+ csv_content = csv_output.getvalue()
1007
+
1008
+ return csv_content, f"Success! {status}"
1009
+
1010
+ except Exception as e:
1011
+ return None, f"Error: {str(e)}"
1012
+
1013
+ def test_discovery(progress=gr.Progress()):
1014
+ """Test the discovery method"""
1015
+ try:
1016
+ def update_progress(message):
1017
+ progress(message)
1018
+
1019
+ scraper = ManusCopistaMetadataScraper()
1020
+ results = scraper.test_discovery_method(progress_callback=update_progress)
1021
+
1022
+ return json.dumps(results, indent=2), "Discovery test completed"
1023
+
1024
+ except Exception as e:
1025
+ return None, f"Error: {str(e)}"
1026
 
1027
+ with gr.Blocks(title="Manus Copista Scraper") as interface:
1028
+ gr.Markdown("# Manus Copista Metadata Scraper")
1029
+ gr.Markdown("Scrape copyist metadata from the Manus database using either Selenium or requests.")
1030
+
1031
+ with gr.Tab("Selenium Scraper (Recommended)"):
1032
+ gr.Markdown("### Selenium-based scraper (handles JavaScript)")
1033
+
1034
+ with gr.Row():
1035
+ selenium_delay = gr.Number(label="Delay between requests (seconds)", value=1.0, minimum=0.1)
1036
+ selenium_max_entries = gr.Number(label="Max entries (0 = all)", value=0, minimum=0)
1037
+
1038
+ selenium_run_btn = gr.Button("Run Selenium Scraper", variant="primary")
1039
+ selenium_status = gr.Textbox(label="Status", lines=3)
1040
+ selenium_output = gr.File(label="Download CSV")
1041
+
1042
+ selenium_run_btn.click(
1043
+ run_scraper_selenium,
1044
+ inputs=[selenium_delay, selenium_max_entries],
1045
+ outputs=[selenium_output, selenium_status]
1046
+ )
1047
+
1048
+ with gr.Tab("Requests Scraper"):
1049
+ gr.Markdown("### Requests-based scraper (faster, may miss JavaScript content)")
1050
+
1051
+ with gr.Row():
1052
+ requests_delay = gr.Number(label="Delay between requests (seconds)", value=1.0, minimum=0.1)
1053
+ requests_max_entries = gr.Number(label="Max entries (0 = all)", value=0, minimum=0)
1054
+
1055
+ requests_run_btn = gr.Button("Run Requests Scraper", variant="primary")
1056
+ requests_status = gr.Textbox(label="Status", lines=3)
1057
+ requests_output = gr.File(label="Download CSV")
1058
+
1059
+ requests_run_btn.click(
1060
+ run_scraper_requests,
1061
+ inputs=[requests_delay, requests_max_entries],
1062
+ outputs=[requests_output, requests_status]
1063
  )
1064
+
1065
+ with gr.Tab("Discovery Test"):
1066
+ gr.Markdown("### Test the ID discovery process")
1067
+
1068
+ test_btn = gr.Button("Test Discovery Method", variant="secondary")
1069
+ test_status = gr.Textbox(label="Status", lines=2)
1070
+ test_output = gr.Textbox(label="Test Results", lines=20)
1071
 
1072
+ test_btn.click(
1073
+ test_discovery,
1074
+ outputs=[test_output, test_status]
 
 
 
1075
  )
1076
+
1077
+ gr.Markdown("---")
1078
+ gr.Markdown("**Note:** The Selenium scraper is recommended as it can handle JavaScript content. The requests scraper is faster but may miss some data.")
 
 
 
 
1079
 
1080
+ return interface
1081
+
 
1082
 
1083
+ # Main execution
1084
  if __name__ == "__main__":
1085
+ # Create and launch the interface
1086
+ interface = create_gradio_interface()
1087
+ interface.launch(
1088
+ server_name="0.0.0.0",
1089
+ server_port=7860,
1090
+ share=False,
1091
+ debug=True
1092
+ )
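One practical note on the new Gradio wiring: run_scraper_selenium and run_scraper_requests return the CSV text itself, while selenium_output and requests_output are gr.File components, which are normally fed a file path. A minimal adaptation is sketched below, assuming a temporary file is acceptable on the Space; the helper name is hypothetical and not part of this commit.

# Hypothetical helper: persist the scraped DataFrame and return a path for gr.File.
import tempfile
from datetime import datetime

import pandas as pd

def dataframe_to_download(df: pd.DataFrame) -> str:
    """Write the records to a temporary CSV file and return its path."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    tmp = tempfile.NamedTemporaryFile(
        mode="w", suffix=f"_manus_copyists_{timestamp}.csv", delete=False
    )
    tmp.close()                        # close the handle before pandas writes to it
    df.to_csv(tmp.name, index=False)   # gr.File can then serve this path for download
    return tmp.name

The run_scraper_* helpers would then return dataframe_to_download(df) instead of the raw CSV string.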