Update app.py
app.py
CHANGED
@@ -9,7 +9,450 @@ from typing import Dict, List, Optional
   9    import json
  10    import io
  11    from datetime import datetime
  12 -  import
  13
  14    class ManusCopistaMetadataScraper:
  15        def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
@@ -68,39 +511,45 @@ class ManusCopistaMetadataScraper:
  68                progress_callback(f"Found {len(all_ids)} copyist IDs.")
  69
  70            return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)
  71 -
  72
  73        def extract_copyist_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
  74 -          """Extract copyist IDs from
  75            ids = set()
  76 -
  77            if not soup:
  78                return []
  79 -
  80 -
  81 -
  82 -
  83            for link in links:
  84 -              href = link
  85 -
  86 -
  87 -
  88 -
  89 -
  90 -
  91 -          # Also check for JavaScript-generated content or data attributes
  92 -          # Look for script tags that might contain copyist IDs
  93 -          scripts = soup.find_all('script')
  94 -          for script in scripts:
  95 -              script_text = script.string if script.string else ''
  96 -              # Look for ID patterns in JavaScript
  97 -              id_matches = re.findall(r'\b\d{5,7}\b', script_text)
  98 -              for match in id_matches:
  99 -                  if self.is_potential_copyist_id(match):
 100 -                      ids.add(match)
 101 -
 102            return list(ids)
 103
 104        def is_potential_copyist_id(self, id_str: str) -> bool:
 105            """Check if a string looks like a copyist ID"""
 106            if not id_str or not id_str.isdigit():
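The deleted fallback above scanned inline script tags for 5-7 digit numbers and kept only those passing is_potential_copyist_id. As a standalone illustration of that filter, using the 100000-999999 range visible at old line 116 and a made-up script string:

    import re

    def is_potential_copyist_id(id_str: str) -> bool:
        # Mirror of the scraper's check: purely numeric, in the six-digit range.
        if not id_str or not id_str.isdigit():
            return False
        return 100000 <= int(id_str) <= 999999

    # Hypothetical inline script content; the removed code scanned real <script> tags.
    script_text = "var ids = [183323, 999, 154985];"
    candidates = re.findall(r'\b\d{5,7}\b', script_text)
    print([c for c in candidates if is_potential_copyist_id(c)])  # ['183323', '154985']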
@@ -116,28 +565,6 @@ class ManusCopistaMetadataScraper:
 116            return 100000 <= id_num <= 999999
 117        except ValueError:
 118            return False
 119 -
 120 -      def extract_copyist_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
 121 -          """Extract copyist IDs from the table with id 'authorities-results-content'"""
 122 -          ids = set()
 123 -
 124 -          if not soup:
 125 -              return []
 126 -
 127 -          table_body = soup.find('tbody', id='authorities-results-content')
 128 -          if not table_body:
 129 -              return []
 130 -
 131 -          links = table_body.find_all('a', href=True)
 132 -          for link in links:
 133 -              href = link['href']
 134 -              if 'detail/' in href:
 135 -                  match = re.search(r'detail/(\d+)', href)
 136 -                  if match:
 137 -                      ids.add(match.group(1))
 138 -
 139 -          return list(ids)
 140 -
 141
 142        def is_valid_copyist_id(self, id_str: str) -> bool:
 143            """Check if an ID corresponds to a valid copyist page"""
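This hunk removes what was a duplicate definition of extract_copyist_ids_from_page; the surviving version (new lines 515-534 below) keeps the same anchor-based extraction. A self-contained sketch of that pattern against a hypothetical fragment of the results table:

    import re
    from bs4 import BeautifulSoup

    # Hypothetical markup mirroring the browse page's results table.
    html = """
    <tbody id="authorities-results-content">
      <tr><td><a href="/copisti2/-/manus-authorities/detail/183323">Copyist A</a></td></tr>
      <tr><td><a href="/copisti2/-/manus-authorities/detail/154985">Copyist B</a></td></tr>
    </tbody>
    """

    soup = BeautifulSoup(html, "html.parser")
    ids = set()
    table_body = soup.find("tbody", id="authorities-results-content")
    for link in table_body.find_all("a", href=True):
        match = re.search(r"detail/(\d+)", link["href"])
        if match:
            ids.add(match.group(1))
    print(sorted(ids))  # ['154985', '183323']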
@@ -313,20 +740,20 @@ class ManusCopistaMetadataScraper:
 313                if not title_cell:
 314                    continue
 315
 316 -              # Get the field name
 317                title_div = title_cell.find('div', class_='table-title-item')
 318                if not title_div:
 319                    continue
 320
 321                field_name = title_div.get_text(strip=True)
 322
 323 -              # Get the data cell
 324                data_cells = row.find_all('td')
 325 -
 326 -              if not data_cell:
 327                    continue
 328
 329 -
 330                self.extract_cell_data(field_name, data_cell, metadata)
 331
 332            except (AttributeError, IndexError):
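The edit in this hunk replaces the old data_cell lookup with an explicit length guard before indexing data_cells[1] (new lines 749-754 below). A minimal illustration with a hypothetical table row:

    from bs4 import BeautifulSoup

    # Hypothetical row: first td holds the field name, second td the value.
    row_html = ("<tr><td class='table-title'><div class='table-title-item'>CNMN code</div></td>"
                "<td class='table-text'><div class='table-text-item'>CNMN 000000</div></td></tr>")
    row = BeautifulSoup(row_html, "html.parser").find("tr")

    data_cells = row.find_all("td")
    if len(data_cells) < 2:          # the new guard: skip malformed rows
        data_cell = None
    else:
        data_cell = data_cells[1]    # the second td carries the field's value
    print(data_cell.get_text(strip=True) if data_cell else "row skipped")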
@@ -335,16 +762,18 @@ class ManusCopistaMetadataScraper:
 335        return metadata
 336
 337    def extract_cell_data(self, field_name: str, data_cell, metadata: Dict):
 338 -      """Extract data from
 339        try:
 340            cell_classes = data_cell.get('class', [])
 341
 342            if 'table-text' in cell_classes:
 343                text_item = data_cell.find('div', class_='table-text-item')
 344                if text_item:
 345                    value = text_item.get_text(strip=True)
 346                    self.map_field_value(field_name, value, metadata)
 347
 348            elif 'table-link' in cell_classes:
 349                text_item = data_cell.find('div', class_='table-text-item')
 350                if text_item:
@@ -354,16 +783,18 @@ class ManusCopistaMetadataScraper:
 354                        link_url = link.get('href', '')
 355                        self.map_field_link(field_name, link_text, link_url, metadata)
 356                    else:
 357                        value = text_item.get_text(strip=True)
 358                        self.map_field_value(field_name, value, metadata)
 359
 360            elif 'table-list' in cell_classes:
 361                values = []
 362 -
 363                list_containers = data_cell.find_all('div', class_='table-list-item')
 364
 365                if list_containers:
 366 -                  # Process each list container
 367                    for container in list_containers:
 368                        text_items = container.find_all('div', class_='table-text-item')
 369                        for item in text_items:
@@ -383,7 +814,7 @@ class ManusCopistaMetadataScraper:
 383                            except AttributeError:
 384                                continue
 385                else:
 386 -                  # Fallback: look for
 387                    text_items = data_cell.find_all('div', class_='table-text-item')
 388                    for item in text_items:
 389                        try:
@@ -402,14 +833,13 @@ class ManusCopistaMetadataScraper:
 402                        except AttributeError:
 403                            continue
 404
 405 -              # Join all values with semicolon separator for single-row CSV
 406                self.map_field_list(field_name, values, metadata)
 407
 408            elif 'table-text-html' in cell_classes:
 409                text_item = data_cell.find('div', class_='table-text-item')
 410                if text_item:
 411 -                  #
 412 -                  # Clean up the text and remove extra whitespace
 413                    value = ' '.join(text_item.get_text(strip=True).split())
 414                    self.map_field_value(field_name, value, metadata)
 415
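For 'table-text-html' cells the extracted text is collapsed to single spaces so embedded markup and line breaks do not leak into the CSV. The idiom, on a made-up string:

    raw_text = "Scribe active\n   in   northern Italy"
    value = ' '.join(raw_text.split())  # collapse every run of whitespace to one space
    print(value)  # Scribe active in northern Italy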
@@ -417,7 +847,7 @@ class ManusCopistaMetadataScraper:
 417            pass
 418
 419    def map_field_value(self, field_name: str, value: str, metadata: Dict):
 420 -      """Map field
 421        field_mapping = {
 422            'CNMN code': 'cnmn_code',
 423            'Date of creation': 'date_of_creation',
@@ -431,7 +861,7 @@ class ManusCopistaMetadataScraper:
 431            metadata[mapped_key] = value
 432
 433    def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
 434 -      """Map field
 435        if field_name == 'VID SBN':
 436            metadata['vid_sbn'] = link_text
 437            metadata['vid_sbn_url'] = link_url
@@ -440,8 +870,7 @@ class ManusCopistaMetadataScraper:
 440            metadata['isni_url'] = link_url
 441
 442    def map_field_list(self, field_name: str, values: List, metadata: Dict):
 443 -      """Map field
 444 -      # Join multiple values with semicolon separator
 445        joined_values = '; '.join(str(v) for v in values if v)
 446
 447        if field_name == 'Other identifiers':
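map_field_list joins multi-value fields with '; ' so each copyist still occupies a single CSV row, dropping empty entries. For example, with hypothetical identifier strings:

    values = ["SBN V000000 (https://example.org/v0)", "", "ISNI 0000 0000 0000 0000"]
    joined_values = '; '.join(str(v) for v in values if v)
    print(joined_values)
    # SBN V000000 (https://example.org/v0); ISNI 0000 0000 0000 0000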
@@ -451,8 +880,50 @@ class ManusCopistaMetadataScraper:
 451        elif field_name == 'Names in manuscript':
 452            metadata['names_in_manuscript'] = joined_values
 453
 454    def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
 455 -      """Scrape all
 456        try:
 457            # Discover all copyist IDs
 458            copyist_ids = self.discover_all_copyist_ids(progress_callback)
@@ -463,312 +934,159 @@ class ManusCopistaMetadataScraper:
 463            if progress_callback:
 464                progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
 465
 466 -          # Limit entries if specified
 467            if max_entries and max_entries > 0:
 468                copyist_ids = copyist_ids[:max_entries]
 469                if progress_callback:
 470                    progress_callback(f"Limited to first {max_entries} entries for testing")
 471
 472 -          #
 473 -
 474 -          total_ids = len(copyist_ids)
 475 -          successful_scrapes = 0
 476 -          failed_scrapes = 0
 477
 478 -
 479 -              if progress_callback:
 480 -                  progress_callback(f"Processing {i}/{total_ids}: Copyist ID {copyist_id}")
 481 -
 482 -              # Construct detail URL
 483 -              detail_url = f"{self.detail_base_url}{copyist_id}"
 484 -
 485 -              # Get detailed metadata
 486 -              detail_soup = self.get_page_content(detail_url)
 487 -
 488 -              if detail_soup:
 489 -                  metadata = self.extract_metadata_from_table(detail_soup)
 490 -
 491 -                  # Combine with basic info
 492 -                  combined_data = {
 493 -                      'copyist_id': copyist_id,
 494 -                      'detail_url': detail_url,
 495 -                      'scrape_order': i,
 496 -                      'scrape_timestamp': datetime.now().isoformat(),
 497 -                      **metadata
 498 -                  }
 499 -
 500 -                  all_metadata.append(combined_data)
 501 -                  successful_scrapes += 1
 502 -              else:
 503 -                  failed_scrapes += 1
 504 -                  if progress_callback:
 505 -                      progress_callback(f"Failed to fetch data for copyist ID {copyist_id}")
 506 -
 507 -              # Progress update every 50 records
 508 -              if i % 50 == 0 and progress_callback:
 509 -                  progress_callback(f"Progress: {i}/{total_ids} processed. Success: {successful_scrapes}, Failed: {failed_scrapes}")
 510 -
 511 -              # Be respectful with delays
 512 -              if delay > 0:
 513 -                  time.sleep(delay)
 514 -
 515 -          df = pd.DataFrame(all_metadata)
 516 -          success_msg = f"Successfully scraped {successful_scrapes} copyist records. Failed: {failed_scrapes}. Total discovered: {total_ids}"
 517            return df, success_msg
 518
 519        except Exception as e:
 520            return pd.DataFrame(), f"Error during scraping: {str(e)}"
 521
 522
 523 -
 524 -
 525 -
 526 -
 527 -      test_urls = [
 528 -          "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/183323",
 529 -          "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/154985",
 530 -          "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/177035"
 531 -      ]
 532 -
 533 -      print("Testing URL pattern extraction:")
 534 -      print("=" * 50)
 535 -
 536 -      for url in test_urls:
 537 -          extracted_id = scraper.extract_copyist_id_from_url(url)
 538 -          print(f"URL: {url}")
 539 -          print(f"Extracted ID: {extracted_id}")
 540 -          print(f"Match: {'✓' if extracted_id else '✗'}")
 541 -          print("-" * 30)
 542 -
 543 -      # Test each pattern individually
 544 -      print("\nTesting individual patterns:")
 545 -      print("=" * 50)
 546 -
 547 -      test_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/183323"
 548 -      patterns = [
 549 -          r'/manus-authorities/detail/(\d+)',
 550 -          r'copisti2.*?detail/(\d+)',
 551 -          r'/detail/(\d+)',
 552 -          r'authorities.*?(\d{5,7})',
 553 -          r'copista.*?(\d{5,7})'
 554 -      ]
 555 -
 556 -      for i, pattern in enumerate(patterns, 1):
 557 -          match = re.search(pattern, test_url, re.IGNORECASE)
 558 -          if match:
 559 -              print(f"Pattern {i}: {pattern} → Matches: {match.group(1)}")
 560 -          else:
 561 -              print(f"Pattern {i}: {pattern} → No match")
 562 -
 563 -
 564 -  def test_discovery_interface():
 565 -      """Test the discovery method"""
 566 -      scraper = ManusCopistaMetadataScraper()
 567 -
 568 -      progress_updates = []
 569 -      def progress_callback(msg):
 570 -          progress_updates.append(msg)
 571 -          print(msg)
 572
 573 -
 574 -
 575 -
 576 -
 577 -
 578 -
 579 -          # Validate inputs
 580 -          if delay < 0.5 or delay > 10:
 581 -              return None, "Please enter a delay between 0.5 and 10 seconds"
 582 -
 583 -          max_entries = None
 584 -          if test_mode:
 585 -              if test_entries < 1 or test_entries > 100:
 586 -                  return None, "Please enter a number between 1 and 100 for test entries"
 587 -              max_entries = int(test_entries)
 588 -
 589 -          scraper = ManusCopistaMetadataScraper()
 590 -
 591 -          # Create progress updates
 592 -          progress_updates = []
 593 -          def progress_callback(msg):
 594 -              progress_updates.append(msg)
 595 -              print(msg) # Also print to console
 596 -
 597 -          df, message = scraper.scrape_all_copyists_with_progress(
 598 -              delay=float(delay),
 599 -              max_entries=max_entries,
 600 -              progress_callback=progress_callback
 601 -          )
 602 -
 603 -          if df.empty:
 604 -              return None, f"Scraping failed: {message}"
 605 -
 606 -          # Select key columns for display
 607 -          display_columns = [
 608 -              'copyist_id', 'copyist_name', 'cnmn_code', 'vid_sbn',
 609 -              'isni_code', 'biographical_note', 'date_of_creation'
 610 -          ]
 611 -
 612 -          # Only include columns that exist
 613 -          available_columns = [col for col in display_columns if col in df.columns]
 614 -          display_df = df[available_columns]
 615 -
 616 -          return display_df, f"{message}. Data ready for download."
 617 -
 618 -      except Exception as e:
 619 -          return None, f"Error: {str(e)}"
 620 -
 621 -  def download_csv(delay, test_mode, test_entries):
 622 -      """Generate CSV file for download"""
 623 -      try:
 624 -          max_entries = None
 625 -          if test_mode:
 626 -              max_entries = int(test_entries)
 627 -
 628 -          scraper = ManusCopistaMetadataScraper()
 629 -          df, message = scraper.scrape_all_copyists_with_progress(
 630 -              delay=float(delay),
 631 -              max_entries=max_entries
 632 -          )
 633 -
 634 -          if df.empty:
 635 -              return None
 636 -
 637 -          # Save to temporary file
 638 -          timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 639 -          filename = f"manus_copyists_complete_{timestamp}.csv"
 640 -          df.to_csv(filename, index=False)
 641 -
 642 -          return filename
 643 -
 644 -      except Exception as e:
 645 -          return None
 646 -
 647 -  # Create Gradio interface
 648 -  with gr.Blocks(title="Manus Copista Complete Scraper", theme=gr.themes.Soft()) as demo:
 649 -      gr.Markdown(
 650 -          """
 651 -          # 📜 Manus Copista Complete Metadata Scraper
 652 -
 653 -          This tool discovers and scrapes metadata for ALL available medieval copyists from the Manus database.
 654 -
 655 -          **Updated for Single-Row CSV Output:**
 656 -          - All table data is properly extracted into single CSV rows
 657 -          - List fields (identifiers, sources, names) are joined with semicolons
 658 -          - Links are preserved with format: "text (url)"
 659 -          - HTML content is cleaned and formatted properly
 660 -
 661 -          **What it does:**
 662 -          1. Automatically discovers all copyist IDs from the database
 663 -          2. Scrapes detailed metadata from each copyist's detail page
 664 -          3. Exports complete dataset as CSV with all content in single rows
 665 -          """
 666 -      )
 667 -
 668 -      with gr.Row():
 669 -          with gr.Column():
 670 -              test_discovery_btn = gr.Button("🔍 Test Discovery Method", variant="secondary")
 671 -
 672 -              delay_input = gr.Number(
 673 -                  label="Delay Between Requests (seconds)",
 674 -                  value=2.0,
 675 -                  minimum=0.5,
 676 -                  maximum=10.0,
 677 -                  step=0.1,
 678 -                  info="Delay between requests to be respectful to the server"
 679 -              )
 680
 681 -
 682 -
 683 -
 684 -
 685                )
 686
 687 -
 688 -
 689 -                  value=10,
 690 -                  minimum=1,
 691 -                  maximum=100,
 692 -                  step=1,
 693 -                  info="Number of records to scrape in test mode",
 694 -                  visible=True
 695 -              )
 696
 697 -
 698 -
 699 -
 700 -
 701 -
 702 -
 703 -
 704 -
 705 -
 706 -
 707 -
 708 -
 709 -
 710 -
 711 -
 712 -
 713 -
 714 -
 715 -
 716                )
 717
 718 -
 719 -
 720 -
 721 -
 722 -
 723 -
 724 -
 725 -
 726 -
 727 -
 728 -
 729 -
 730 -
 731 -
 732 -
 733                )
 734
 735 -
 736 -
 737 -
 738 -                  lines=10,
 739 -                  max_lines=20,
 740 -                  info="Results from testing the discovery method"
 741                )
 742 -
 743 -
 744 -
 745 -                  label="Scraped Data Preview",
 746 -                  interactive=False,
 747 -                  wrap=True
 748 -              )
 749
 750 -
 751 -
 752 -          csv_file = gr.File(label="Download CSV File", visible=False)
 753 -
 754 -      # Event handlers
 755 -      test_discovery_btn.click(
 756 -          fn=test_discovery_interface,
 757 -          inputs=[],
 758 -          outputs=[discovery_output, status_output]
 759 -      )
 760 -
 761 -      scrape_btn.click(
 762 -          fn=scrape_interface,
 763 -          inputs=[delay_input, test_mode, test_entries_input],
 764 -          outputs=[data_output, status_output]
 765 -      )
 766 -
 767 -      download_btn.click(
 768 -          fn=download_csv,
 769 -          inputs=[delay_input, test_mode, test_entries_input],
 770 -          outputs=[csv_file]
 771 -      )
 772
 773    if __name__ == "__main__":
 774 -
   9    import json
  10    import io
  11    from datetime import datetime
  12 +  import os
  13 +
  14 +  # Selenium imports
  15 +  from selenium import webdriver
  16 +  from selenium.webdriver.common.by import By
  17 +  from selenium.webdriver.support.ui import WebDriverWait
  18 +  from selenium.webdriver.support import expected_conditions as EC
  19 +  from selenium.webdriver.chrome.options import Options
  20 +  from selenium.common.exceptions import TimeoutException, WebDriverException
  21 +  from webdriver_manager.chrome import ChromeDriverManager
  22 +
  23 +  class ManusCopistaSeleniumScraper:
  24 +      def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
  25 +          self.base_url = base_url
  26 +          self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
  27 +          self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"
  28 +          self.driver = None
  29 +          self.setup_driver()
  30 +
  31 +      def setup_driver(self):
  32 +          """Setup Chrome driver with appropriate options"""
  33 +          chrome_options = Options()
  34 +          chrome_options.add_argument("--headless")
  35 +          chrome_options.add_argument("--no-sandbox")
  36 +          chrome_options.add_argument("--disable-dev-shm-usage")
  37 +          chrome_options.add_argument("--disable-gpu")
  38 +          chrome_options.add_argument("--window-size=1920,1080")
  39 +          chrome_options.add_argument("--disable-extensions")
  40 +          chrome_options.add_argument("--disable-plugins")
  41 +          chrome_options.add_argument("--disable-images")
  42 +          chrome_options.add_argument("--disable-javascript")
  43 +          chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
  44 +
  45 +          try:
  46 +              # Try to use ChromeDriverManager for automatic driver management
  47 +              service = webdriver.ChromeService(ChromeDriverManager().install())
  48 +              self.driver = webdriver.Chrome(service=service, options=chrome_options)
  49 +          except Exception as e:
  50 +              print(f"Error setting up ChromeDriver with manager: {e}")
  51 +              try:
  52 +                  # Fallback to system Chrome driver
  53 +                  self.driver = webdriver.Chrome(options=chrome_options)
  54 +              except Exception as e2:
  55 +                  print(f"Error setting up system ChromeDriver: {e2}")
  56 +                  raise Exception("Could not initialize Chrome driver")
  57 +
  58 +      def get_page_with_selenium(self, url: str, wait_for_element: str = None, timeout: int = 10) -> Optional[BeautifulSoup]:
  59 +          """Get page content using Selenium to handle JavaScript"""
  60 +          try:
  61 +              self.driver.get(url)
  62 +
  63 +              # Wait for specific element if provided
  64 +              if wait_for_element:
  65 +                  WebDriverWait(self.driver, timeout).until(
  66 +                      EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_element))
  67 +                  )
  68 +              else:
  69 +                  # Default wait for page to load
  70 +                  time.sleep(3)
  71 +
  72 +              # Get page source and parse with BeautifulSoup
  73 +              page_source = self.driver.page_source
  74 +              return BeautifulSoup(page_source, 'html.parser')
  75 +
  76 +          except TimeoutException:
  77 +              print(f"Timeout waiting for element {wait_for_element} on {url}")
  78 +              return None
  79 +          except WebDriverException as e:
  80 +              print(f"WebDriver error on {url}: {e}")
  81 +              return None
  82 +          except Exception as e:
  83 +              print(f"Error fetching {url}: {e}")
  84 +              return None
  85 +
  86 +      def discover_all_copyist_ids(self, progress_callback=None) -> List[str]:
  87 +          """Discover all copyist IDs from the browse page using Selenium"""
  88 +          all_ids = set()
  89 +
  90 +          if progress_callback:
  91 +              progress_callback(f"Fetching copyist list from: {self.browse_url}")
  92 +
  93 +          # Wait for the results table to load
  94 +          soup = self.get_page_with_selenium(
  95 +              self.browse_url,
  96 +              wait_for_element="tbody#authorities-results-content",
  97 +              timeout=15
  98 +          )
  99 +
 100 +          if not soup:
 101 +              if progress_callback:
 102 +                  progress_callback("Failed to fetch the copyist list page.")
 103 +              return []
 104 +
 105 +          # Extract IDs from the table
 106 +          page_ids = self.extract_copyist_ids_from_table(soup)
 107 +          all_ids.update(page_ids)
 108 +
 109 +          if progress_callback:
 110 +              progress_callback(f"Found {len(all_ids)} copyist IDs from main page.")
 111 +
 112 +          # Check for pagination and get additional pages
 113 +          pagination_handled = self.handle_pagination(soup, all_ids, progress_callback)
 114 +
 115 +          if progress_callback:
 116 +              progress_callback(f"Total copyist IDs discovered: {len(all_ids)}")
 117 +
 118 +          return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)
 119 +
 120 +      def extract_copyist_ids_from_table(self, soup: BeautifulSoup) -> List[str]:
 121 +          """Extract copyist IDs from the results table"""
 122 +          ids = set()
 123 +
 124 +          if not soup:
 125 +              return []
 126 +
 127 +          # Look for the specific table body
 128 +          table_body = soup.find('tbody', id='authorities-results-content')
 129 +          if not table_body:
 130 +              # Fallback: look for any table with copyist links
 131 +              table_body = soup.find('tbody')
 132 +
 133 +          if not table_body:
 134 +              return []
 135 +
 136 +          # Find all links in the table
 137 +          links = table_body.find_all('a', href=True)
 138 +          for link in links:
 139 +              href = link['href']
 140 +              if 'detail/' in href:
 141 +                  match = re.search(r'detail/(\d+)', href)
 142 +                  if match:
 143 +                      ids.add(match.group(1))
 144 +
 145 +          return list(ids)
 146 +
 147 +      def handle_pagination(self, soup: BeautifulSoup, all_ids: set, progress_callback=None) -> bool:
 148 +          """Handle pagination to get all copyist IDs"""
 149 +          try:
 150 +              # Look for pagination controls
 151 +              pagination_links = soup.find_all('a', href=True)
 152 +              next_page_found = False
 153 +
 154 +              for link in pagination_links:
 155 +                  link_text = link.get_text(strip=True).lower()
 156 +                  href = link.get('href', '')
 157 +
 158 +                  # Look for "next" or page numbers
 159 +                  if ('next' in link_text or 'seguente' in link_text or
 160 +                      (link_text.isdigit() and int(link_text) > 1)):
 161 +
 162 +                      next_page_found = True
 163 +                      if progress_callback:
 164 +                          progress_callback(f"Found pagination link: {link_text}")
 165 +
 166 +                      # Navigate to next page
 167 +                      full_url = urljoin(self.base_url, href)
 168 +                      next_soup = self.get_page_with_selenium(
 169 +                          full_url,
 170 +                          wait_for_element="tbody#authorities-results-content",
 171 +                          timeout=15
 172 +                      )
 173 +
 174 +                      if next_soup:
 175 +                          new_ids = self.extract_copyist_ids_from_table(next_soup)
 176 +                          all_ids.update(new_ids)
 177 +                          if progress_callback:
 178 +                              progress_callback(f"Added {len(new_ids)} IDs from pagination page")
 179 +
 180 +                          # Recursively handle more pagination
 181 +                          self.handle_pagination(next_soup, all_ids, progress_callback)
 182 +                      break
 183 +
 184 +              return next_page_found
 185 +
 186 +          except Exception as e:
 187 +              if progress_callback:
 188 +                  progress_callback(f"Error handling pagination: {e}")
 189 +              return False
 190 +
 191 +      def extract_metadata_from_table(self, soup: BeautifulSoup) -> Dict:
 192 +          """Extract metadata from the copyist detail page"""
 193 +          metadata = {
 194 +              'cnmn_code': '',
 195 +              'vid_sbn': '',
 196 +              'vid_sbn_url': '',
 197 +              'isni_code': '',
 198 +              'isni_url': '',
 199 +              'other_identifiers': '',
 200 +              'biographical_note': '',
 201 +              'bibliographical_sources': '',
 202 +              'bibliographical_notes': '',
 203 +              'names_in_manuscript': '',
 204 +              'date_of_creation': '',
 205 +              'last_modification': '',
 206 +              'page_title': '',
 207 +              'copyist_name': ''
 208 +          }
 209 +
 210 +          if not soup:
 211 +              return metadata
 212 +
 213 +          # Extract page title
 214 +          title_tag = soup.find('title')
 215 +          if title_tag:
 216 +              metadata['page_title'] = title_tag.get_text(strip=True)
 217 +
 218 +          # Try to extract copyist name
 219 +          name_selectors = [
 220 +              'h1', 'h2', '.title', '.copyist-name',
 221 +              '[class*="name"]', '[class*="title"]'
 222 +          ]
 223 +
 224 +          for selector in name_selectors:
 225 +              element = soup.select_one(selector)
 226 +              if element:
 227 +                  name_text = element.get_text(strip=True)
 228 +                  if name_text and len(name_text) > 2:
 229 +                      metadata['copyist_name'] = name_text
 230 +                      break
 231 +
 232 +          # Find the main data table
 233 +          main_table = soup.find('table', class_=['table', 'table-1', 'table-sm'])
 234 +          if not main_table:
 235 +              main_table = soup.find('table')
 236 +
 237 +          if not main_table:
 238 +              return metadata
 239 +
 240 +          # Process table rows
 241 +          rows = main_table.find_all('tr')
 242 +          for row in rows:
 243 +              try:
 244 +                  title_cell = row.find('td', class_='table-title')
 245 +                  if not title_cell:
 246 +                      continue
 247 +
 248 +                  title_div = title_cell.find('div', class_='table-title-item')
 249 +                  if not title_div:
 250 +                      continue
 251 +
 252 +                  field_name = title_div.get_text(strip=True)
 253 +
 254 +                  data_cells = row.find_all('td')
 255 +                  data_cell = data_cells[1] if len(data_cells) > 1 else None
 256 +                  if not data_cell:
 257 +                      continue
 258 +
 259 +                  self.extract_cell_data(field_name, data_cell, metadata)
 260 +
 261 +              except (AttributeError, IndexError):
 262 +                  continue
 263 +
 264 +          return metadata
 265 +
 266 +      def extract_cell_data(self, field_name: str, data_cell, metadata: Dict):
 267 +          """Extract data from table cells"""
 268 +          try:
 269 +              cell_classes = data_cell.get('class', [])
 270 +
 271 +              if 'table-text' in cell_classes:
 272 +                  text_item = data_cell.find('div', class_='table-text-item')
 273 +                  if text_item:
 274 +                      value = text_item.get_text(strip=True)
 275 +                      self.map_field_value(field_name, value, metadata)
 276 +
 277 +              elif 'table-link' in cell_classes:
 278 +                  text_item = data_cell.find('div', class_='table-text-item')
 279 +                  if text_item:
 280 +                      link = text_item.find('a')
 281 +                      if link:
 282 +                          link_text = link.get_text(strip=True)
 283 +                          link_url = link.get('href', '')
 284 +                          self.map_field_link(field_name, link_text, link_url, metadata)
 285 +                      else:
 286 +                          value = text_item.get_text(strip=True)
 287 +                          self.map_field_value(field_name, value, metadata)
 288 +
 289 +              elif 'table-list' in cell_classes:
 290 +                  values = []
 291 +                  list_containers = data_cell.find_all('div', class_='table-list-item')
 292 +
 293 +                  if list_containers:
 294 +                      for container in list_containers:
 295 +                          text_items = container.find_all('div', class_='table-text-item')
 296 +                          for item in text_items:
 297 +                              try:
 298 +                                  link = item.find('a')
 299 +                                  if link:
 300 +                                      link_text = link.get_text(strip=True)
 301 +                                      link_url = link.get('href', '')
 302 +                                      if link_url:
 303 +                                          values.append(f"{link_text} ({link_url})")
 304 +                                      else:
 305 +                                          values.append(link_text)
 306 +                                  else:
 307 +                                      text = item.get_text(strip=True)
 308 +                                      if text:
 309 +                                          values.append(text)
 310 +                              except AttributeError:
 311 +                                  continue
 312 +                  else:
 313 +                      text_items = data_cell.find_all('div', class_='table-text-item')
 314 +                      for item in text_items:
 315 +                          try:
 316 +                              link = item.find('a')
 317 +                              if link:
 318 +                                  link_text = link.get_text(strip=True)
 319 +                                  link_url = link.get('href', '')
 320 +                                  if link_url:
 321 +                                      values.append(f"{link_text} ({link_url})")
 322 +                                  else:
 323 +                                      values.append(link_text)
 324 +                              else:
 325 +                                  text = item.get_text(strip=True)
 326 +                                  if text:
 327 +                                      values.append(text)
 328 +                          except AttributeError:
 329 +                              continue
 330 +
 331 +                  self.map_field_list(field_name, values, metadata)
 332 +
 333 +              elif 'table-text-html' in cell_classes:
 334 +                  text_item = data_cell.find('div', class_='table-text-item')
 335 +                  if text_item:
 336 +                      value = ' '.join(text_item.get_text(strip=True).split())
 337 +                      self.map_field_value(field_name, value, metadata)
 338 +
 339 +          except (AttributeError, TypeError):
 340 +              pass
 341 +
 342 +      def map_field_value(self, field_name: str, value: str, metadata: Dict):
 343 +          """Map field values to metadata keys"""
 344 +          field_mapping = {
 345 +              'CNMN code': 'cnmn_code',
 346 +              'Date of creation': 'date_of_creation',
 347 +              'Last modification': 'last_modification',
 348 +              'Biographical note': 'biographical_note',
 349 +              'Bibliographical notes': 'bibliographical_notes'
 350 +          }
 351 +
 352 +          mapped_key = field_mapping.get(field_name)
 353 +          if mapped_key and mapped_key in metadata:
 354 +              metadata[mapped_key] = value
 355 +
 356 +      def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
 357 +          """Map field links to metadata"""
 358 +          if field_name == 'VID SBN':
 359 +              metadata['vid_sbn'] = link_text
 360 +              metadata['vid_sbn_url'] = link_url
 361 +          elif field_name == 'Codice ISNI':
 362 +              metadata['isni_code'] = link_text
 363 +              metadata['isni_url'] = link_url
 364 +
 365 +      def map_field_list(self, field_name: str, values: List, metadata: Dict):
 366 +          """Map field lists to metadata"""
 367 +          joined_values = '; '.join(str(v) for v in values if v)
 368 +
 369 +          if field_name == 'Other identifiers':
 370 +              metadata['other_identifiers'] = joined_values
 371 +          elif field_name == 'Bibliographical sources':
 372 +              metadata['bibliographical_sources'] = joined_values
 373 +          elif field_name == 'Names in manuscript':
 374 +              metadata['names_in_manuscript'] = joined_values
 375 +
 376 +      def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
 377 +          """Scrape all copyists with Selenium"""
 378 +          try:
 379 +              # Discover all copyist IDs
 380 +              copyist_ids = self.discover_all_copyist_ids(progress_callback)
 381 +
 382 +              if not copyist_ids:
 383 +                  return pd.DataFrame(), "No copyist IDs found"
 384 +
 385 +              if progress_callback:
 386 +                  progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
 387 +
 388 +              # Limit entries if specified
 389 +              if max_entries and max_entries > 0:
 390 +                  copyist_ids = copyist_ids[:max_entries]
 391 +                  if progress_callback:
 392 +                      progress_callback(f"Limited to first {max_entries} entries for testing")
 393 +
 394 +              # Process each copyist
 395 +              all_metadata = []
 396 +              total_ids = len(copyist_ids)
 397 +              successful_scrapes = 0
 398 +              failed_scrapes = 0
 399 +
 400 +              for i, copyist_id in enumerate(copyist_ids, 1):
 401 +                  if progress_callback:
 402 +                      progress_callback(f"Processing {i}/{total_ids}: Copyist ID {copyist_id}")
 403 +
 404 +                  detail_url = f"{self.detail_base_url}{copyist_id}"
 405 +
 406 +                  # Get detailed metadata using Selenium
 407 +                  detail_soup = self.get_page_with_selenium(
 408 +                      detail_url,
 409 +                      wait_for_element="table",
 410 +                      timeout=10
 411 +                  )
 412 +
 413 +                  if detail_soup:
 414 +                      metadata = self.extract_metadata_from_table(detail_soup)
 415 +
 416 +                      combined_data = {
 417 +                          'copyist_id': copyist_id,
 418 +                          'detail_url': detail_url,
 419 +                          'scrape_order': i,
 420 +                          'scrape_timestamp': datetime.now().isoformat(),
 421 +                          **metadata
 422 +                      }
 423 +
 424 +                      all_metadata.append(combined_data)
 425 +                      successful_scrapes += 1
 426 +                  else:
 427 +                      failed_scrapes += 1
 428 +                      if progress_callback:
 429 +                          progress_callback(f"Failed to fetch data for copyist ID {copyist_id}")
 430 +
 431 +                  # Progress update
 432 +                  if i % 50 == 0 and progress_callback:
 433 +                      progress_callback(f"Progress: {i}/{total_ids} processed. Success: {successful_scrapes}, Failed: {failed_scrapes}")
 434 +
 435 +                  # Delay between requests
 436 +                  if delay > 0:
 437 +                      time.sleep(delay)
 438 +
 439 +              df = pd.DataFrame(all_metadata)
 440 +              success_msg = f"Successfully scraped {successful_scrapes} copyist records. Failed: {failed_scrapes}. Total discovered: {total_ids}"
 441 +              return df, success_msg
 442 +
 443 +          except Exception as e:
 444 +              return pd.DataFrame(), f"Error during scraping: {str(e)}"
 445 +
 446 +      def cleanup(self):
 447 +          """Clean up resources"""
 448 +          if self.driver:
 449 +              self.driver.quit()
 450 +              self.driver = None
 451 +
 452 +      def __del__(self):
 453 +          """Destructor to ensure cleanup"""
 454 +          self.cleanup()
 455 +
 456
 457    class ManusCopistaMetadataScraper:
 458        def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
 511            progress_callback(f"Found {len(all_ids)} copyist IDs.")
 512
 513        return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)
 514
 515    def extract_copyist_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
 516 +      """Extract copyist IDs from the table with id 'authorities-results-content'"""
 517        ids = set()
 518 +
 519        if not soup:
 520            return []
 521 +
 522 +      table_body = soup.find('tbody', id='authorities-results-content')
 523 +      if not table_body:
 524 +          return []
 525 +
 526 +      links = table_body.find_all('a', href=True)
 527        for link in links:
 528 +          href = link['href']
 529 +          if 'detail/' in href:
 530 +              match = re.search(r'detail/(\d+)', href)
 531 +              if match:
 532 +                  ids.add(match.group(1))
 533 +
 534        return list(ids)
 535 +
 536 +  def extract_copyist_id_from_url(self, url: str) -> Optional[str]:
 537 +      """Extract copyist ID from a URL"""
 538 +      patterns = [
 539 +          r'/manus-authorities/detail/(\d+)',
 540 +          r'copisti2.*?detail/(\d+)',
 541 +          r'/detail/(\d+)',
 542 +          r'authorities.*?(\d{5,7})',
 543 +          r'copista.*?(\d{5,7})'
 544 +      ]
 545
 546 +      for pattern in patterns:
 547 +          match = re.search(pattern, url, re.IGNORECASE)
 548 +          if match:
 549 +              return match.group(1)
 550 +
 551 +      return None
 552 +
 553    def is_potential_copyist_id(self, id_str: str) -> bool:
 554        """Check if a string looks like a copyist ID"""
 555        if not id_str or not id_str.isdigit():

 565            return 100000 <= id_num <= 999999
 566        except ValueError:
 567            return False
 568
 569    def is_valid_copyist_id(self, id_str: str) -> bool:
 570        """Check if an ID corresponds to a valid copyist page"""
 740                if not title_cell:
 741                    continue
 742
 743                title_div = title_cell.find('div', class_='table-title-item')
 744                if not title_div:
 745                    continue
 746
 747                field_name = title_div.get_text(strip=True)
 748
 749 +              # Get the data cell (should be the second td in the row)
 750                data_cells = row.find_all('td')
 751 +              if len(data_cells) < 2:
 752                    continue
 753
 754 +              data_cell = data_cells[1]
 755 +
 756 +              # Extract data based on cell type
 757                self.extract_cell_data(field_name, data_cell, metadata)
 758
 759            except (AttributeError, IndexError):

 762        return metadata
 763
 764    def extract_cell_data(self, field_name: str, data_cell, metadata: Dict):
 765 +      """Extract data from table cells based on their class structure"""
 766        try:
 767            cell_classes = data_cell.get('class', [])
 768
 769 +          # Handle text cells
 770            if 'table-text' in cell_classes:
 771                text_item = data_cell.find('div', class_='table-text-item')
 772                if text_item:
 773                    value = text_item.get_text(strip=True)
 774                    self.map_field_value(field_name, value, metadata)
 775
 776 +          # Handle link cells
 777            elif 'table-link' in cell_classes:
 778                text_item = data_cell.find('div', class_='table-text-item')
 779                if text_item:

 783                        link_url = link.get('href', '')
 784                        self.map_field_link(field_name, link_text, link_url, metadata)
 785                    else:
 786 +                      # No link, just text
 787                        value = text_item.get_text(strip=True)
 788                        self.map_field_value(field_name, value, metadata)
 789
 790 +          # Handle list cells
 791            elif 'table-list' in cell_classes:
 792                values = []
 793 +
 794 +              # Look for list containers
 795                list_containers = data_cell.find_all('div', class_='table-list-item')
 796
 797                if list_containers:
 798                    for container in list_containers:
 799                        text_items = container.find_all('div', class_='table-text-item')
 800                        for item in text_items:

 814                            except AttributeError:
 815                                continue
 816                else:
 817 +                  # Fallback: look for text items directly
 818                    text_items = data_cell.find_all('div', class_='table-text-item')
 819                    for item in text_items:
 820                        try:

 833                        except AttributeError:
 834                            continue
 835
 836                self.map_field_list(field_name, values, metadata)
 837
 838 +          # Handle HTML text cells
 839            elif 'table-text-html' in cell_classes:
 840                text_item = data_cell.find('div', class_='table-text-item')
 841                if text_item:
 842 +                  # Clean HTML and get text
 843                    value = ' '.join(text_item.get_text(strip=True).split())
 844                    self.map_field_value(field_name, value, metadata)

 847            pass
 848
 849    def map_field_value(self, field_name: str, value: str, metadata: Dict):
 850 +      """Map field values to the appropriate metadata keys"""
 851        field_mapping = {
 852            'CNMN code': 'cnmn_code',
 853            'Date of creation': 'date_of_creation',

 861            metadata[mapped_key] = value
 862
 863    def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
 864 +      """Map field links to metadata"""
 865        if field_name == 'VID SBN':
 866            metadata['vid_sbn'] = link_text
 867            metadata['vid_sbn_url'] = link_url

 870            metadata['isni_url'] = link_url
 871
 872    def map_field_list(self, field_name: str, values: List, metadata: Dict):
 873 +      """Map field lists to metadata"""
 874        joined_values = '; '.join(str(v) for v in values if v)
 875
 876        if field_name == 'Other identifiers':

 880        elif field_name == 'Names in manuscript':
 881            metadata['names_in_manuscript'] = joined_values
 882
 883 +  def scrape_copyist_by_id(self, copyist_id: str) -> Dict:
 884 +      """Scrape a single copyist by ID"""
 885 +      detail_url = f"{self.detail_base_url}{copyist_id}"
 886 +
 887 +      # Get the detail page
 888 +      detail_soup = self.get_page_content(detail_url)
 889 +      if not detail_soup:
 890 +          return {'error': f'Could not fetch data for copyist ID {copyist_id}'}
 891 +
 892 +      # Extract metadata
 893 +      metadata = self.extract_metadata_from_table(detail_soup)
 894 +
 895 +      # Add basic info
 896 +      metadata['copyist_id'] = copyist_id
 897 +      metadata['detail_url'] = detail_url
 898 +      metadata['scrape_timestamp'] = datetime.now().isoformat()
 899 +
 900 +      return metadata
 901 +
 902 +  def scrape_multiple_copyists(self, copyist_ids: List[str], delay: float = 1.0, progress_callback=None) -> pd.DataFrame:
 903 +      """Scrape multiple copyists by their IDs"""
 904 +      all_metadata = []
 905 +
 906 +      for i, copyist_id in enumerate(copyist_ids, 1):
 907 +          if progress_callback:
 908 +              progress_callback(f"Processing {i}/{len(copyist_ids)}: Copyist ID {copyist_id}")
 909 +
 910 +          metadata = self.scrape_copyist_by_id(copyist_id)
 911 +
 912 +          if 'error' not in metadata:
 913 +              metadata['scrape_order'] = i
 914 +              all_metadata.append(metadata)
 915 +          else:
 916 +              if progress_callback:
 917 +                  progress_callback(f"Failed to scrape copyist ID {copyist_id}: {metadata['error']}")
 918 +
 919 +          # Delay between requests
 920 +          if delay > 0:
 921 +              time.sleep(delay)
 922 +
 923 +      return pd.DataFrame(all_metadata)
 924 +
 925    def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
 926 +      """Scrape all copyists with progress updates"""
 927        try:
 928            # Discover all copyist IDs
 929            copyist_ids = self.discover_all_copyist_ids(progress_callback)

 934            if progress_callback:
 935                progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
 936
 937 +          # Limit entries if specified
 938            if max_entries and max_entries > 0:
 939                copyist_ids = copyist_ids[:max_entries]
 940                if progress_callback:
 941                    progress_callback(f"Limited to first {max_entries} entries for testing")
 942
 943 +          # Scrape the copyists
 944 +          df = self.scrape_multiple_copyists(copyist_ids, delay, progress_callback)
 945
 946 +          success_msg = f"Successfully scraped {len(df)} copyist records out of {len(copyist_ids)} discovered IDs"
 947            return df, success_msg
 948
 949        except Exception as e:
 950            return pd.DataFrame(), f"Error during scraping: {str(e)}"
 951
 952
 953 +  # Gradio Interface Functions
 954 +  def create_gradio_interface():
 955 +      """Create and return the Gradio interface"""
 956
 957 +      def run_scraper_selenium(delay, max_entries, progress=gr.Progress()):
 958 +          """Run the Selenium scraper with progress updates"""
 959 +          scraper = None
 960 +          try:
 961 +              def update_progress(message):
 962 +                  progress(message)
 963
 964 +              scraper = ManusCopistaSeleniumScraper()
 965 +              df, status = scraper.scrape_all_copyists_with_progress(
 966 +                  delay=delay,
 967 +                  max_entries=max_entries if max_entries > 0 else None,
 968 +                  progress_callback=update_progress
 969                )
 970
 971 +              if df.empty:
 972 +                  return None, f"No data scraped. Status: {status}"
 973
 974 +              # Create CSV output
 975 +              csv_output = io.StringIO()
 976 +              df.to_csv(csv_output, index=False)
 977 +              csv_content = csv_output.getvalue()
 978 +
 979 +              return csv_content, f"Success! {status}"
 980 +
 981 +          except Exception as e:
 982 +              return None, f"Error: {str(e)}"
 983 +          finally:
 984 +              if scraper:
 985 +                  scraper.cleanup()
 986 +
 987 +      def run_scraper_requests(delay, max_entries, progress=gr.Progress()):
 988 +          """Run the requests-based scraper with progress updates"""
 989 +          try:
 990 +              def update_progress(message):
 991 +                  progress(message)
 992 +
 993 +              scraper = ManusCopistaMetadataScraper()
 994 +              df, status = scraper.scrape_all_copyists_with_progress(
 995 +                  delay=delay,
 996 +                  max_entries=max_entries if max_entries > 0 else None,
 997 +                  progress_callback=update_progress
 998                )
 999 +
1000 +              if df.empty:
1001 +                  return None, f"No data scraped. Status: {status}"
1002 +
1003 +              # Create CSV output
1004 +              csv_output = io.StringIO()
1005 +              df.to_csv(csv_output, index=False)
1006 +              csv_content = csv_output.getvalue()
1007 +
1008 +              return csv_content, f"Success! {status}"
1009 +
1010 +          except Exception as e:
1011 +              return None, f"Error: {str(e)}"
1012 +
1013 +      def test_discovery(progress=gr.Progress()):
1014 +          """Test the discovery method"""
1015 +          try:
1016 +              def update_progress(message):
1017 +                  progress(message)
1018 +
1019 +              scraper = ManusCopistaMetadataScraper()
1020 +              results = scraper.test_discovery_method(progress_callback=update_progress)
1021 +
1022 +              return json.dumps(results, indent=2), "Discovery test completed"
1023 +
1024 +          except Exception as e:
1025 +              return None, f"Error: {str(e)}"
1026
1027 +      with gr.Blocks(title="Manus Copista Scraper") as interface:
1028 +          gr.Markdown("# Manus Copista Metadata Scraper")
1029 +          gr.Markdown("Scrape copyist metadata from the Manus database using either Selenium or requests.")
1030 +
1031 +          with gr.Tab("Selenium Scraper (Recommended)"):
1032 +              gr.Markdown("### Selenium-based scraper (handles JavaScript)")
1033 +
1034 +              with gr.Row():
1035 +                  selenium_delay = gr.Number(label="Delay between requests (seconds)", value=1.0, minimum=0.1)
1036 +                  selenium_max_entries = gr.Number(label="Max entries (0 = all)", value=0, minimum=0)
1037 +
1038 +              selenium_run_btn = gr.Button("Run Selenium Scraper", variant="primary")
1039 +              selenium_status = gr.Textbox(label="Status", lines=3)
1040 +              selenium_output = gr.File(label="Download CSV")
1041 +
1042 +              selenium_run_btn.click(
1043 +                  run_scraper_selenium,
1044 +                  inputs=[selenium_delay, selenium_max_entries],
1045 +                  outputs=[selenium_output, selenium_status]
1046 +              )
1047 +
1048 +          with gr.Tab("Requests Scraper"):
1049 +              gr.Markdown("### Requests-based scraper (faster, may miss JavaScript content)")
1050 +
1051 +              with gr.Row():
1052 +                  requests_delay = gr.Number(label="Delay between requests (seconds)", value=1.0, minimum=0.1)
1053 +                  requests_max_entries = gr.Number(label="Max entries (0 = all)", value=0, minimum=0)
1054 +
1055 +              requests_run_btn = gr.Button("Run Requests Scraper", variant="primary")
1056 +              requests_status = gr.Textbox(label="Status", lines=3)
1057 +              requests_output = gr.File(label="Download CSV")
1058 +
1059 +              requests_run_btn.click(
1060 +                  run_scraper_requests,
1061 +                  inputs=[requests_delay, requests_max_entries],
1062 +                  outputs=[requests_output, requests_status]
1063                )
1064 +
1065 +          with gr.Tab("Discovery Test"):
1066 +              gr.Markdown("### Test the ID discovery process")
1067 +
1068 +              test_btn = gr.Button("Test Discovery Method", variant="secondary")
1069 +              test_status = gr.Textbox(label="Status", lines=2)
1070 +              test_output = gr.Textbox(label="Test Results", lines=20)
1071
1072 +              test_btn.click(
1073 +                  test_discovery,
1074 +                  outputs=[test_output, test_status]
1075                )
1076 +
1077 +          gr.Markdown("---")
1078 +          gr.Markdown("**Note:** The Selenium scraper is recommended as it can handle JavaScript content. The requests scraper is faster but may miss some data.")
1079
1080 +      return interface
1081 +
1082
1083 +  # Main execution
1084    if __name__ == "__main__":
1085 +      # Create and launch the interface
1086 +      interface = create_gradio_interface()
1087 +      interface.launch(
1088 +          server_name="0.0.0.0",
1089 +          server_port=7860,
1090 +          share=False,
1091 +          debug=True
1092 +      )
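For reference, a minimal sketch of how the new Selenium scraper could be exercised outside the Gradio UI. This is not part of the commit; it assumes the file above is saved as app.py and importable, and that Chrome with a compatible driver is available:

    from app import ManusCopistaSeleniumScraper  # assumption: app.py is importable

    scraper = ManusCopistaSeleniumScraper()
    try:
        # Scrape only the first three copyists, pausing one second between requests.
        df, status = scraper.scrape_all_copyists_with_progress(
            delay=1.0,
            max_entries=3,
            progress_callback=print,
        )
        print(status)
        print(df.head())
    finally:
        scraper.cleanup()  # always release the headless Chrome instance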