Update app.py
app.py CHANGED
@@ -1,1092 +1,334 @@
Removed: the previous app.py (1,092 lines). The deleted version was a Gradio application that bundled two scrapers for the Manus copyist authority records plus a web UI:

- Imports: gradio, requests, BeautifulSoup, pandas, urljoin/urlparse, time, re, typing, json, io, datetime, and the Selenium stack (webdriver, By, WebDriverWait, expected_conditions, Chrome Options, TimeoutException/WebDriverException, webdriver_manager's ChromeDriverManager).
- A Selenium-based scraper class that configured a headless Chrome driver (--headless, --no-sandbox, --disable-dev-shm-usage, --disable-gpu, --window-size=1920,1080, --disable-extensions, --disable-plugins, --disable-images, --disable-javascript, custom user agent), fetched pages via get_page_with_selenium with explicit waits, discovered copyist IDs from the tbody#authorities-results-content table on the browse page, followed "next"/"seguente"/numbered pagination links, extracted per-copyist metadata from the detail table (CNMN code, VID SBN, ISNI code, other identifiers, biographical note, bibliographical sources and notes, names in manuscript, date of creation, last modification, page title, copyist name), and exposed scrape_all_copyists_with_progress plus cleanup/__del__ for driver teardown.
- A requests-based ManusCopistaMetadataScraper class with the same discovery, extraction, and field-mapping logic (discover_all_copyist_ids, extract_copyist_ids_from_page, extract_copyist_id_from_url, ID-format validation, pagination discovery, test_discovery_method, extract_metadata_from_table, extract_cell_data, map_field_value/map_field_link/map_field_list, scrape_copyist_by_id, scrape_multiple_copyists, scrape_all_copyists_with_progress), built on a requests.Session with browser-like headers.
- Gradio glue: run_scraper_selenium, run_scraper_requests, and test_discovery callbacks producing CSV via io.StringIO, a gr.Blocks interface titled "Manus Copista Scraper" with "Selenium Scraper (Recommended)", "Requests Scraper", and "Discovery Test" tabs (delay and max-entries inputs, status textbox, CSV download), and a __main__ block launching the interface on 0.0.0.0:7860 with share=False and debug=True.
Added: the new app.py (334 lines), now a single requests-based scraper with no Gradio UI and no Selenium dependency:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from typing import Dict, List, Optional
import json
from datetime import datetime
import io


class ManusCopistaRequestsScraper:
    def __init__(self):
        self.base_url = "https://manus.iccu.sbn.it"
        self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
        self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"

        # Setup session with proper headers
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        })

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch a page and return BeautifulSoup object"""
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            # Check if we got a proper response
            if response.status_code != 200:
                print(f"Bad status code: {response.status_code}")
                return None

            return BeautifulSoup(response.text, 'html.parser')

        except requests.exceptions.RequestException as e:
            print(f"Request error for {url}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error for {url}: {e}")
            return None

    def discover_copyist_ids(self) -> List[str]:
        """Discover copyist IDs from the browse page"""
        print("Discovering copyist IDs...")

        # Try different approaches to get the data
        urls_to_try = [
            self.browse_url,
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
            "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
            "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100"
        ]

        all_ids = set()

        for url in urls_to_try:
            soup = self.get_page(url)
            if soup:
                ids = self.extract_ids_from_page(soup)
                all_ids.update(ids)
                print(f"Found {len(ids)} IDs from {url}")

                # If we found IDs, try to get more from pagination
                if ids:
                    pagination_ids = self.handle_pagination(soup, url)
                    all_ids.update(pagination_ids)

        # If no IDs found from browse page, try a range-based approach
        if not all_ids:
            print("No IDs found from browse page, trying range-based discovery...")
            all_ids = self.discover_ids_by_range()

        return sorted(list(all_ids))

    def extract_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
        """Extract copyist IDs from a page"""
        ids = set()

        # Look for links that contain detail/ followed by numbers
        links = soup.find_all('a', href=True)
        for link in links:
            href = link.get('href', '')
            match = re.search(r'detail/(\d+)', href)
            if match:
                copyist_id = match.group(1)
                if len(copyist_id) >= 5:  # Valid ID length
                    ids.add(copyist_id)

        # Also look for any numbers that might be IDs in the page
        text = soup.get_text()
        numbers = re.findall(r'\b\d{6,7}\b', text)
        for num in numbers:
            if self.is_valid_id_format(num):
                ids.add(num)

        return list(ids)

    def handle_pagination(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """Handle pagination to get more IDs"""
        all_ids = set()

        # Look for pagination links
        pagination_links = []
        links = soup.find_all('a', href=True)

        for link in links:
            href = link.get('href', '')
            text = link.get_text(strip=True).lower()

            # Look for next page or numbered pages
            if any(word in text for word in ['next', 'seguente', 'page', 'pagina']) or text.isdigit():
                if href and href.startswith('/'):
                    full_url = self.base_url + href
                    pagination_links.append(full_url)

        # Visit pagination pages
        for page_url in pagination_links[:10]:  # Limit to prevent infinite loops
            print(f"Checking pagination page: {page_url}")
            page_soup = self.get_page(page_url)
            if page_soup:
                page_ids = self.extract_ids_from_page(page_soup)
                all_ids.update(page_ids)
            time.sleep(1)  # Be respectful

        return list(all_ids)

    def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999, sample_size: int = 1000) -> List[str]:
        """Discover IDs by testing a range of potential IDs"""
        print(f"Testing range-based discovery with {sample_size} samples...")

        valid_ids = []

        # Test a sample of IDs in the range
        import random
        test_ids = random.sample(range(start_id, end_id), min(sample_size, end_id - start_id))

        for i, test_id in enumerate(test_ids):
            if i % 100 == 0:
                print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")

            if self.test_id_exists(str(test_id)):
                valid_ids.append(str(test_id))

            time.sleep(0.1)  # Small delay

        return valid_ids

    def test_id_exists(self, copyist_id: str) -> bool:
        """Test if a copyist ID exists by making a HEAD request"""
        url = f"{self.detail_base_url}{copyist_id}"
        try:
            response = self.session.head(url, timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def is_valid_id_format(self, id_str: str) -> bool:
        """Check if string looks like a valid copyist ID"""
        if not id_str.isdigit():
            return False
        return 5 <= len(id_str) <= 7

    def scrape_copyist_detail(self, copyist_id: str) -> Dict:
        """Scrape detailed information for a single copyist"""
        url = f"{self.detail_base_url}{copyist_id}"
        soup = self.get_page(url)

        if not soup:
            return {'error': f'Could not fetch page for ID {copyist_id}'}

        # Extract basic info
        data = {
            'copyist_id': copyist_id,
            'detail_url': url,
            'scrape_timestamp': datetime.now().isoformat()
        }

        # Extract title
        title = soup.find('title')
        if title:
            data['page_title'] = title.get_text(strip=True)

        # Extract main content
        self.extract_copyist_data(soup, data)

        return data

    def extract_copyist_data(self, soup: BeautifulSoup, data: Dict):
        """Extract copyist data from the page"""
        # Try to find the main content table
        table = soup.find('table', class_='table')
        if not table:
            table = soup.find('table')

        if table:
            self.extract_table_data(table, data)

        # Try to extract name from various locations
        name_candidates = []

        # Look in headings
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            text = heading.get_text(strip=True)
            if text and len(text) > 2:
                name_candidates.append(text)

        # Look in title
        if 'page_title' in data:
            title_parts = data['page_title'].split(' - ')
            for part in title_parts:
                if part.strip() and len(part.strip()) > 2:
                    name_candidates.append(part.strip())

        # Set the most likely name
        if name_candidates:
            data['copyist_name'] = name_candidates[0]

    def extract_table_data(self, table, data: Dict):
        """Extract data from the main table"""
        rows = table.find_all('tr')

        for row in rows:
            cells = row.find_all(['td', 'th'])
            if len(cells) >= 2:
                key_cell = cells[0]
                value_cell = cells[1]

                key = key_cell.get_text(strip=True).lower()
                value = value_cell.get_text(strip=True)

                # Map common fields
                if 'cnmn' in key:
                    data['cnmn_code'] = value
                elif 'sbn' in key:
                    data['vid_sbn'] = value
                    link = value_cell.find('a')
                    if link:
                        data['vid_sbn_url'] = link.get('href', '')
                elif 'isni' in key:
                    data['isni_code'] = value
                    link = value_cell.find('a')
                    if link:
                        data['isni_url'] = link.get('href', '')
                elif 'biographical' in key or 'biografica' in key:
                    data['biographical_note'] = value
                elif 'bibliographical' in key or 'bibliografia' in key:
                    if 'source' in key:
                        data['bibliographical_sources'] = value
                    else:
                        data['bibliographical_notes'] = value
                elif 'name' in key and 'manuscript' in key:
                    data['names_in_manuscript'] = value
                elif 'creation' in key or 'creazione' in key:
                    data['date_of_creation'] = value
                elif 'modification' in key or 'modifica' in key:
                    data['last_modification'] = value
                elif 'identifier' in key:
                    data['other_identifiers'] = value

    def scrape_all_copyists(self, delay: float = 1.0, max_entries: Optional[int] = None) -> pd.DataFrame:
        """Scrape all copyists"""
        print("Starting full scrape...")

        # Discover IDs
        copyist_ids = self.discover_copyist_ids()
        print(f"Found {len(copyist_ids)} copyist IDs")

        if not copyist_ids:
            print("No copyist IDs found!")
            return pd.DataFrame()

        # Limit if requested
        if max_entries and max_entries > 0:
            copyist_ids = copyist_ids[:max_entries]
            print(f"Limited to {max_entries} entries")

        # Scrape each copyist
        all_data = []

        for i, copyist_id in enumerate(copyist_ids, 1):
            print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")

            data = self.scrape_copyist_detail(copyist_id)

            if 'error' not in data:
                data['scrape_order'] = i
                all_data.append(data)
            else:
                print(f"Error scraping {copyist_id}: {data['error']}")

            # Delay between requests
            if delay > 0:
                time.sleep(delay)

        df = pd.DataFrame(all_data)
        print(f"Successfully scraped {len(df)} copyists")
        return df


# Simple usage example
def main():
    """Main function to run the scraper"""
    scraper = ManusCopistaRequestsScraper()

    # Test with a small number first
    print("Testing with 10 entries...")
    df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)

    if not df.empty:
        print(f"Successfully scraped {len(df)} copyists")
        print("\nColumns:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())

        # Save to CSV
        filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False)
        print(f"\nSaved to {filename}")
    else:
        print("No data scraped!")


if __name__ == "__main__":
    main()
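For a quick smoke test without waiting for the full discovery pass, the class can also be driven directly for a single record. This is a minimal sketch only; the ID shown is a placeholder for illustration, not a known copyist record, and it assumes the file is importable as app.py:

    from app import ManusCopistaRequestsScraper  # assumes the new file is saved as app.py

    scraper = ManusCopistaRequestsScraper()
    record = scraper.scrape_copyist_detail("123456")  # placeholder ID, not a real record
    if "error" not in record:
        # Print a couple of the mapped fields produced by extract_table_data
        print(record.get("copyist_name"), record.get("cnmn_code"))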