kambris committed (verified)
Commit 3bd337f · Parent(s): d1c8665

Update app.py

Files changed (1)
  1. app.py +251 -1009

app.py CHANGED
@@ -1,1092 +1,334 @@
- import gradio as gr
  import requests
  from bs4 import BeautifulSoup
  import pandas as pd
- from urllib.parse import urljoin, urlparse
  import time
  import re
  from typing import Dict, List, Optional
  import json
- import io
  from datetime import datetime
- import os
-
- # Selenium imports
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.chrome.options import Options
- from selenium.common.exceptions import TimeoutException, WebDriverException
- from webdriver_manager.chrome import ChromeDriverManager

- class ManusCopistaSeleniumScraper:
-     def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
-         self.base_url = base_url
          self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
          self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"
-         self.driver = None
-         self.setup_driver()
-
-     def setup_driver(self):
-         """Setup Chrome driver with appropriate options"""
-         chrome_options = Options()
-         chrome_options.add_argument("--headless")
-         chrome_options.add_argument("--no-sandbox")
-         chrome_options.add_argument("--disable-dev-shm-usage")
-         chrome_options.add_argument("--disable-gpu")
-         chrome_options.add_argument("--window-size=1920,1080")
-         chrome_options.add_argument("--disable-extensions")
-         chrome_options.add_argument("--disable-plugins")
-         chrome_options.add_argument("--disable-images")
-         chrome_options.add_argument("--disable-javascript")
-         chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

-         try:
-             # Try to use ChromeDriverManager for automatic driver management
-             service = webdriver.ChromeService(ChromeDriverManager().install())
-             self.driver = webdriver.Chrome(service=service, options=chrome_options)
-         except Exception as e:
-             print(f"Error setting up ChromeDriver with manager: {e}")
-             try:
-                 # Fallback to system Chrome driver
-                 self.driver = webdriver.Chrome(options=chrome_options)
-             except Exception as e2:
-                 print(f"Error setting up system ChromeDriver: {e2}")
-                 raise Exception("Could not initialize Chrome driver")

-     def get_page_with_selenium(self, url: str, wait_for_element: str = None, timeout: int = 10) -> Optional[BeautifulSoup]:
-         """Get page content using Selenium to handle JavaScript"""
          try:
-             self.driver.get(url)
-
-             # Wait for specific element if provided
-             if wait_for_element:
-                 WebDriverWait(self.driver, timeout).until(
-                     EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_element))
-                 )
-             else:
-                 # Default wait for page to load
-                 time.sleep(3)
-
-             # Get page source and parse with BeautifulSoup
-             page_source = self.driver.page_source
-             return BeautifulSoup(page_source, 'html.parser')
-
-         except TimeoutException:
-             print(f"Timeout waiting for element {wait_for_element} on {url}")
-             return None
-         except WebDriverException as e:
-             print(f"WebDriver error on {url}: {e}")
              return None
          except Exception as e:
-             print(f"Error fetching {url}: {e}")
              return None

-     def discover_all_copyist_ids(self, progress_callback=None) -> List[str]:
-         """Discover all copyist IDs from the browse page using Selenium"""
-         all_ids = set()
-
-         if progress_callback:
-             progress_callback(f"Fetching copyist list from: {self.browse_url}")
-
-         # Wait for the results table to load
-         soup = self.get_page_with_selenium(
-             self.browse_url,
-             wait_for_element="tbody#authorities-results-content",
-             timeout=15
-         )
-
-         if not soup:
-             if progress_callback:
-                 progress_callback("Failed to fetch the copyist list page.")
-             return []
-
-         # Extract IDs from the table
-         page_ids = self.extract_copyist_ids_from_table(soup)
-         all_ids.update(page_ids)
-
-         if progress_callback:
-             progress_callback(f"Found {len(all_ids)} copyist IDs from main page.")
-
-         # Check for pagination and get additional pages
-         pagination_handled = self.handle_pagination(soup, all_ids, progress_callback)
-
-         if progress_callback:
-             progress_callback(f"Total copyist IDs discovered: {len(all_ids)}")
-
-         return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)

-     def extract_copyist_ids_from_table(self, soup: BeautifulSoup) -> List[str]:
-         """Extract copyist IDs from the results table"""
          ids = set()

-         if not soup:
-             return []
-
-         # Look for the specific table body
-         table_body = soup.find('tbody', id='authorities-results-content')
-         if not table_body:
-             # Fallback: look for any table with copyist links
-             table_body = soup.find('tbody')
-
-         if not table_body:
-             return []
-
-         # Find all links in the table
-         links = table_body.find_all('a', href=True)
          for link in links:
-             href = link['href']
-             if 'detail/' in href:
-                 match = re.search(r'detail/(\d+)', href)
-                 if match:
-                     ids.add(match.group(1))

          return list(ids)

-     def handle_pagination(self, soup: BeautifulSoup, all_ids: set, progress_callback=None) -> bool:
-         """Handle pagination to get all copyist IDs"""
-         try:
-             # Look for pagination controls
-             pagination_links = soup.find_all('a', href=True)
-             next_page_found = False
-
-             for link in pagination_links:
-                 link_text = link.get_text(strip=True).lower()
-                 href = link.get('href', '')
-
-                 # Look for "next" or page numbers
-                 if ('next' in link_text or 'seguente' in link_text or
-                     (link_text.isdigit() and int(link_text) > 1)):
-
-                     next_page_found = True
-                     if progress_callback:
-                         progress_callback(f"Found pagination link: {link_text}")
-
-                     # Navigate to next page
-                     full_url = urljoin(self.base_url, href)
-                     next_soup = self.get_page_with_selenium(
-                         full_url,
-                         wait_for_element="tbody#authorities-results-content",
-                         timeout=15
-                     )
-
-                     if next_soup:
-                         new_ids = self.extract_copyist_ids_from_table(next_soup)
-                         all_ids.update(new_ids)
-                         if progress_callback:
-                             progress_callback(f"Added {len(new_ids)} IDs from pagination page")
-
-                         # Recursively handle more pagination
-                         self.handle_pagination(next_soup, all_ids, progress_callback)
-                     break
-
-             return next_page_found
-
-         except Exception as e:
-             if progress_callback:
-                 progress_callback(f"Error handling pagination: {e}")
-             return False
-
-     def extract_metadata_from_table(self, soup: BeautifulSoup) -> Dict:
-         """Extract metadata from the copyist detail page"""
-         metadata = {
-             'cnmn_code': '',
-             'vid_sbn': '',
-             'vid_sbn_url': '',
-             'isni_code': '',
-             'isni_url': '',
-             'other_identifiers': '',
-             'biographical_note': '',
-             'bibliographical_sources': '',
-             'bibliographical_notes': '',
-             'names_in_manuscript': '',
-             'date_of_creation': '',
-             'last_modification': '',
-             'page_title': '',
-             'copyist_name': ''
-         }
-
-         if not soup:
-             return metadata
-
-         # Extract page title
-         title_tag = soup.find('title')
-         if title_tag:
-             metadata['page_title'] = title_tag.get_text(strip=True)
-
-         # Try to extract copyist name
-         name_selectors = [
-             'h1', 'h2', '.title', '.copyist-name',
-             '[class*="name"]', '[class*="title"]'
-         ]
-
-         for selector in name_selectors:
-             element = soup.select_one(selector)
-             if element:
-                 name_text = element.get_text(strip=True)
-                 if name_text and len(name_text) > 2:
-                     metadata['copyist_name'] = name_text
-                     break
-
-         # Find the main data table
-         main_table = soup.find('table', class_=['table', 'table-1', 'table-sm'])
-         if not main_table:
-             main_table = soup.find('table')
-
-         if not main_table:
-             return metadata

-         # Process table rows
-         rows = main_table.find_all('tr')
-         for row in rows:
-             try:
-                 title_cell = row.find('td', class_='table-title')
-                 if not title_cell:
-                     continue
-
-                 title_div = title_cell.find('div', class_='table-title-item')
-                 if not title_div:
-                     continue
-
-                 field_name = title_div.get_text(strip=True)
-
-                 data_cells = row.find_all('td')
-                 data_cell = data_cells[1] if len(data_cells) > 1 else None
-                 if not data_cell:
-                     continue
-
-                 self.extract_cell_data(field_name, data_cell, metadata)
-
-             except (AttributeError, IndexError):
-                 continue

-         return metadata
-
-     def extract_cell_data(self, field_name: str, data_cell, metadata: Dict):
-         """Extract data from table cells"""
-         try:
-             cell_classes = data_cell.get('class', [])

-             if 'table-text' in cell_classes:
-                 text_item = data_cell.find('div', class_='table-text-item')
-                 if text_item:
-                     value = text_item.get_text(strip=True)
-                     self.map_field_value(field_name, value, metadata)
-
-             elif 'table-link' in cell_classes:
-                 text_item = data_cell.find('div', class_='table-text-item')
-                 if text_item:
-                     link = text_item.find('a')
-                     if link:
-                         link_text = link.get_text(strip=True)
-                         link_url = link.get('href', '')
-                         self.map_field_link(field_name, link_text, link_url, metadata)
-                     else:
-                         value = text_item.get_text(strip=True)
-                         self.map_field_value(field_name, value, metadata)
-
-             elif 'table-list' in cell_classes:
-                 values = []
-                 list_containers = data_cell.find_all('div', class_='table-list-item')
-
-                 if list_containers:
-                     for container in list_containers:
-                         text_items = container.find_all('div', class_='table-text-item')
-                         for item in text_items:
-                             try:
-                                 link = item.find('a')
-                                 if link:
-                                     link_text = link.get_text(strip=True)
-                                     link_url = link.get('href', '')
-                                     if link_url:
-                                         values.append(f"{link_text} ({link_url})")
-                                     else:
-                                         values.append(link_text)
-                                 else:
-                                     text = item.get_text(strip=True)
-                                     if text:
-                                         values.append(text)
-                             except AttributeError:
-                                 continue
-                 else:
-                     text_items = data_cell.find_all('div', class_='table-text-item')
-                     for item in text_items:
-                         try:
-                             link = item.find('a')
-                             if link:
-                                 link_text = link.get_text(strip=True)
-                                 link_url = link.get('href', '')
-                                 if link_url:
-                                     values.append(f"{link_text} ({link_url})")
-                                 else:
-                                     values.append(link_text)
-                             else:
-                                 text = item.get_text(strip=True)
-                                 if text:
-                                     values.append(text)
-                         except AttributeError:
-                             continue
-
-                 self.map_field_list(field_name, values, metadata)
-
-             elif 'table-text-html' in cell_classes:
-                 text_item = data_cell.find('div', class_='table-text-item')
-                 if text_item:
-                     value = ' '.join(text_item.get_text(strip=True).split())
-                     self.map_field_value(field_name, value, metadata)
-
-         except (AttributeError, TypeError):
-             pass
-
-     def map_field_value(self, field_name: str, value: str, metadata: Dict):
-         """Map field values to metadata keys"""
-         field_mapping = {
-             'CNMN code': 'cnmn_code',
-             'Date of creation': 'date_of_creation',
-             'Last modification': 'last_modification',
-             'Biographical note': 'biographical_note',
-             'Bibliographical notes': 'bibliographical_notes'
-         }

-         mapped_key = field_mapping.get(field_name)
-         if mapped_key and mapped_key in metadata:
-             metadata[mapped_key] = value
-
-     def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
-         """Map field links to metadata"""
-         if field_name == 'VID SBN':
-             metadata['vid_sbn'] = link_text
-             metadata['vid_sbn_url'] = link_url
-         elif field_name == 'Codice ISNI':
-             metadata['isni_code'] = link_text
-             metadata['isni_url'] = link_url
-
-     def map_field_list(self, field_name: str, values: List, metadata: Dict):
-         """Map field lists to metadata"""
-         joined_values = '; '.join(str(v) for v in values if v)
-
-         if field_name == 'Other identifiers':
-             metadata['other_identifiers'] = joined_values
-         elif field_name == 'Bibliographical sources':
-             metadata['bibliographical_sources'] = joined_values
-         elif field_name == 'Names in manuscript':
-             metadata['names_in_manuscript'] = joined_values
-
-     def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
-         """Scrape all copyists with Selenium"""
-         try:
-             # Discover all copyist IDs
-             copyist_ids = self.discover_all_copyist_ids(progress_callback)
-
-             if not copyist_ids:
-                 return pd.DataFrame(), "No copyist IDs found"
-
-             if progress_callback:
-                 progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
-
-             # Limit entries if specified
-             if max_entries and max_entries > 0:
-                 copyist_ids = copyist_ids[:max_entries]
-                 if progress_callback:
-                     progress_callback(f"Limited to first {max_entries} entries for testing")
-
-             # Process each copyist
-             all_metadata = []
-             total_ids = len(copyist_ids)
-             successful_scrapes = 0
-             failed_scrapes = 0
-
-             for i, copyist_id in enumerate(copyist_ids, 1):
-                 if progress_callback:
-                     progress_callback(f"Processing {i}/{total_ids}: Copyist ID {copyist_id}")
-
-                 detail_url = f"{self.detail_base_url}{copyist_id}"
-
-                 # Get detailed metadata using Selenium
-                 detail_soup = self.get_page_with_selenium(
-                     detail_url,
-                     wait_for_element="table",
-                     timeout=10
-                 )
-
-                 if detail_soup:
-                     metadata = self.extract_metadata_from_table(detail_soup)
-
-                     combined_data = {
-                         'copyist_id': copyist_id,
-                         'detail_url': detail_url,
-                         'scrape_order': i,
-                         'scrape_timestamp': datetime.now().isoformat(),
-                         **metadata
-                     }
-
-                     all_metadata.append(combined_data)
-                     successful_scrapes += 1
-                 else:
-                     failed_scrapes += 1
-                     if progress_callback:
-                         progress_callback(f"Failed to fetch data for copyist ID {copyist_id}")
-
-                 # Progress update
-                 if i % 50 == 0 and progress_callback:
-                     progress_callback(f"Progress: {i}/{total_ids} processed. Success: {successful_scrapes}, Failed: {failed_scrapes}")
-
-                 # Delay between requests
-                 if delay > 0:
-                     time.sleep(delay)
-
-             df = pd.DataFrame(all_metadata)
-             success_msg = f"Successfully scraped {successful_scrapes} copyist records. Failed: {failed_scrapes}. Total discovered: {total_ids}"
-             return df, success_msg
-
-         except Exception as e:
-             return pd.DataFrame(), f"Error during scraping: {str(e)}"
-
-     def cleanup(self):
-         """Clean up resources"""
-         if self.driver:
-             self.driver.quit()
-             self.driver = None
-
-     def __del__(self):
-         """Destructor to ensure cleanup"""
-         self.cleanup()
-
-
- class ManusCopistaMetadataScraper:
-     def __init__(self, base_url: str = "https://manus.iccu.sbn.it/en/copisti2"):
-         self.base_url = base_url
-         self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
-         self.session = requests.Session()
-         # Add headers to mimic a real browser
-         self.session.headers.update({
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.9,it;q=0.8',
-             'Accept-Encoding': 'gzip, deflate, br',
-             'Connection': 'keep-alive',
-             'Upgrade-Insecure-Requests': '1',
-             'Sec-Fetch-Dest': 'document',
-             'Sec-Fetch-Mode': 'navigate',
-             'Sec-Fetch-Site': 'none',
-             'Cache-Control': 'max-age=0',
-         })

-     def get_page_content(self, url: str) -> Optional[BeautifulSoup]:
-         """Fetch and parse a web page with error handling"""
-         try:
-             response = self.session.get(url, timeout=15)
-             response.raise_for_status()
-
-             # Handle different encodings
-             if response.encoding and response.encoding.lower() in ['iso-8859-1', 'windows-1252']:
-                 response.encoding = 'utf-8'
-
-             return BeautifulSoup(response.text, 'html.parser')
-         except requests.RequestException as e:
-             print(f"Error fetching {url}: {e}")
-             return None
-
-     def discover_all_copyist_ids(self, progress_callback=None) -> List[str]:
-         """Discover all available copyist IDs from the browse page"""
-         all_ids = set()
-
-         # This is the key page where the real data table appears
-         url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"
-
-         if progress_callback:
-             progress_callback(f"Fetching copyist list from: {url}")
-
-         soup = self.get_page_content(url)
-         if not soup:
-             if progress_callback:
-                 progress_callback("Failed to fetch the copyist list page.")
-             return []
-
-         page_ids = self.extract_copyist_ids_from_page(soup)
-         all_ids.update(page_ids)
-
-         if progress_callback:
-             progress_callback(f"Found {len(all_ids)} copyist IDs.")
-
-         return sorted(list(all_ids), key=lambda x: int(x) if x.isdigit() else 0)
-
-     def extract_copyist_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
-         """Extract copyist IDs from the table with id 'authorities-results-content'"""
-         ids = set()
-
-         if not soup:
-             return []

-         table_body = soup.find('tbody', id='authorities-results-content')
-         if not table_body:
-             return []
-
-         links = table_body.find_all('a', href=True)
-         for link in links:
-             href = link['href']
-             if 'detail/' in href:
-                 match = re.search(r'detail/(\d+)', href)
-                 if match:
-                     ids.add(match.group(1))
-
-         return list(ids)
-
-     def extract_copyist_id_from_url(self, url: str) -> Optional[str]:
-         """Extract copyist ID from a URL"""
-         patterns = [
-             r'/manus-authorities/detail/(\d+)',
-             r'copisti2.*?detail/(\d+)',
-             r'/detail/(\d+)',
-             r'authorities.*?(\d{5,7})',
-             r'copista.*?(\d{5,7})'
-         ]

-         for pattern in patterns:
-             match = re.search(pattern, url, re.IGNORECASE)
-             if match:
-                 return match.group(1)

-         return None
-
-     def is_potential_copyist_id(self, id_str: str) -> bool:
-         """Check if a string looks like a copyist ID"""
-         if not id_str or not id_str.isdigit():
-             return False

-         # IDs are typically 5-7 digits and within a reasonable range
-         if len(id_str) < 5 or len(id_str) > 7:
-             return False

-         # Basic range check (copyist IDs seem to be in 6-digit range)
-         try:
-             id_num = int(id_str)
-             return 100000 <= id_num <= 999999
-         except ValueError:
-             return False

-     def is_valid_copyist_id(self, id_str: str) -> bool:
-         """Check if an ID corresponds to a valid copyist page"""
-         if not id_str or not id_str.isdigit():
-             return False
-
-         # IDs are typically 5-7 digits
-         if len(id_str) < 5 or len(id_str) > 7:
-             return False
-
-         # Quick HEAD request to check if page exists
          try:
-             detail_url = f"{self.detail_base_url}{id_str}"
-             response = self.session.head(detail_url, timeout=3)
              return response.status_code == 200
          except:
              return False

-     def find_pagination_links(self, soup: BeautifulSoup) -> List[str]:
-         """Find pagination links to get all pages of copyists"""
-         pagination_urls = []

          if not soup:
-             return pagination_urls
-
-         # Look for pagination elements with more specific selectors
-         pagination_selectors = [
-             'nav[aria-label*="pagination"]',
-             'nav[class*="pagination"]',
-             '.pagination',
-             '.pager',
-             '.page-navigation',
-             '[class*="page-"]'
-         ]

-         for selector in pagination_selectors:
-             pagination_container = soup.select_one(selector)
-             if pagination_container:
-                 links = pagination_container.find_all('a', href=True)
-                 for link in links:
-                     href = link.get('href', '')
-                     if href and href not in ['#', 'javascript:void(0)']:
-                         full_url = urljoin(self.base_url, href)
-                         if full_url not in pagination_urls and full_url != self.base_url:
-                             # Avoid duplicate URLs and navigation loops
-                             if not any(existing_url in full_url or full_url in existing_url for existing_url in pagination_urls):
-                                 pagination_urls.append(full_url)

-         # Also look for numbered page links or next/previous buttons
-         all_links = soup.find_all('a', href=True)
-         for link in all_links:
-             link_text = link.get_text(strip=True).lower()
-             href = link.get('href', '')
-
-             # Look for pagination indicators
-             pagination_keywords = ['next', 'seguente', 'avanti', 'previous', 'precedente', 'indietro']
-             if (any(keyword in link_text for keyword in pagination_keywords) or
-                 (link_text.isdigit() and int(link_text) <= 100)):  # Reasonable page number
-
-                 if href and href not in ['#', 'javascript:void(0)']:
-                     full_url = urljoin(self.base_url, href)
-                     if (full_url not in pagination_urls and
-                         full_url != self.base_url and
-                         'copisti' in full_url):  # Ensure it's still in the copyist section
-                         pagination_urls.append(full_url)

-         # Remove duplicates and limit to prevent infinite loops
-         unique_urls = []
-         for url in pagination_urls:
-             if url not in unique_urls:
-                 unique_urls.append(url)

-         return unique_urls[:20]  # Reasonable limit

-     def test_discovery_method(self, progress_callback=None) -> Dict:
-         """Test method to debug the ID discovery process"""
-         if progress_callback:
-             progress_callback("Starting discovery test...")

-         main_soup = self.get_page_content(self.base_url)
-         if not main_soup:
-             return {"error": "Could not fetch main page"}

-         results = {
-             "page_title": main_soup.find('title').get_text(strip=True) if main_soup.find('title') else "No title",
-             "total_links": len(main_soup.find_all('a', href=True)),
-             "copyist_links": [],
-             "pagination_links": [],
-             "page_structure": []
-         }

-         # Analyze all links
-         links = main_soup.find_all('a', href=True)
-         for link in links:
-             href = link.get('href', '')
-             text = link.get_text(strip=True)
-
-             if 'detail' in href or 'copista' in href.lower() or 'authorities' in href:
-                 copyist_id = self.extract_copyist_id_from_url(href)
-                 results["copyist_links"].append({
-                     "href": href,
-                     "text": text,
-                     "extracted_id": copyist_id
-                 })

-         # Check page structure
-         for tag in ['table', 'ul', 'ol', 'div']:
-             elements = main_soup.find_all(tag, class_=True)
-             for elem in elements[:5]:  # Limit for debugging
-                 classes = ' '.join(elem.get('class', []))
-                 results["page_structure"].append(f"{tag}: {classes}")

-         return results

-     def extract_metadata_from_table(self, soup: BeautifulSoup) -> Dict:
-         """Extract metadata from the copyist detail page table - Updated for better structure handling"""
-         metadata = {
-             'cnmn_code': '',
-             'vid_sbn': '',
-             'vid_sbn_url': '',
-             'isni_code': '',
-             'isni_url': '',
-             'other_identifiers': '',
-             'biographical_note': '',
-             'bibliographical_sources': '',
-             'bibliographical_notes': '',
-             'names_in_manuscript': '',
-             'date_of_creation': '',
-             'last_modification': '',
-             'page_title': '',
-             'copyist_name': ''
-         }
-
-         if not soup:
-             return metadata
-
-         # Extract page title
-         title_tag = soup.find('title')
-         if title_tag:
-             metadata['page_title'] = title_tag.get_text(strip=True)
-
-         # Try to extract copyist name from various possible locations
-         name_selectors = [
-             'h1', 'h2', '.title', '.copyist-name',
-             '[class*="name"]', '[class*="title"]'
-         ]
-
-         for selector in name_selectors:
-             element = soup.select_one(selector)
-             if element:
-                 name_text = element.get_text(strip=True)
-                 if name_text and len(name_text) > 2:
-                     metadata['copyist_name'] = name_text
-                     break
-
-         # Find the main data table - look for the specific table structure
-         main_table = soup.find('table', class_=['table', 'table-1', 'table-sm'])
-
-         if not main_table:
-             # Fallback: look for any table
-             main_table = soup.find('table')
-
-         if not main_table:
-             return metadata
-
-         # Process each row in the table
-         rows = main_table.find_all('tr')

          for row in rows:
-             try:
-                 # Get the title cell
-                 title_cell = row.find('td', class_='table-title')
-                 if not title_cell:
-                     continue
-
-                 title_div = title_cell.find('div', class_='table-title-item')
-                 if not title_div:
-                     continue
-
-                 field_name = title_div.get_text(strip=True)
-
-                 # Get the data cell (should be the second td in the row)
-                 data_cells = row.find_all('td')
-                 if len(data_cells) < 2:
-                     continue
-
-                 data_cell = data_cells[1]
-
-                 # Extract data based on cell type
-                 self.extract_cell_data(field_name, data_cell, metadata)

-             except (AttributeError, IndexError):
-                 continue

-         return metadata
-
-     def extract_cell_data(self, field_name: str, data_cell, metadata: Dict):
-         """Extract data from table cells based on their class structure"""
-         try:
-             cell_classes = data_cell.get('class', [])
-
-             # Handle text cells
-             if 'table-text' in cell_classes:
-                 text_item = data_cell.find('div', class_='table-text-item')
-                 if text_item:
-                     value = text_item.get_text(strip=True)
-                     self.map_field_value(field_name, value, metadata)
-
-             # Handle link cells
-             elif 'table-link' in cell_classes:
-                 text_item = data_cell.find('div', class_='table-text-item')
-                 if text_item:
-                     link = text_item.find('a')
                      if link:
-                         link_text = link.get_text(strip=True)
-                         link_url = link.get('href', '')
-                         self.map_field_link(field_name, link_text, link_url, metadata)
                      else:
-                         # No link, just text
-                         value = text_item.get_text(strip=True)
-                         self.map_field_value(field_name, value, metadata)
-
-             # Handle list cells
-             elif 'table-list' in cell_classes:
-                 values = []
-
-                 # Look for list containers
-                 list_containers = data_cell.find_all('div', class_='table-list-item')
-
-                 if list_containers:
-                     for container in list_containers:
-                         text_items = container.find_all('div', class_='table-text-item')
-                         for item in text_items:
-                             try:
-                                 link = item.find('a')
-                                 if link:
-                                     link_text = link.get_text(strip=True)
-                                     link_url = link.get('href', '')
-                                     if link_url:
-                                         values.append(f"{link_text} ({link_url})")
-                                     else:
-                                         values.append(link_text)
-                                 else:
-                                     text = item.get_text(strip=True)
-                                     if text:
-                                         values.append(text)
-                             except AttributeError:
-                                 continue
-                 else:
-                     # Fallback: look for text items directly
-                     text_items = data_cell.find_all('div', class_='table-text-item')
-                     for item in text_items:
-                         try:
-                             link = item.find('a')
-                             if link:
-                                 link_text = link.get_text(strip=True)
-                                 link_url = link.get('href', '')
-                                 if link_url:
-                                     values.append(f"{link_text} ({link_url})")
-                                 else:
-                                     values.append(link_text)
-                             else:
-                                 text = item.get_text(strip=True)
-                                 if text:
-                                     values.append(text)
-                         except AttributeError:
-                             continue
-
-                 self.map_field_list(field_name, values, metadata)
-
-             # Handle HTML text cells
-             elif 'table-text-html' in cell_classes:
-                 text_item = data_cell.find('div', class_='table-text-item')
-                 if text_item:
-                     # Clean HTML and get text
-                     value = ' '.join(text_item.get_text(strip=True).split())
-                     self.map_field_value(field_name, value, metadata)
-
-         except (AttributeError, TypeError):
-             pass
-
-     def map_field_value(self, field_name: str, value: str, metadata: Dict):
-         """Map field values to the appropriate metadata keys"""
-         field_mapping = {
-             'CNMN code': 'cnmn_code',
-             'Date of creation': 'date_of_creation',
-             'Last modification': 'last_modification',
-             'Biographical note': 'biographical_note',
-             'Bibliographical notes': 'bibliographical_notes'
-         }
-
-         mapped_key = field_mapping.get(field_name)
-         if mapped_key and mapped_key in metadata:
-             metadata[mapped_key] = value
-
-     def map_field_link(self, field_name: str, link_text: str, link_url: str, metadata: Dict):
-         """Map field links to metadata"""
-         if field_name == 'VID SBN':
-             metadata['vid_sbn'] = link_text
-             metadata['vid_sbn_url'] = link_url
-         elif field_name == 'Codice ISNI':
-             metadata['isni_code'] = link_text
-             metadata['isni_url'] = link_url
-
-     def map_field_list(self, field_name: str, values: List, metadata: Dict):
-         """Map field lists to metadata"""
-         joined_values = '; '.join(str(v) for v in values if v)
-
-         if field_name == 'Other identifiers':
-             metadata['other_identifiers'] = joined_values
-         elif field_name == 'Bibliographical sources':
-             metadata['bibliographical_sources'] = joined_values
-         elif field_name == 'Names in manuscript':
-             metadata['names_in_manuscript'] = joined_values
-
-     def scrape_copyist_by_id(self, copyist_id: str) -> Dict:
-         """Scrape a single copyist by ID"""
-         detail_url = f"{self.detail_base_url}{copyist_id}"
-
-         # Get the detail page
-         detail_soup = self.get_page_content(detail_url)
-         if not detail_soup:
-             return {'error': f'Could not fetch data for copyist ID {copyist_id}'}
-
-         # Extract metadata
-         metadata = self.extract_metadata_from_table(detail_soup)
-
-         # Add basic info
-         metadata['copyist_id'] = copyist_id
-         metadata['detail_url'] = detail_url
-         metadata['scrape_timestamp'] = datetime.now().isoformat()
-
-         return metadata
-
-     def scrape_multiple_copyists(self, copyist_ids: List[str], delay: float = 1.0, progress_callback=None) -> pd.DataFrame:
-         """Scrape multiple copyists by their IDs"""
-         all_metadata = []

          for i, copyist_id in enumerate(copyist_ids, 1):
-             if progress_callback:
-                 progress_callback(f"Processing {i}/{len(copyist_ids)}: Copyist ID {copyist_id}")

-             metadata = self.scrape_copyist_by_id(copyist_id)

-             if 'error' not in metadata:
-                 metadata['scrape_order'] = i
-                 all_metadata.append(metadata)
              else:
-                 if progress_callback:
-                     progress_callback(f"Failed to scrape copyist ID {copyist_id}: {metadata['error']}")

              # Delay between requests
              if delay > 0:
                  time.sleep(delay)

-         return pd.DataFrame(all_metadata)
-
-     def scrape_all_copyists_with_progress(self, delay: float = 1.0, max_entries: int = None, progress_callback=None):
-         """Scrape all copyists with progress updates"""
-         try:
-             # Discover all copyist IDs
-             copyist_ids = self.discover_all_copyist_ids(progress_callback)
-
-             if not copyist_ids:
-                 return pd.DataFrame(), "No copyist IDs found"
-
-             if progress_callback:
-                 progress_callback(f"Discovered {len(copyist_ids)} copyist IDs. Starting detailed scraping...")
-
-             # Limit entries if specified
-             if max_entries and max_entries > 0:
-                 copyist_ids = copyist_ids[:max_entries]
-                 if progress_callback:
-                     progress_callback(f"Limited to first {max_entries} entries for testing")
-
-             # Scrape the copyists
-             df = self.scrape_multiple_copyists(copyist_ids, delay, progress_callback)
-
-             success_msg = f"Successfully scraped {len(df)} copyist records out of {len(copyist_ids)} discovered IDs"
-             return df, success_msg
-
-         except Exception as e:
-             return pd.DataFrame(), f"Error during scraping: {str(e)}"


- # Gradio Interface Functions
- def create_gradio_interface():
-     """Create and return the Gradio interface"""
-
-     def run_scraper_selenium(delay, max_entries, progress=gr.Progress()):
-         """Run the Selenium scraper with progress updates"""
-         scraper = None
-         try:
-             def update_progress(message):
-                 progress(message)
-
-             scraper = ManusCopistaSeleniumScraper()
-             df, status = scraper.scrape_all_copyists_with_progress(
-                 delay=delay,
-                 max_entries=max_entries if max_entries > 0 else None,
-                 progress_callback=update_progress
-             )
-
-             if df.empty:
-                 return None, f"No data scraped. Status: {status}"
-
-             # Create CSV output
-             csv_output = io.StringIO()
-             df.to_csv(csv_output, index=False)
-             csv_content = csv_output.getvalue()
-
-             return csv_content, f"Success! {status}"
-
-         except Exception as e:
-             return None, f"Error: {str(e)}"
-         finally:
-             if scraper:
-                 scraper.cleanup()
-
-     def run_scraper_requests(delay, max_entries, progress=gr.Progress()):
-         """Run the requests-based scraper with progress updates"""
-         try:
-             def update_progress(message):
-                 progress(message)
-
-             scraper = ManusCopistaMetadataScraper()
-             df, status = scraper.scrape_all_copyists_with_progress(
-                 delay=delay,
-                 max_entries=max_entries if max_entries > 0 else None,
-                 progress_callback=update_progress
-             )
-
-             if df.empty:
-                 return None, f"No data scraped. Status: {status}"
-
-             # Create CSV output
-             csv_output = io.StringIO()
-             df.to_csv(csv_output, index=False)
-             csv_content = csv_output.getvalue()
-
-             return csv_content, f"Success! {status}"
-
-         except Exception as e:
-             return None, f"Error: {str(e)}"
-
-     def test_discovery(progress=gr.Progress()):
-         """Test the discovery method"""
-         try:
-             def update_progress(message):
-                 progress(message)
-
-             scraper = ManusCopistaMetadataScraper()
-             results = scraper.test_discovery_method(progress_callback=update_progress)
-
-             return json.dumps(results, indent=2), "Discovery test completed"
-
-         except Exception as e:
-             return None, f"Error: {str(e)}"
-
-     with gr.Blocks(title="Manus Copista Scraper") as interface:
-         gr.Markdown("# Manus Copista Metadata Scraper")
-         gr.Markdown("Scrape copyist metadata from the Manus database using either Selenium or requests.")
-
-         with gr.Tab("Selenium Scraper (Recommended)"):
-             gr.Markdown("### Selenium-based scraper (handles JavaScript)")
-
-             with gr.Row():
-                 selenium_delay = gr.Number(label="Delay between requests (seconds)", value=1.0, minimum=0.1)
-                 selenium_max_entries = gr.Number(label="Max entries (0 = all)", value=0, minimum=0)
-
-             selenium_run_btn = gr.Button("Run Selenium Scraper", variant="primary")
-             selenium_status = gr.Textbox(label="Status", lines=3)
-             selenium_output = gr.File(label="Download CSV")
-
-             selenium_run_btn.click(
-                 run_scraper_selenium,
-                 inputs=[selenium_delay, selenium_max_entries],
-                 outputs=[selenium_output, selenium_status]
-             )
-
-         with gr.Tab("Requests Scraper"):
-             gr.Markdown("### Requests-based scraper (faster, may miss JavaScript content)")
-
-             with gr.Row():
-                 requests_delay = gr.Number(label="Delay between requests (seconds)", value=1.0, minimum=0.1)
-                 requests_max_entries = gr.Number(label="Max entries (0 = all)", value=0, minimum=0)
-
-             requests_run_btn = gr.Button("Run Requests Scraper", variant="primary")
-             requests_status = gr.Textbox(label="Status", lines=3)
-             requests_output = gr.File(label="Download CSV")
-
-             requests_run_btn.click(
-                 run_scraper_requests,
-                 inputs=[requests_delay, requests_max_entries],
-                 outputs=[requests_output, requests_status]
-             )
-
-         with gr.Tab("Discovery Test"):
-             gr.Markdown("### Test the ID discovery process")
-
-             test_btn = gr.Button("Test Discovery Method", variant="secondary")
-             test_status = gr.Textbox(label="Status", lines=2)
-             test_output = gr.Textbox(label="Test Results", lines=20)
-
-             test_btn.click(
-                 test_discovery,
-                 outputs=[test_output, test_status]
-             )
-
-         gr.Markdown("---")
-         gr.Markdown("**Note:** The Selenium scraper is recommended as it can handle JavaScript content. The requests scraper is faster but may miss some data.")
-
-     return interface


- # Main execution
  if __name__ == "__main__":
-     # Create and launch the interface
-     interface = create_gradio_interface()
-     interface.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         debug=True
-     )

  import requests
  from bs4 import BeautifulSoup
  import pandas as pd
  import time
  import re
  from typing import Dict, List, Optional
  import json
  from datetime import datetime
+ import io

+ class ManusCopistaRequestsScraper:
+     def __init__(self):
+         self.base_url = "https://manus.iccu.sbn.it"
          self.detail_base_url = "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/detail/"
          self.browse_url = "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse"

+         # Setup session with proper headers
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate, br',
+             'Connection': 'keep-alive',
+             'Upgrade-Insecure-Requests': '1',
+             'Sec-Fetch-Dest': 'document',
+             'Sec-Fetch-Mode': 'navigate',
+             'Sec-Fetch-Site': 'none',
+         })

+     def get_page(self, url: str) -> Optional[BeautifulSoup]:
+         """Fetch a page and return BeautifulSoup object"""
          try:
+             print(f"Fetching: {url}")
+             response = self.session.get(url, timeout=15)
+             response.raise_for_status()

+             # Check if we got a proper response
+             if response.status_code != 200:
+                 print(f"Bad status code: {response.status_code}")
+                 return None

+             return BeautifulSoup(response.text, 'html.parser')

+         except requests.exceptions.RequestException as e:
+             print(f"Request error for {url}: {e}")
              return None
          except Exception as e:
+             print(f"Unexpected error for {url}: {e}")
              return None

+     def discover_copyist_ids(self) -> List[str]:
+         """Discover copyist IDs from the browse page"""
+         print("Discovering copyist IDs...")

+         # Try different approaches to get the data
+         urls_to_try = [
+             self.browse_url,
+             "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse",
+             "https://manus.iccu.sbn.it/en/copisti2/-/manus-authorities/browse?delta=50",
+             "https://manus.iccu.sbn.it/copisti2/-/manus-authorities/browse?delta=100"
+         ]

+         all_ids = set()

+         for url in urls_to_try:
+             soup = self.get_page(url)
+             if soup:
+                 ids = self.extract_ids_from_page(soup)
+                 all_ids.update(ids)
+                 print(f"Found {len(ids)} IDs from {url}")
+
+                 # If we found IDs, try to get more from pagination
+                 if ids:
+                     pagination_ids = self.handle_pagination(soup, url)
+                     all_ids.update(pagination_ids)

+         # If no IDs found from browse page, try a range-based approach
+         if not all_ids:
+             print("No IDs found from browse page, trying range-based discovery...")
+             all_ids = self.discover_ids_by_range()

+         return sorted(list(all_ids))

+     def extract_ids_from_page(self, soup: BeautifulSoup) -> List[str]:
+         """Extract copyist IDs from a page"""
          ids = set()

+         # Look for links that contain detail/ followed by numbers
+         links = soup.find_all('a', href=True)
          for link in links:
+             href = link.get('href', '')
+             match = re.search(r'detail/(\d+)', href)
+             if match:
+                 copyist_id = match.group(1)
+                 if len(copyist_id) >= 5:  # Valid ID length
+                     ids.add(copyist_id)
+
+         # Also look for any numbers that might be IDs in the page
+         text = soup.get_text()
+         numbers = re.findall(r'\b\d{6,7}\b', text)
+         for num in numbers:
+             if self.is_valid_id_format(num):
+                 ids.add(num)

          return list(ids)

+     def handle_pagination(self, soup: BeautifulSoup, base_url: str) -> List[str]:
+         """Handle pagination to get more IDs"""
+         all_ids = set()

+         # Look for pagination links
+         pagination_links = []
+         links = soup.find_all('a', href=True)

+         for link in links:
+             href = link.get('href', '')
+             text = link.get_text(strip=True).lower()

+             # Look for next page or numbered pages
+             if any(word in text for word in ['next', 'seguente', 'page', 'pagina']) or text.isdigit():
+                 if href and href.startswith('/'):
+                     full_url = self.base_url + href
+                     pagination_links.append(full_url)

+         # Visit pagination pages
+         for page_url in pagination_links[:10]:  # Limit to prevent infinite loops
+             print(f"Checking pagination page: {page_url}")
+             page_soup = self.get_page(page_url)
+             if page_soup:
+                 page_ids = self.extract_ids_from_page(page_soup)
+                 all_ids.update(page_ids)
+             time.sleep(1)  # Be respectful

+         return list(all_ids)

+     def discover_ids_by_range(self, start_id: int = 100000, end_id: int = 999999, sample_size: int = 1000) -> List[str]:
+         """Discover IDs by testing a range of potential IDs"""
+         print(f"Testing range-based discovery with {sample_size} samples...")

+         valid_ids = []

+         # Test a sample of IDs in the range
+         import random
+         test_ids = random.sample(range(start_id, end_id), min(sample_size, end_id - start_id))

+         for i, test_id in enumerate(test_ids):
+             if i % 100 == 0:
+                 print(f"Tested {i}/{len(test_ids)} IDs, found {len(valid_ids)} valid")
+
+             if self.test_id_exists(str(test_id)):
+                 valid_ids.append(str(test_id))
+
+             time.sleep(0.1)  # Small delay

+         return valid_ids

+     def test_id_exists(self, copyist_id: str) -> bool:
+         """Test if a copyist ID exists by making a HEAD request"""
+         url = f"{self.detail_base_url}{copyist_id}"
          try:
+             response = self.session.head(url, timeout=5)
              return response.status_code == 200
          except:
              return False

+     def is_valid_id_format(self, id_str: str) -> bool:
+         """Check if string looks like a valid copyist ID"""
+         if not id_str.isdigit():
+             return False
+         return 5 <= len(id_str) <= 7
+
+     def scrape_copyist_detail(self, copyist_id: str) -> Dict:
+         """Scrape detailed information for a single copyist"""
+         url = f"{self.detail_base_url}{copyist_id}"
+         soup = self.get_page(url)

          if not soup:
+             return {'error': f'Could not fetch page for ID {copyist_id}'}

+         # Extract basic info
+         data = {
+             'copyist_id': copyist_id,
+             'detail_url': url,
+             'scrape_timestamp': datetime.now().isoformat()
+         }

+         # Extract title
+         title = soup.find('title')
+         if title:
+             data['page_title'] = title.get_text(strip=True)

+         # Extract main content
+         self.extract_copyist_data(soup, data)

+         return data

+     def extract_copyist_data(self, soup: BeautifulSoup, data: Dict):
+         """Extract copyist data from the page"""
+         # Try to find the main content table
+         table = soup.find('table', class_='table')
+         if not table:
+             table = soup.find('table')

+         if table:
+             self.extract_table_data(table, data)

+         # Try to extract name from various locations
+         name_candidates = []

+         # Look in headings
+         for heading in soup.find_all(['h1', 'h2', 'h3']):
+             text = heading.get_text(strip=True)
+             if text and len(text) > 2:
+                 name_candidates.append(text)

+         # Look in title
+         if 'page_title' in data:
+             title_parts = data['page_title'].split(' - ')
+             for part in title_parts:
+                 if part.strip() and len(part.strip()) > 2:
+                     name_candidates.append(part.strip())

+         # Set the most likely name
+         if name_candidates:
+             data['copyist_name'] = name_candidates[0]

+     def extract_table_data(self, table, data: Dict):
+         """Extract data from the main table"""
+         rows = table.find_all('tr')

          for row in rows:
+             cells = row.find_all(['td', 'th'])
+             if len(cells) >= 2:
+                 key_cell = cells[0]
+                 value_cell = cells[1]

+                 key = key_cell.get_text(strip=True).lower()
+                 value = value_cell.get_text(strip=True)

+                 # Map common fields
+                 if 'cnmn' in key:
+                     data['cnmn_code'] = value
+                 elif 'sbn' in key:
+                     data['vid_sbn'] = value
+                     link = value_cell.find('a')
+                     if link:
+                         data['vid_sbn_url'] = link.get('href', '')
+                 elif 'isni' in key:
+                     data['isni_code'] = value
+                     link = value_cell.find('a')
                      if link:
+                         data['isni_url'] = link.get('href', '')
+                 elif 'biographical' in key or 'biografica' in key:
+                     data['biographical_note'] = value
+                 elif 'bibliographical' in key or 'bibliografia' in key:
+                     if 'source' in key:
+                         data['bibliographical_sources'] = value
                      else:
+                         data['bibliographical_notes'] = value
+                 elif 'name' in key and 'manuscript' in key:
+                     data['names_in_manuscript'] = value
+                 elif 'creation' in key or 'creazione' in key:
+                     data['date_of_creation'] = value
+                 elif 'modification' in key or 'modifica' in key:
+                     data['last_modification'] = value
+                 elif 'identifier' in key:
+                     data['other_identifiers'] = value
+
+     def scrape_all_copyists(self, delay: float = 1.0, max_entries: int = None) -> pd.DataFrame:
+         """Scrape all copyists"""
+         print("Starting full scrape...")
+
+         # Discover IDs
+         copyist_ids = self.discover_copyist_ids()
+         print(f"Found {len(copyist_ids)} copyist IDs")
+
+         if not copyist_ids:
+             print("No copyist IDs found!")
+             return pd.DataFrame()
+
+         # Limit if requested
+         if max_entries and max_entries > 0:
+             copyist_ids = copyist_ids[:max_entries]
+             print(f"Limited to {max_entries} entries")
+
+         # Scrape each copyist
+         all_data = []

          for i, copyist_id in enumerate(copyist_ids, 1):
+             print(f"Scraping {i}/{len(copyist_ids)}: ID {copyist_id}")

+             data = self.scrape_copyist_detail(copyist_id)

+             if 'error' not in data:
+                 data['scrape_order'] = i
+                 all_data.append(data)
              else:
+                 print(f"Error scraping {copyist_id}: {data['error']}")

              # Delay between requests
              if delay > 0:
                  time.sleep(delay)

+         df = pd.DataFrame(all_data)
+         print(f"Successfully scraped {len(df)} copyists")
+         return df

+ # Simple usage example
+ def main():
+     """Main function to run the scraper"""
+     scraper = ManusCopistaRequestsScraper()
+
+     # Test with a small number first
+     print("Testing with 10 entries...")
+     df = scraper.scrape_all_copyists(delay=1.0, max_entries=10)
+
+     if not df.empty:
+         print(f"Successfully scraped {len(df)} copyists")
+         print("\nColumns:", df.columns.tolist())
+         print("\nFirst few rows:")
+         print(df.head())
+
+         # Save to CSV
+         filename = f"manus_copyists_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+         df.to_csv(filename, index=False)
+         print(f"\nSaved to {filename}")
+     else:
+         print("No data scraped!")


  if __name__ == "__main__":
+     main()
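
Beyond the built-in main() test run, the new class can also be driven directly when the copyist IDs of interest are already known, skipping discovery entirely. The following is a minimal sketch under stated assumptions: it assumes this file is saved as app.py, and the example IDs are placeholders for illustration, not real records.

# Minimal sketch: scrape specific copyist IDs directly with the new
# requests-based class, skipping the discovery step.
import pandas as pd
from app import ManusCopistaRequestsScraper  # assumes this module is saved as app.py

scraper = ManusCopistaRequestsScraper()
example_ids = ["123456", "234567"]  # hypothetical placeholder IDs

records = []
for copyist_id in example_ids:
    record = scraper.scrape_copyist_detail(copyist_id)
    if 'error' not in record:
        records.append(record)

pd.DataFrame(records).to_csv("copyists_sample.csv", index=False)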