bmv2021 committed
Commit 29b3738 · 1 Parent(s): cf15415

added image display capabilities

Files changed (1)
  1. bpl_scraper.py +177 -0
bpl_scraper.py ADDED
@@ -0,0 +1,177 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+ import json
+ from typing import List, Dict, Optional
+ import logging
+ from urllib.parse import urljoin, urlparse
+
+ class DigitalCommonwealthScraper:
+     def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
+         """
+         Initialize the scraper with base URL and logging.
+
+         :param base_url: Base URL for Digital Commonwealth
+         """
+         self.base_url = base_url
+         logging.basicConfig(level=logging.INFO)
+         self.logger = logging.getLogger(__name__)
+
+         # Headers to mimic a browser request
+         self.headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         }
+
+     def fetch_page(self, url: str) -> Optional[requests.Response]:
+         """
+         Fetch webpage content with error handling.
+
+         :param url: URL to fetch
+         :return: Response object, or None if the request failed
+         """
+         try:
+             response = requests.get(url, headers=self.headers, timeout=30)
+             response.raise_for_status()
+             return response
+         except requests.RequestException as e:
+             self.logger.error(f"Error fetching {url}: {e}")
+             return None
+
+     def extract_json_metadata(self, url: str) -> Dict:
+         """
+         Extract JSON metadata from the page.
+
+         :param url: URL of the page
+         :return: Dictionary of metadata (empty if unavailable)
+         """
+         # The site exposes a JSON representation at <page URL>.json
+         json_url = f"{url}.json"
+         response = self.fetch_page(json_url)
+
+         if response:
+             try:
+                 return response.json()
+             except json.JSONDecodeError:
+                 self.logger.error(f"Could not parse JSON from {json_url}")
+                 return {}
+         return {}
+
+     def extract_images(self, url: str) -> List[Dict]:
+         """
+         Extract images from the page.
+
+         :param url: URL of the page to scrape
+         :return: List of image dictionaries
+         """
+         # Fetch page content
+         response = self.fetch_page(url)
+         if not response:
+             return []
+
+         # Parse HTML
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Extract JSON metadata
+         metadata = self.extract_json_metadata(url)
+
+         # List to store images
+         images = []
+
+         # Strategy 1: Look for image viewers or specific image containers
+         image_containers = [
+             soup.find('div', class_='viewer-container'),
+             soup.find('div', class_='image-viewer'),
+             soup.find('div', id='image-container')
+         ]
+
+         # Strategy 2: Search within the first matching container if one
+         # exists; otherwise fall back to every image tag on the page
+         container = next((c for c in image_containers if c is not None), None)
+         img_tags = (container or soup).find_all('img')
+
+         for img in img_tags:
+             # Get image source
+             src = img.get('src')
+             if not src:
+                 continue
+
+             # Resolve relative URLs
+             full_src = urljoin(url, src)
+
+             # Extract alt text or use filename
+             alt = img.get('alt', os.path.basename(urlparse(full_src).path))
+
+             # Create image dictionary
+             image_info = {
+                 'url': full_src,
+                 'alt': alt,
+                 'source_page': url
+             }
+
+             # Try to add metadata if available
+             if metadata:
+                 try:
+                     # Extract relevant metadata from JSON if possible
+                     image_info['metadata'] = {
+                         'title': metadata.get('data', {}).get('attributes', {}).get('title_info_primary_tsi'),
+                         'description': metadata.get('data', {}).get('attributes', {}).get('abstract_tsi'),
+                         'subject': metadata.get('data', {}).get('attributes', {}).get('subject_geographic_sim')
+                     }
+                 except Exception as e:
+                     self.logger.warning(f"Error extracting metadata: {e}")
+
+             images.append(image_info)
+
+         return images
+
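+     # Note: the metadata getters in extract_images assume a JSON shape
+     # roughly like the following (inferred from the keys used above,
+     # not verified against the Digital Commonwealth API):
+     #
+     #   {"data": {"attributes": {"title_info_primary_tsi": "...",
+     #                            "abstract_tsi": "...",
+     #                            "subject_geographic_sim": ["..."]}}}
+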
+     def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
+         """
+         Download images to a local directory.
+
+         :param images: List of image dictionaries
+         :param output_dir: Directory to save images
+         :return: List of downloaded file paths
+         """
+         # Create output directory
+         os.makedirs(output_dir, exist_ok=True)
+
+         downloaded_files = []
+
+         for i, image in enumerate(images):
+             try:
+                 response = requests.get(image['url'], headers=self.headers, timeout=30)
+                 response.raise_for_status()
+
+                 # Generate filename, defaulting to .jpg when the URL has no extension
+                 ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
+                 filename = os.path.join(output_dir, f'image_{i}{ext}')
+
+                 with open(filename, 'wb') as f:
+                     f.write(response.content)
+
+                 downloaded_files.append(filename)
+                 self.logger.info(f"Downloaded: {filename}")
+
+             except Exception as e:
+                 self.logger.error(f"Error downloading {image['url']}: {e}")
+
+         return downloaded_files
+
+ # Example usage (commented out):
+ #
+ # def main():
+ #     scraper = DigitalCommonwealthScraper()
+ #
+ #     # Example URL from input
+ #     url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
+ #
+ #     # Extract images
+ #     images = scraper.extract_images(url)
+ #
+ #     # Print image information
+ #     for img in images:
+ #         print(json.dumps(img, indent=2))
+ #
+ #     # Optional: Download images
+ #     scraper.download_images(images)
+ #
+ # if __name__ == "__main__":
+ #     main()
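For reference, a minimal sketch of how the new scraper might be wired into an app to display the scraped images, which seems to be the intent behind the commit message. The Streamlit front end here is an assumption for illustration; this commit only adds bpl_scraper.py itself.

# Hypothetical display layer (not part of this commit); assumes
# bpl_scraper.py is importable from the app's working directory.
import streamlit as st
from bpl_scraper import DigitalCommonwealthScraper

url = st.text_input("Digital Commonwealth item URL")
if url:
    scraper = DigitalCommonwealthScraper()
    images = scraper.extract_images(url)
    if not images:
        st.warning("No images found on that page.")
    for img in images:
        # st.image accepts a remote URL directly; alt text doubles as caption
        st.image(img['url'], caption=img.get('alt', ''))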