Dan Foley committed
Commit b842841 · unverified · 1 Parent(s): 1e0a968

Delete bpl_scraper.py

Files changed (1):
  1. bpl_scraper.py +0 -177
bpl_scraper.py DELETED
@@ -1,177 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import os
-import json
-import re
-from typing import List, Dict
-import logging
-from urllib.parse import urljoin, urlparse
-
-class DigitalCommonwealthScraper:
-    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
-        """
-        Initialize the scraper with base URL and logging
-
-        :param base_url: Base URL for Digital Commonwealth
-        """
-        self.base_url = base_url
-        logging.basicConfig(level=logging.INFO)
-        self.logger = logging.getLogger(__name__)
-
-        # Headers to mimic browser request
-        self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        }
-
-    def fetch_page(self, url: str) -> requests.Response:
-        """
-        Fetch webpage content with error handling
-
-        :param url: URL to fetch
-        :return: Response object
-        """
-        try:
-            response = requests.get(url, headers=self.headers)
-            response.raise_for_status()
-            return response
-        except requests.RequestException as e:
-            self.logger.error(f"Error fetching {url}: {e}")
-            return None
-
-    def extract_json_metadata(self, url: str) -> Dict:
-        """
-        Extract JSON metadata from the page
-
-        :param url: URL of the page
-        :return: Dictionary of metadata
-        """
-        json_url = f"{url}.json"
-        response = self.fetch_page(json_url)
-
-        if response:
-            try:
-                return response.json()
-            except json.JSONDecodeError:
-                self.logger.error(f"Could not parse JSON from {json_url}")
-                return {}
-        return {}
-
-    def extract_images(self, url: str) -> List[Dict]:
-        """
-        Extract images from the page
-
-        :param url: URL of the page to scrape
-        :return: List of image dictionaries
-        """
-        # Fetch page content
-        response = self.fetch_page(url)
-        if not response:
-            return []
-
-        # Parse HTML
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Extract JSON metadata
-        metadata = self.extract_json_metadata(url)
-
-        # List to store images
-        images = []
-
-        # Strategy 1: Look for image viewers or specific image containers
-        image_containers = [
-            soup.find('div', class_='viewer-container'),
-            soup.find('div', class_='image-viewer'),
-            soup.find('div', id='image-container')
-        ]
-
-        # Strategy 2: Find all image tags
-        img_tags = soup.find_all('img')
-
-        # Combine image sources
-        for img in img_tags:
-            # Get image source
-            src = img.get('src')
-            if not src:
-                continue
-
-            # Resolve relative URLs
-            full_src = urljoin(url, src)
-
-            # Extract alt text or use filename
-            alt = img.get('alt', os.path.basename(urlparse(full_src).path))
-
-            # Create image dictionary
-            image_info = {
-                'url': full_src,
-                'alt': alt,
-                'source_page': url
-            }
-
-            # Try to add metadata if available
-            if metadata:
-                try:
-                    # Extract relevant metadata from JSON if possible
-                    image_info['metadata'] = {
-                        'title': metadata.get('data', {}).get('attributes', {}).get('title_info_primary_tsi'),
-                        'description': metadata.get('data', {}).get('attributes', {}).get('abstract_tsi'),
-                        'subject': metadata.get('data', {}).get('attributes', {}).get('subject_geographic_sim')
-                    }
-                except Exception as e:
-                    self.logger.warning(f"Error extracting metadata: {e}")
-
-            images.append(image_info)
-
-        return images
-
-    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
-        """
-        Download images to local directory
-
-        :param images: List of image dictionaries
-        :param output_dir: Directory to save images
-        :return: List of downloaded file paths
-        """
-        # Create output directory
-        os.makedirs(output_dir, exist_ok=True)
-
-        downloaded_files = []
-
-        for i, image in enumerate(images):
-            try:
-                response = requests.get(image['url'], headers=self.headers)
-                response.raise_for_status()
-
-                # Generate filename
-                ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
-                filename = os.path.join(output_dir, f'image_{i}{ext}')
-
-                with open(filename, 'wb') as f:
-                    f.write(response.content)
-
-                downloaded_files.append(filename)
-                self.logger.info(f"Downloaded: {filename}")
-
-            except Exception as e:
-                self.logger.error(f"Error downloading {image['url']}: {e}")
-
-        return downloaded_files
-
-# def main():
-#     # Example usage
-#     scraper = DigitalCommonwealthScraper()
-#
-#     # Example URL from input
-#     url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
-#
-#     # Extract images
-#     images = scraper.extract_images(url)
-#
-#     # Print image information
-#     for img in images:
-#         print(json.dumps(img, indent=2))
-#
-#     # Optional: Download images
-#     scraper.download_images(images)
-#
-# if __name__ == "__main__":
-#     main()
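
For anyone who still needs this scraper, it remains available at the parent commit 1e0a968. Below is a minimal driver sketch reconstructed from the commented-out main() at the bottom of the deleted file; the example URL is the one from that comment, and it assumes the deleted module is on the import path with requests and beautifulsoup4 installed.

import json

from bpl_scraper import DigitalCommonwealthScraper

scraper = DigitalCommonwealthScraper()

# Example record URL, taken from the deleted file's own comments
url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"

# extract_images() returns [] when the page fetch fails, so no extra guard is needed
images = scraper.extract_images(url)
for img in images:
    print(json.dumps(img, indent=2))

# Optional: save the files to ./downloaded_images (the method's default output_dir)
scraper.download_images(images)

Note that extract_images() also requests the page URL with a .json suffix, so each printed record includes a title/description/subject metadata block whenever the Digital Commonwealth JSON endpoint responds.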