bmv2021 committed
Commit f78d694 · 1 Parent(s): 09fe857

upload correct file

Files changed (1)
  1. image_scraper.py +1 -0
image_scraper.py CHANGED
@@ -1,147 +1,148 @@
+ import requests
    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
        """
        Initialize the scraper with base URL and logging

        :param base_url: Base URL for Digital Commonwealth
        """
        self.base_url = base_url
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Headers to mimic browser request
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self, url: str) -> requests.Response:
        """
        Fetch webpage content with error handling

        :param url: URL to fetch
        :return: Response object
        """
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def extract_json_metadata(self, url: str) -> Dict:
        """
        Extract JSON metadata from the page

        :param url: URL of the page
        :return: Dictionary of metadata
        """
        json_url = f"{url}.json"
        response = self.fetch_page(json_url)

        if response:
            try:
                return response.json()
            except json.JSONDecodeError:
                self.logger.error(f"Could not parse JSON from {json_url}")
                return {}
        return {}

    def extract_images(self, url: str) -> List[Dict]:
        """
        Extract images from the page

        :param url: URL of the page to scrape
        :return: List of image dictionaries
        """
        # Fetch page content
        response = self.fetch_page(url)
        if not response:
            return []

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract JSON metadata
        metadata = self.extract_json_metadata(url)

        # List to store images
        images = []

        # Strategy 1: Look for image viewers or specific image containers
        image_containers = [
            soup.find('div', class_='viewer-container'),
            soup.find('div', class_='image-viewer'),
            soup.find('div', id='image-container')
        ]

        # Strategy 2: Find all image tags
        img_tags = soup.find_all('img')

        # Combine image sources
        for img in img_tags:
            # Get image source
            src = img.get('src')
            if not src:
                continue

            # Resolve relative URLs
            full_src = urljoin(url, src)

            # Extract alt text or use filename
            alt = img.get('alt', os.path.basename(urlparse(full_src).path))

            # Create image dictionary
            image_info = {
                'url': full_src,
                'alt': alt,
                'source_page': url
            }

            # Try to add metadata if available
            if metadata:
                try:
                    # Extract relevant metadata from JSON if possible
                    image_info['metadata'] = {
                        'title': metadata.get('data', {}).get('attributes', {}).get('title_info_primary_tsi'),
                        'description': metadata.get('data', {}).get('attributes', {}).get('abstract_tsi'),
                        'subject': metadata.get('data', {}).get('attributes', {}).get('subject_geographic_sim')
                    }
                except Exception as e:
                    self.logger.warning(f"Error extracting metadata: {e}")

            images.append(image_info)

        return images

    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
        """
        Download images to local directory

        :param images: List of image dictionaries
        :param output_dir: Directory to save images
        :return: List of downloaded file paths
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []

        for i, image in enumerate(images):
            try:
                response = requests.get(image['url'], headers=self.headers)
                response.raise_for_status()

                # Generate filename
                ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
                filename = os.path.join(output_dir, f'image_{i}{ext}')

                with open(filename, 'wb') as f:
                    f.write(response.content)

                downloaded_files.append(filename)
                self.logger.info(f"Downloaded: {filename}")

            except Exception as e:
                self.logger.error(f"Error downloading {image['url']}: {e}")

        return downloaded_files
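
Note: even after this commit adds `import requests`, the module as committed still references names it never imports (logging, json, os, BeautifulSoup, urljoin, urlparse, Dict, List), and its methods sit outside any class statement. Below is a minimal header sketch that would make the file importable; the class name ImageScraper is an assumption, since the diff shows no class definition.

# Hypothetical header for image_scraper.py. The imports are implied by the
# method bodies in the diff above; the class name ImageScraper is an
# assumption -- the committed file contains no class statement.
import json
import logging
import os
from typing import Dict, List
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class ImageScraper:
    # ... __init__, fetch_page, extract_json_metadata,
    # extract_images, and download_images exactly as committed above ...
    pass

With that wrapper in place, typical usage would be:

scraper = ImageScraper()
images = scraper.extract_images(scraper.base_url + "/search/some-item")  # hypothetical item URL
downloaded = scraper.download_images(images, output_dir='downloaded_images')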