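"""bpl_scraper.py

Scraper for Digital Commonwealth (https://www.digitalcommonwealth.org):
extracts image URLs and record metadata from item pages and can optionally
download the images locally.
"""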
import requests
from bs4 import BeautifulSoup
import os
import json
import re
from typing import List, Dict, Optional
import logging
from urllib.parse import urljoin, urlparse
class DigitalCommonwealthScraper:
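    """Scrapes image URLs and record metadata from Digital Commonwealth item pages."""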
def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
"""
Initialize the scraper with base URL and logging
:param base_url: Base URL for Digital Commonwealth
"""
self.base_url = base_url
logging.basicConfig(level=logging.INFO)
self.logger = logging.getLogger(__name__)
# Headers to mimic browser request
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
    def fetch_page(self, url: str) -> Optional[requests.Response]:
        """
        Fetch webpage content with error handling
        :param url: URL to fetch
        :return: Response object, or None if the request failed
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None
def extract_json_metadata(self, url: str) -> Dict:
"""
Extract JSON metadata from the page
:param url: URL of the page
:return: Dictionary of metadata
"""
json_url = f"{url}.json"
response = self.fetch_page(json_url)
if response:
try:
return response.json()
except json.JSONDecodeError:
self.logger.error(f"Could not parse JSON from {json_url}")
return {}
return {}
def extract_images(self, url: str) -> List[Dict]:
"""
Extract images from the page
:param url: URL of the page to scrape
:return: List of image dictionaries
"""
# Fetch page content
response = self.fetch_page(url)
if not response:
return []
# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')
# Extract JSON metadata
metadata = self.extract_json_metadata(url)
# List to store images
images = []
        # Strategy 1: look for known image viewers or image containers
        image_containers = [
            container for container in (
                soup.find('div', class_='viewer-container'),
                soup.find('div', class_='image-viewer'),
                soup.find('div', id='image-container'),
            )
            if container
        ]
        # Strategy 2: if a known container exists, search only inside it;
        # otherwise fall back to every <img> tag on the page
        if image_containers:
            img_tags = [img for container in image_containers
                        for img in container.find_all('img')]
        else:
            img_tags = soup.find_all('img')
        for img in img_tags:
# Get image source
src = img.get('src')
if not src:
continue
# Resolve relative URLs
full_src = urljoin(url, src)
# Extract alt text or use filename
alt = img.get('alt', os.path.basename(urlparse(full_src).path))
# Create image dictionary
image_info = {
'url': full_src,
'alt': alt,
'source_page': url
}
            # Attach record-level metadata when the JSON endpoint returned data.
            # The *_tsi / *_sim keys are field names from the site's search index.
            if metadata:
                try:
                    attributes = metadata.get('data', {}).get('attributes', {})
                    image_info['metadata'] = {
                        'title': attributes.get('title_info_primary_tsi'),
                        'description': attributes.get('abstract_tsi'),
                        'subject': attributes.get('subject_geographic_sim')
                    }
                except Exception as e:
                    self.logger.warning(f"Error extracting metadata: {e}")
images.append(image_info)
return images
def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
"""
Download images to local directory
:param images: List of image dictionaries
:param output_dir: Directory to save images
:return: List of downloaded file paths
"""
# Create output directory
os.makedirs(output_dir, exist_ok=True)
downloaded_files = []
for i, image in enumerate(images):
try:
                response = requests.get(image['url'], headers=self.headers, timeout=30)
response.raise_for_status()
# Generate filename
ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
filename = os.path.join(output_dir, f'image_{i}{ext}')
with open(filename, 'wb') as f:
f.write(response.content)
downloaded_files.append(filename)
self.logger.info(f"Downloaded: {filename}")
except Exception as e:
self.logger.error(f"Error downloading {image['url']}: {e}")
return downloaded_files
# Example usage:
#
# if __name__ == "__main__":
#     scraper = DigitalCommonwealthScraper()
#     url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
#
#     # Extract images from the page
#     images = scraper.extract_images(url)
#     for img in images:
#         print(json.dumps(img, indent=2))
#
#     # Optional: download images
#     scraper.download_images(images)