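"""bpl_scraper.py

Scraper for Digital Commonwealth (https://www.digitalcommonwealth.org):
extracts image URLs and record metadata from item pages and can optionally
download the images locally.
"""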
import requests
from bs4 import BeautifulSoup
import os
import json
import re
from typing import List, Dict, Optional
import logging
from urllib.parse import urljoin, urlparse
class DigitalCommonwealthScraper:
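    """Scrapes image URLs and record metadata from Digital Commonwealth item pages."""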
def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
"""
Initialize the scraper with base URL and logging
:param base_url: Base URL for Digital Commonwealth
"""
self.base_url = base_url
logging.basicConfig(level=logging.INFO)
self.logger = logging.getLogger(__name__)
# Headers to mimic browser request
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
    def fetch_page(self, url: str) -> Optional[requests.Response]:
        """
        Fetch webpage content with error handling
        :param url: URL to fetch
        :return: Response object, or None if the request failed
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None
def extract_json_metadata(self, url: str) -> Dict:
"""
Extract JSON metadata from the page
:param url: URL of the page
:return: Dictionary of metadata
"""
json_url = f"{url}.json"
response = self.fetch_page(json_url)
if response:
try:
return response.json()
except json.JSONDecodeError:
self.logger.error(f"Could not parse JSON from {json_url}")
return {}
return {}
def extract_images(self, url: str) -> List[Dict]:
"""
Extract images from the page
:param url: URL of the page to scrape
:return: List of image dictionaries
"""
# Fetch page content
response = self.fetch_page(url)
if not response:
return []
# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')
# Extract JSON metadata
metadata = self.extract_json_metadata(url)
# List to store images
images = []
        # Strategy 1: look for known image viewers or image containers
        image_containers = [
            container for container in (
                soup.find('div', class_='viewer-container'),
                soup.find('div', class_='image-viewer'),
                soup.find('div', id='image-container'),
            )
            if container
        ]
        # Strategy 2: if a known container exists, search only inside it;
        # otherwise fall back to every <img> tag on the page
        if image_containers:
            img_tags = [img for container in image_containers
                        for img in container.find_all('img')]
        else:
            img_tags = soup.find_all('img')
        for img in img_tags:
# Get image source
src = img.get('src')
if not src:
continue
# Resolve relative URLs
full_src = urljoin(url, src)
# Extract alt text or use filename
alt = img.get('alt', os.path.basename(urlparse(full_src).path))
# Create image dictionary
image_info = {
'url': full_src,
'alt': alt,
'source_page': url
}
            # Attach record-level metadata when the JSON endpoint returned data.
            # The *_tsi / *_sim keys are field names from the site's search index.
            if metadata:
                try:
                    attributes = metadata.get('data', {}).get('attributes', {})
                    image_info['metadata'] = {
                        'title': attributes.get('title_info_primary_tsi'),
                        'description': attributes.get('abstract_tsi'),
                        'subject': attributes.get('subject_geographic_sim')
                    }
                except Exception as e:
                    self.logger.warning(f"Error extracting metadata: {e}")
images.append(image_info)
return images
def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
"""
Download images to local directory
:param images: List of image dictionaries
:param output_dir: Directory to save images
:return: List of downloaded file paths
"""
# Create output directory
os.makedirs(output_dir, exist_ok=True)
downloaded_files = []
for i, image in enumerate(images):
try:
                response = requests.get(image['url'], headers=self.headers, timeout=30)
response.raise_for_status()
# Generate filename
ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
filename = os.path.join(output_dir, f'image_{i}{ext}')
with open(filename, 'wb') as f:
f.write(response.content)
downloaded_files.append(filename)
self.logger.info(f"Downloaded: {filename}")
except Exception as e:
self.logger.error(f"Error downloading {image['url']}: {e}")
return downloaded_files
# Example usage:
#
# if __name__ == "__main__":
#     scraper = DigitalCommonwealthScraper()
#     url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
#
#     # Extract images from the page
#     images = scraper.extract_images(url)
#     for img in images:
#         print(json.dumps(img, indent=2))
#
#     # Optional: download images
#     scraper.download_images(images)