bmv2021 committed
Commit 29b3738 · 1 Parent(s): cf15415

added image display capabilities

Files changed (1)
  1. bpl_scraper.py +177 -0
bpl_scraper.py ADDED
@@ -0,0 +1,177 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+ import json
+ from typing import List, Dict, Optional
+ import logging
+ from urllib.parse import urljoin, urlparse
+
+ class DigitalCommonwealthScraper:
+     def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
+         """
+         Initialize the scraper with base URL and logging.
+
+         :param base_url: Base URL for Digital Commonwealth
+         """
+         self.base_url = base_url
+         logging.basicConfig(level=logging.INFO)
+         self.logger = logging.getLogger(__name__)
+
+         # Headers to mimic a browser request
+         self.headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         }
+
+     def fetch_page(self, url: str) -> Optional[requests.Response]:
+         """
+         Fetch webpage content with error handling.
+
+         :param url: URL to fetch
+         :return: Response object, or None if the request failed
+         """
+         try:
+             response = requests.get(url, headers=self.headers, timeout=30)
+             response.raise_for_status()
+             return response
+         except requests.RequestException as e:
+             self.logger.error(f"Error fetching {url}: {e}")
+             return None
+
+     def extract_json_metadata(self, url: str) -> Dict:
+         """
+         Extract JSON metadata from the page.
+
+         :param url: URL of the page
+         :return: Dictionary of metadata (empty if unavailable)
+         """
+         # The site exposes a JSON representation at <page URL>.json
+         json_url = f"{url}.json"
+         response = self.fetch_page(json_url)
+
+         if response:
+             try:
+                 return response.json()
+             except json.JSONDecodeError:
+                 self.logger.error(f"Could not parse JSON from {json_url}")
+                 return {}
+         return {}
+
+     def extract_images(self, url: str) -> List[Dict]:
+         """
+         Extract images from the page.
+
+         :param url: URL of the page to scrape
+         :return: List of image dictionaries
+         """
+         # Fetch page content
+         response = self.fetch_page(url)
+         if not response:
+             return []
+
+         # Parse HTML
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Extract JSON metadata
+         metadata = self.extract_json_metadata(url)
+
+         # List to store images
+         images = []
+
+         # Strategy 1: Look for image viewers or specific image containers
+         image_containers = [
+             soup.find('div', class_='viewer-container'),
+             soup.find('div', class_='image-viewer'),
+             soup.find('div', id='image-container')
+         ]
+
+         # Strategy 2: Search within the first matching container if one
+         # exists; otherwise fall back to every image tag on the page
+         container = next((c for c in image_containers if c is not None), None)
+         img_tags = (container or soup).find_all('img')
+
+         for img in img_tags:
+             # Get image source
+             src = img.get('src')
+             if not src:
+                 continue
+
+             # Resolve relative URLs
+             full_src = urljoin(url, src)
+
+             # Extract alt text or use filename
+             alt = img.get('alt', os.path.basename(urlparse(full_src).path))
+
+             # Create image dictionary
+             image_info = {
+                 'url': full_src,
+                 'alt': alt,
+                 'source_page': url
+             }
+
+             # Try to add metadata if available
+             if metadata:
+                 try:
+                     # Extract relevant metadata from JSON if possible
+                     image_info['metadata'] = {
+                         'title': metadata.get('data', {}).get('attributes', {}).get('title_info_primary_tsi'),
+                         'description': metadata.get('data', {}).get('attributes', {}).get('abstract_tsi'),
+                         'subject': metadata.get('data', {}).get('attributes', {}).get('subject_geographic_sim')
+                     }
+                 except Exception as e:
+                     self.logger.warning(f"Error extracting metadata: {e}")
+
+             images.append(image_info)
+
+         return images
+
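+     # Note: the metadata getters in extract_images assume a JSON shape
+     # roughly like the following (inferred from the keys used above,
+     # not verified against the Digital Commonwealth API):
+     #
+     #   {"data": {"attributes": {"title_info_primary_tsi": "...",
+     #                            "abstract_tsi": "...",
+     #                            "subject_geographic_sim": ["..."]}}}
+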
+     def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
+         """
+         Download images to a local directory.
+
+         :param images: List of image dictionaries
+         :param output_dir: Directory to save images
+         :return: List of downloaded file paths
+         """
+         # Create output directory
+         os.makedirs(output_dir, exist_ok=True)
+
+         downloaded_files = []
+
+         for i, image in enumerate(images):
+             try:
+                 response = requests.get(image['url'], headers=self.headers, timeout=30)
+                 response.raise_for_status()
+
+                 # Generate filename, defaulting to .jpg when the URL has no extension
+                 ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
+                 filename = os.path.join(output_dir, f'image_{i}{ext}')
+
+                 with open(filename, 'wb') as f:
+                     f.write(response.content)
+
+                 downloaded_files.append(filename)
+                 self.logger.info(f"Downloaded: {filename}")
+
+             except Exception as e:
+                 self.logger.error(f"Error downloading {image['url']}: {e}")
+
+         return downloaded_files
+
+ # Example usage (commented out):
+ #
+ # def main():
+ #     scraper = DigitalCommonwealthScraper()
+ #
+ #     # Example URL from input
+ #     url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
+ #
+ #     # Extract images
+ #     images = scraper.extract_images(url)
+ #
+ #     # Print image information
+ #     for img in images:
+ #         print(json.dumps(img, indent=2))
+ #
+ #     # Optional: Download images
+ #     scraper.download_images(images)
+ #
+ # if __name__ == "__main__":
+ #     main()
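For reference, a minimal sketch of how the new scraper might be wired into an app to display the scraped images, which seems to be the intent behind the commit message. The Streamlit front end here is an assumption for illustration; this commit only adds bpl_scraper.py itself.

# Hypothetical display layer (not part of this commit); assumes
# bpl_scraper.py is importable from the app's working directory.
import streamlit as st
from bpl_scraper import DigitalCommonwealthScraper

url = st.text_input("Digital Commonwealth item URL")
if url:
    scraper = DigitalCommonwealthScraper()
    images = scraper.extract_images(url)
    if not images:
        st.warning("No images found on that page.")
    for img in images:
        # st.image accepts a remote URL directly; alt text doubles as caption
        st.image(img['url'], caption=img.get('alt', ''))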