Spaces:

spark-ds549
/

LibRAG

Sleeping

App Files Files Community

Dan Foley committed on Dec 13, 2024

Commit

b842841

unverified ·

1 Parent(s): 1e0a968

Delete bpl_scraper.py

Browse files

Files changed (1) hide show

bpl_scraper.py +0 -177

bpl_scraper.py DELETED Viewed

@@ -1,177 +0,0 @@
-import json
-import logging
-import os
-import re
-from typing import Dict, List, Optional
-from urllib.parse import urljoin, urlparse
-import requests
-from bs4 import BeautifulSoup
-class DigitalCommonwealthScraper:
-    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
-        """
-        Initialize the scraper with base URL and logging
-        :param base_url: Base URL for Digital Commonwealth
-        """
-        self.base_url = base_url
-        logging.basicConfig(level=logging.INFO)
-        self.logger = logging.getLogger(__name__)
-        # Headers to mimic browser request
-        self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        }
-    def fetch_page(self, url: str) -> requests.Response:
-        """
-        Fetch webpage content with error handling
-        :param url: URL to fetch
-        :return: Response object
-        """
-        try:
-            response = requests.get(url, headers=self.headers)
-            response.raise_for_status()
-            return response
-        except requests.RequestException as e:
-            self.logger.error(f"Error fetching {url}: {e}")
-            return None
-    def extract_json_metadata(self, url: str) -> Dict:
-        """
-        Extract JSON metadata from the page
-        :param url: URL of the page
-        :return: Dictionary of metadata
-        """
-        json_url = f"{url}.json"
-        response = self.fetch_page(json_url)
-        if response:
-            try:
-                return response.json()
-            except json.JSONDecodeError:
-                self.logger.error(f"Could not parse JSON from {json_url}")
-                return {}
-        return {}
-    def extract_images(self, url: str) -> List[Dict]:
-        """
-        Extract images from the page
-        :param url: URL of the page to scrape
-        :return: List of image dictionaries
-        """
-        # Fetch page content
-        response = self.fetch_page(url)
-        if not response:
-            return []
-        # Parse HTML
-        soup = BeautifulSoup(response.text, 'html.parser')
-        # Extract JSON metadata
-        metadata = self.extract_json_metadata(url)
-        # List to store images
-        images = []
-        # Strategy 1: Look for image viewers or specific image containers
-        image_containers = [
-            soup.find('div', class_='viewer-container'),
-            soup.find('div', class_='image-viewer'),
-            soup.find('div', id='image-container')
-        ]
-        # Strategy 2: Find all image tags
-        img_tags = soup.find_all('img')
-        # Combine image sources
-        for img in img_tags:
-            # Get image source
-            src = img.get('src')
-            if not src:
-                continue
-            # Resolve relative URLs
-            full_src = urljoin(url, src)
-            # Extract alt text or use filename
-            alt = img.get('alt', os.path.basename(urlparse(full_src).path))
-            # Create image dictionary
-            image_info = {
-                'url': full_src,
-                'alt': alt,
-                'source_page': url
-            }
-            # Try to add metadata if available
-            if metadata:
-                try:
-                    # Extract relevant metadata from JSON if possible
-                    image_info['metadata'] = {
-                        'title': metadata.get('data', {}).get('attributes', {}).get('title_info_primary_tsi'),
-                        'description': metadata.get('data', {}).get('attributes', {}).get('abstract_tsi'),
-                        'subject': metadata.get('data', {}).get('attributes', {}).get('subject_geographic_sim')
-                    }
-                except Exception as e:
-                    self.logger.warning(f"Error extracting metadata: {e}")
-            images.append(image_info)
-        return images
-    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
-        """
-        Download images to local directory
-        :param images: List of image dictionaries
-        :param output_dir: Directory to save images
-        :return: List of downloaded file paths
-        """
-        # Create output directory
-        os.makedirs(output_dir, exist_ok=True)
-        downloaded_files = []
-        for i, image in enumerate(images):
-            try:
-                response = requests.get(image['url'], headers=self.headers)
-                response.raise_for_status()
-                # Generate filename
-                ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
-                filename = os.path.join(output_dir, f'image_{i}{ext}')
-                with open(filename, 'wb') as f:
-                    f.write(response.content)
-                downloaded_files.append(filename)
-                self.logger.info(f"Downloaded: {filename}")
-            except Exception as e:
-                self.logger.error(f"Error downloading {image['url']}: {e}")
-        return downloaded_files
-#def main():
-    # Example usage
- #   scraper = DigitalCommonwealthScraper()
-  #
-    # Example URL from input
-   # url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
-    # Extract images
-    #images = scraper.extract_images(url)
-    # Print image information
-    #for img in images:
-     #   print(json.dumps(img, indent=2))
-    # Optional: Download images
-    #scraper.download_images(images)
-#if __name__ == "__main__":
- #   main()