Update app.py
app.py
CHANGED
@@ -1,411 +1,323 @@
-from flask import Flask,
 import time
-import random
-import base64
-from io import BytesIO
-from googlesearch import search
 import logging
-
-# Custom log handler that puts messages in the queue
-class QueueHandler(logging.Handler):
-    def emit(self, record):
-        log_entry = self.format(record)
-        log_queue.put(log_entry)
-
-# Set up logging with the custom handler
-logger = logging.getLogger()
-queue_handler = QueueHandler()
-queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
-logger.addHandler(queue_handler)
-logger.setLevel(logging.INFO)
-
-# Also add the filter to the root logger to catch all socket warnings
-logging.getLogger().addFilter(SocketWarningFilter())

 app = Flask(__name__)
-# Enable CORS with specific settings
-CORS(app, resources={
-    r"/*": {
-        "origins": "*",
-        "methods": ["GET", "POST", "OPTIONS"],
-        "allow_headers": ["Content-Type", "Authorization"]
-    }
-})
-
-def search_images(query, num_images=5):
-    # Headers to mimic a browser request
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    # Format the query for URL
-    formatted_query = urllib.parse.quote(query)
-
-    # Google Images URL
-    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

     try:
-        results = []
-        downloaded = 0
-                break
         try:
-                content_type = img_response.headers.get('Content-Type', '')
-                if not content_type.startswith('image/'):
-                    continue
-
-                # Convert image to base64
-                image_base64 = base64.b64encode(img_response.content).decode('utf-8')
-
-                # Add to results
-                results.append({
-                    'image_url': img_url,
-                    'base64_data': f"data:{content_type};base64,{image_base64}"
                 })
-
-                downloaded += 1
-
-                # Add a random delay between downloads
-                time.sleep(random.uniform(0.5, 1))
-
         except Exception as e:
-            logger.error(f"Error
     except Exception as e:
-        logger.error(f"
-
-HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure you set the HF_TOKEN in your environment
-
-@app.route('/restart_space', methods=['POST'])
-def api_restart_space():
-    """API route to restart a Hugging Face Space."""
-    space_id = 'Pamudu13/web-scraper'
-    factory_reboot = request.json.get('factory_reboot', False)  # Optional: Set to True if you want a factory reboot
-
-    if not space_id:
-        return jsonify({'error': 'space_id parameter is required'}), 400

     try:
     except Exception as e:

     try:
-        # Search for images
-        results = search_images(query, num_images)
-
-        response = jsonify({
-            'success': True,
-            'query': query,
-            'results': results
         })

-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-    except Exception as e:
-        logger.error(f"Error in search_images: {str(e)}")
-        response = jsonify({
-            'success': False,
-            'error': str(e)
-        }), 500
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-def scrape_site_content(query, num_sites=5):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    results = []
-    scraped = 0
-    retries = 2  # Number of retries per URL
-    timeout = 5  # Reduced timeout to 5 seconds
-
-    try:
-        # Get more URLs than needed to account for failures
-        search_results = list(search(query, num_results=num_sites * 2))
-
-        # Process each found URL
-        for url in search_results:
-            if scraped >= num_sites:
-                break
-
-            success = False
-            for attempt in range(retries):
-                try:
-                    # Get the HTML content
-                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
-                    logger.info(f"Scraping URL: {url}")
-                    response = requests.get(
-                        url,
-                        headers=headers,
-                        timeout=timeout,
-                        verify=False  # Skip SSL verification
-                    )
-                    response.raise_for_status()
-
-                    # Verify it's HTML content
-                    content_type = response.headers.get('Content-Type', '').lower()
-                    if 'text/html' not in content_type:
-                        logger.info(f"Skipping {url} - not HTML content")
-                        break
-
-                    # Parse the HTML content
-                    soup = BeautifulSoup(response.text, 'html.parser')
-
-                    # Remove script and style elements
-                    for script in soup(["script", "style"]):
-                        script.decompose()
-
-                    # Extract text content (limit to first 10000 characters)
-                    text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
-                    # Skip if not enough content
-                    if len(text_content.split()) < 100:  # Skip if less than 100 words
-                        logger.info(f"Skipping {url} - not enough content")
-                        break
-
-                    # Extract all links (limit to first 10)
-                    links = []
-                    for link in soup.find_all('a', href=True)[:10]:
-                        href = link['href']
-                        if href.startswith('http'):
-                            links.append({
-                                'text': link.get_text(strip=True),
-                                'url': href
-                            })
-
-                    # Extract meta information
-                    title = soup.title.string if soup.title else ''
-                    meta_description = ''
-                    meta_keywords = ''
-
-                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-                    if meta_desc_tag:
-                        meta_description = meta_desc_tag.get('content', '')
-
-                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-                    if meta_keywords_tag:
-                        meta_keywords = meta_keywords_tag.get('content', '')
-
-                    results.append({
-                        'url': url,
-                        'title': title,
-                        'meta_description': meta_description,
-                        'meta_keywords': meta_keywords,
-                        'text_content': text_content,
-                        'links': links
-                    })
-
-                    scraped += 1
-                    success = True
-                    # Add a random delay between scrapes
-                    time.sleep(random.uniform(0.5, 1))
-                    break  # Break retry loop on success
-
-                except requests.Timeout:
-                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
-                    if attempt == retries - 1:  # Last attempt
-                        print(f"Skipping {url} after {retries} timeout attempts")
-                except requests.RequestException as e:
-                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
-                    if attempt == retries - 1:  # Last attempt
-                        print(f"Skipping {url} after {retries} failed attempts")
-
-                # Add a longer delay between retries
-                if not success and attempt < retries - 1:
-                    time.sleep(random.uniform(1, 2))
-
-            # If we haven't found enough valid content and have more URLs, continue
-            if scraped < num_sites and len(results) < len(search_results):
-                continue
-
-        return results

     except Exception as e:

-def api_scrape_sites():
     try:
-        return jsonify({'error': '

     except Exception as e:
-        logger.error(f"Error
-            'error': str(e)
-        }), 500
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-@app.route('/logs/stream')
-def stream_logs():
-    def generate():
-        while True:
-            try:
-                # Get log message from queue, timeout after 1 second
-                log_message = log_queue.get(timeout=1)
-                yield f"data: {log_message}\n\n"
-            except queue.Empty:
-                # Send a heartbeat to keep the connection alive
-                yield "data: heartbeat\n\n"
-            except GeneratorExit:
-                break
-
-    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
-    response.headers['Cache-Control'] = 'no-cache'
-    response.headers['Connection'] = 'keep-alive'
-    return response

 if __name__ == '__main__':
-    logger.info("Starting
-    app.run(host='0.0.0.0', port=5001,
+from flask import Flask, request, jsonify
+from scrapy import Spider, Request
+from scrapy.crawler import CrawlerRunner
+from scrapy.utils.project import get_project_settings
+from twisted.internet import reactor
+from twisted.internet.defer import inlineCallbacks, returnValue, Deferred
+from urllib.parse import urljoin, urlparse
+import json
+import threading
 import time
 import logging
+import traceback
+from queue import Queue
+from functools import wraps
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger(__name__)

 app = Flask(__name__)

+# Thread-safe queue for results
+result_queue = Queue()
+
+class URLSpider(Spider):
+    name = 'url_spider'
+    found_urls = set()  # Class variable to store all found URLs
+
+    def __init__(self, start_url=None, max_urls=10, *args, **kwargs):
+        super(URLSpider, self).__init__(*args, **kwargs)
+        self.start_urls = [start_url]
+        self.allowed_domain = urlparse(start_url).netloc
+        self.max_urls = max_urls
+        self.url_count = 0
+        logger.info(f"Starting spider for URL: {start_url} with max_urls={max_urls}")
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield Request(url, callback=self.parse, dont_filter=True, errback=self.handle_error)
+
+    def handle_error(self, failure):
+        logger.error(f"Request failed: {failure.value}")
+        return None
+
+    def parse(self, response):
+        try:
+            if self.url_count >= self.max_urls:
+                logger.info(f"Reached maximum URL limit ({self.max_urls}). Stopping crawl.")
+                return
+
+            links = response.css('a::attr(href)').getall()
+            logger.info(f"Found {len(links)} links on {response.url}")
+
+            for link in links:
+                if self.url_count >= self.max_urls:
+                    return
+
+                absolute_url = urljoin(response.url, link)
+                parsed_url = urlparse(absolute_url)
+
+                if parsed_url.netloc == self.allowed_domain and absolute_url not in self.found_urls:
+                    self.found_urls.add(absolute_url)
+                    self.url_count += 1
+                    logger.info(f"Found URL ({self.url_count}/{self.max_urls}): {absolute_url}")
+
+                    if self.url_count < self.max_urls:
+                        logger.info(f"Following link: {absolute_url}")
+                        yield Request(absolute_url, callback=self.parse, errback=self.handle_error)
+        except Exception as e:
+            logger.error(f"Error in parse method: {str(e)}")
+            traceback.print_exc()
+
+def run_spider(url, max_urls):
     try:
+        settings = get_project_settings()
+        settings.update({
+            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'LOG_ENABLED': True,
+            'LOG_LEVEL': 'INFO',
+            'ROBOTSTXT_OBEY': True,
+            'CONCURRENT_REQUESTS': 16,
+            'DOWNLOAD_TIMEOUT': 30,
+            'RETRY_TIMES': 3,
+        })

+        runner = CrawlerRunner(settings)

+        # Create a deferred to store the results
+        results = {'urls': set()}

+        def crawler_callback(result):
             try:
+                # Get URLs from the spider's class variable
+                urls = list(URLSpider.found_urls)
+                logger.info(f"Crawling completed. Found {len(urls)} URLs.")
+
+                # Put the results in the queue
+                result_queue.put({
+                    'status': 'success',
+                    'urls': urls,
+                    'count': len(urls)
                 })
+                return result
             except Exception as e:
+                logger.error(f"Error in crawler_callback: {str(e)}")
+                traceback.print_exc()
+                result_queue.put({
+                    'status': 'error',
+                    'error': str(e),
+                    'urls': []
+                })
+                return result
+
+        # Run the spider
+        logger.info("Starting crawler...")
+        try:
+            deferred = runner.crawl(URLSpider, start_url=url, max_urls=max_urls)
+            deferred.addCallback(crawler_callback)
+            deferred.addErrback(lambda f: result_queue.put({
+                'status': 'error',
+                'error': str(f.value),
+                'urls': []
+            }))
+            return deferred
+        except Exception as e:
+            logger.error(f"Error starting crawler: {str(e)}")
+            traceback.print_exc()
+            result_queue.put({
+                'status': 'error',
+                'error': str(e),
+                'urls': []
+            })
+            return None
     except Exception as e:
+        logger.error(f"Error in run_spider: {str(e)}")
+        traceback.print_exc()
+        result_queue.put({'error': str(e)})

+@app.route('/scrape', methods=['POST'])
+def scrape_url():
     try:
+        data = request.get_json()
+        if not data:
+            logger.error("No JSON data provided in request")
+            return jsonify({'error': 'No JSON data provided'}), 400
+
+        url = data.get('url')
+        max_urls = data.get('max_urls', 50)
+
+        if not url:
+            logger.error("No URL provided in request")
+            return jsonify({'error': 'URL is required'}), 400
+
+        logger.info(f"Received scrape request for URL: {url} with max_urls={max_urls}")
+
+        # Run the spider in the reactor thread
+        reactor.callFromThread(run_spider, url, max_urls)
+
+        # Wait for results with timeout
+        try:
+            result = result_queue.get(timeout=60)
+            if 'error' in result:
+                logger.error(f"Scraping error: {result['error']}")
+                return jsonify({'error': 'Failed to scrape URL', 'details': {'error': result['error']}}), 500
+            return jsonify(result)
+        except Exception as e:
+            logger.error(f"Timeout waiting for results: {str(e)}")
+            return jsonify({'error': 'Scraping timed out'}), 500

     except Exception as e:
+        logger.error(f"Error during scraping: {str(e)}")
+        traceback.print_exc()
+        return jsonify({'error': str(e)}), 500
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    return jsonify({'status': 'ok'})
+
+def run_reactor():
+    reactor.run(installSignalHandlers=False)
+
+# Start reactor in a separate thread when the app starts
+if not reactor.running:
+    reactor_thread = threading.Thread(target=run_reactor, daemon=True)
+    reactor_thread.start()
+
+class ContentSpider(Spider):
+    name = 'content_spider'
+    content_results = {}  # Class variable to store content results
+
+    def __init__(self, urls=None, *args, **kwargs):
+        super(ContentSpider, self).__init__(*args, **kwargs)
+        self.start_urls = urls if urls else []
+        logger.info(f"Starting content spider for {len(self.start_urls)} URLs")
+
+    def parse(self, response):
+        try:
+            # Extract title
+            title = response.css('title::text').get() or ''
+
+            # Extract main content (this is a simple example, adjust selectors as needed)
+            content = ' '.join(response.css('p::text, h1::text, h2::text, h3::text, h4::text, h5::text, h6::text').getall())
+
+            # Store the result
+            self.content_results[response.url] = {
+                'title': title,
+                'content': content[:2000] + '...' if len(content) > 2000 else content,  # Limit content length
+                'status': 'success'
+            }
+
+            logger.info(f"Scraped content from {response.url}")
+        except Exception as e:
+            logger.error(f"Error scraping content from {response.url}: {str(e)}")
+            self.content_results[response.url] = {
+                'title': '',
+                'content': '',
+                'status': 'error',
+                'error': str(e)
+            }
+
+def run_content_spider(urls):
     try:
+        settings = get_project_settings()
+        settings.update({
+            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'LOG_ENABLED': True,
+            'LOG_LEVEL': 'INFO',
+            'ROBOTSTXT_OBEY': True,
+            'CONCURRENT_REQUESTS': 16,
+            'DOWNLOAD_TIMEOUT': 30,
+            'RETRY_TIMES': 3,
         })

+        runner = CrawlerRunner(settings)

+        def content_crawler_callback(result):
+            try:
+                # Get content results from the spider's class variable
+                content_results = ContentSpider.content_results
+                logger.info(f"Content scraping completed for {len(content_results)} URLs.")
+
+                # Put the results in the queue
+                result_queue.put({
+                    'status': 'success',
+                    'results': content_results
+                })
+                return result
+            except Exception as e:
+                logger.error(f"Error in content_crawler_callback: {str(e)}")
+                traceback.print_exc()
+                result_queue.put({
+                    'status': 'error',
+                    'error': str(e),
+                    'results': {}
+                })
+                return result
+
+        # Run the spider
+        logger.info("Starting content crawler...")
+        try:
+            deferred = runner.crawl(ContentSpider, urls=urls)
+            deferred.addCallback(content_crawler_callback)
+            deferred.addErrback(lambda f: result_queue.put({
+                'status': 'error',
+                'error': str(f.value),
+                'results': {}
+            }))
+            return deferred
+        except Exception as e:
+            logger.error(f"Error starting content crawler: {str(e)}")
+            traceback.print_exc()
+            result_queue.put({
+                'status': 'error',
+                'error': str(e),
+                'results': {}
+            })
+            return None
     except Exception as e:
+        logger.error(f"Error in run_content_spider: {str(e)}")
+        traceback.print_exc()
+        result_queue.put({'error': str(e)})

+@app.route('/scrape-content', methods=['POST'])
+def scrape_content():
     try:
+        data = request.get_json()
+        if not data:
+            logger.error("No JSON data provided in request")
+            return jsonify({'error': 'No JSON data provided'}), 400
+
+        urls = data.get('urls', [])
+        if not urls:
+            logger.error("No URLs provided in request")
+            return jsonify({'error': 'URLs are required'}), 400
+
+        logger.info(f"Received content scrape request for {len(urls)} URLs")
+
+        # Run the content spider in the reactor thread
+        reactor.callFromThread(run_content_spider, urls)
+
+        # Wait for results with timeout
+        try:
+            result = result_queue.get(timeout=60)
+            if 'error' in result:
+                logger.error(f"Content scraping error: {result['error']}")
+                return jsonify({'error': 'Failed to scrape content', 'details': {'error': result['error']}}), 500
+            return jsonify(result)
+        except Exception as e:
+            logger.error(f"Timeout waiting for content results: {str(e)}")
+            return jsonify({'error': 'Content scraping timed out'}), 500

     except Exception as e:
+        logger.error(f"Error during content scraping: {str(e)}")
+        traceback.print_exc()
+        return jsonify({'error': str(e)}), 500

 if __name__ == '__main__':
+    logger.info("Starting URL Scraper API on port 5001")
+    app.run(host='0.0.0.0', port=5001, threaded=True, use_reloader=False)
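
A minimal client sketch for exercising the new endpoints, assuming the service is reachable at http://localhost:5001 (the host/port from app.run above) and that the `requests` package is available on the client side; the target URL is a placeholder, not something from this Space:

# client_example.py - hedged sketch, not part of the Space itself
import requests

BASE = 'http://localhost:5001'  # assumption: default host/port from app.run

# Liveness check against the /health route
print(requests.get(f'{BASE}/health').json())

# Crawl up to 10 same-domain URLs starting from a placeholder site (/scrape route)
resp = requests.post(f'{BASE}/scrape', json={'url': 'https://example.com', 'max_urls': 10})
urls = resp.json().get('urls', [])
print(f"Found {len(urls)} URLs")

# Fetch title and text content for the discovered URLs (/scrape-content route)
resp = requests.post(f'{BASE}/scrape-content', json={'urls': urls})
for url, item in resp.json().get('results', {}).items():
    print(url, item.get('title', ''))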