Update app.py
app.py CHANGED
@@ -1,4 +1,5 @@
-from flask import Flask, jsonify, request
+from flask import Flask, jsonify, request, Response, stream_with_context
+from flask_cors import CORS
 import requests
 from bs4 import BeautifulSoup
 import os
@@ -9,8 +10,43 @@ import random
 import base64
 from io import BytesIO
 from googlesearch import search
+import logging
+import queue
+
+# Create a logging filter to suppress socket warnings
+class SocketWarningFilter(logging.Filter):
+    def filter(self, record):
+        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())
+
+# Create a queue for log messages
+log_queue = queue.Queue()
+
+# Custom log handler that puts messages in the queue
+class QueueHandler(logging.Handler):
+    def emit(self, record):
+        log_entry = self.format(record)
+        log_queue.put(log_entry)
+
+# Set up logging with the custom handler
+logger = logging.getLogger()
+queue_handler = QueueHandler()
+queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
+logger.addHandler(queue_handler)
+logger.setLevel(logging.INFO)
+
+# Also add the filter to the root logger to catch all socket warnings
+logging.getLogger().addFilter(SocketWarningFilter())
 
 app = Flask(__name__)
+# Enable CORS with specific settings
+CORS(app, resources={
+    r"/*": {
+        "origins": "*",
+        "methods": ["GET", "POST", "OPTIONS"],
+        "allow_headers": ["Content-Type", "Authorization"]
+    }
+})
 
 def search_images(query, num_images=5):
     # Headers to mimic a browser request
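
The logging block above routes every record through an in-memory queue so it can later be replayed to clients over the new /logs/stream endpoint. A minimal standalone sketch of that queue-backed handler (illustration only, not part of the commit):

    import logging
    import queue

    log_queue = queue.Queue()

    class QueueHandler(logging.Handler):
        def emit(self, record):
            # Format the record and park it in the queue for later consumption
            log_queue.put(self.format(record))

    logger = logging.getLogger("demo")
    handler = QueueHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    logger.info("hello")
    print(log_queue.get_nowait())  # prints the formatted "hello" log line

Every logger.info/warning/error call in the app ends up as a formatted string in log_queue, which is exactly what the streaming endpoint drains.
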
@@ -77,13 +113,13 @@ def search_images(query, num_images=5):
                     time.sleep(random.uniform(0.5, 1))
 
                 except Exception as e:
-
+                    logger.error(f"Error downloading image: {str(e)}")
                     continue
 
         return results
 
     except Exception as e:
-
+        logger.error(f"An error occurred: {str(e)}")
        return []
 
 @app.route('/search_images', methods=['GET'])
@@ -102,18 +138,27 @@ def api_search_images():
         # Search for images
         results = search_images(query, num_images)
 
-
+        response = jsonify({
             'success': True,
             'query': query,
             'results': results
         })
 
+        # Add CORS headers
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        return response
+
     except Exception as e:
-
+        logger.error(f"Error in search_images: {str(e)}")
+        response = jsonify({
             'success': False,
             'error': str(e)
         }), 500
 
+        # Add CORS headers
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        return response
+
 def scrape_site_content(query, num_sites=5):
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
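
A caveat with the error branch above (and the identical one in api_scrape_sites further down): "response = jsonify({...}), 500" binds a (response, status) tuple to response, so the following response.headers[...] access would raise an AttributeError whenever that path runs. One possible rearrangement of the except block (my sketch, not what the commit does):

    except Exception as e:
        logger.error(f"Error in search_images: {str(e)}")
        response = jsonify({'success': False, 'error': str(e)})
        # Set the CORS header on the Response object, then return it with the status code
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500  # Flask accepts a (response, status) tuple as a view return value
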
@@ -126,10 +171,38 @@ def scrape_site_content(query, num_sites=5):
 
     results = []
     scraped = 0
+    max_retries = 3
+    base_delay = 5  # Base delay in seconds
 
     try:
-
-
+        logger.info(f"Starting to scrape content for query: {query}")
+
+        # Add initial delay before starting searches
+        initial_delay = random.uniform(2, 4)
+        logger.info(f"Initial delay of {initial_delay:.2f} seconds before starting searches...")
+        time.sleep(initial_delay)
+
+        # Use googlesearch-python to get URLs with retry logic
+        search_results = []
+        retry_count = 0
+
+        while retry_count < max_retries:
+            try:
+                search_results = list(search(query, num_results=num_sites))
+                break
+            except Exception as e:
+                retry_count += 1
+                if "429" in str(e):
+                    delay = base_delay * (2 ** retry_count)  # Exponential backoff
+                    logger.warning(f"Rate limited by Google. Waiting {delay} seconds before retry {retry_count}/{max_retries}")
+                    time.sleep(delay)
+                else:
+                    logger.error(f"Error during search (attempt {retry_count}/{max_retries}): {str(e)}")
+                    if retry_count == max_retries:
+                        raise
+                    time.sleep(base_delay)
+
+        logger.info(f"Found {len(search_results)} URLs to scrape for query: {query}")
 
         # Process each found URL
         for url in search_results:
@@ -137,17 +210,43 @@ def scrape_site_content(query, num_sites=5):
                 break
 
             try:
-
-
-
+                logger.info(f"Attempting to scrape URL: {url}")
+
+                # Add random delay before each request
+                delay = random.uniform(1, 3)
+                logger.info(f"Waiting {delay:.2f} seconds before request...")
+                time.sleep(delay)
+
+                # Get the HTML content with retry logic
+                retry_count = 0
+                while retry_count < max_retries:
+                    try:
+                        response = requests.get(url, headers=headers, timeout=10)
+                        response.raise_for_status()
+                        break
+                    except requests.exceptions.RequestException as e:
+                        retry_count += 1
+                        if "429" in str(e):
+                            delay = base_delay * (2 ** retry_count)
+                            logger.warning(f"Rate limited. Waiting {delay} seconds before retry {retry_count}/{max_retries}")
+                            time.sleep(delay)
+                        else:
+                            logger.error(f"Request failed (attempt {retry_count}/{max_retries}): {str(e)}")
+                            if retry_count == max_retries:
+                                raise
+                            time.sleep(base_delay)
+
+                logger.info(f"Successfully retrieved content from: {url}")
 
                 # Verify it's HTML content
                 content_type = response.headers.get('Content-Type', '').lower()
                 if 'text/html' not in content_type:
+                    logger.info(f"Skipping {url} - not HTML content (Content-Type: {content_type})")
                     continue
 
                 # Parse the HTML content
                 soup = BeautifulSoup(response.text, 'html.parser')
+                logger.info(f"Successfully parsed HTML from: {url}")
 
                 # Remove script and style elements
                 for script in soup(["script", "style"]):
@@ -155,6 +254,7 @@
 
                 # Extract text content (limit to first 1000 characters)
                 text_content = soup.get_text(separator='\n', strip=True)[:10000]
+                logger.info(f"Extracted {len(text_content)} characters of text from: {url}")
 
                 # Extract all links (limit to first 10)
                 links = []
@@ -165,6 +265,7 @@
                         'text': link.get_text(strip=True),
                         'url': href
                     })
+                logger.info(f"Found {len(links)} valid links on: {url}")
 
                 # Extract meta information
                 title = soup.title.string if soup.title else ''
@@ -189,16 +290,24 @@
                 })
 
                 scraped += 1
-
-                time.sleep(random.uniform(0.5, 1))
+                logger.info(f"Successfully scraped {scraped}/{num_sites} sites. Current URL: {url}")
 
+                # Add a random delay between successful scrapes
+                delay = random.uniform(2, 4)
+                logger.info(f"Waiting {delay:.2f} seconds before next scrape...")
+                time.sleep(delay)
+
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Request failed for URL {url}: {str(e)}")
+                continue
             except Exception as e:
-
+                logger.error(f"Error scraping {url}: {str(e)}")
                 continue
 
     except Exception as e:
-
+        logger.error(f"Error in search: {str(e)}")
 
+    logger.info(f"Completed scraping. Successfully scraped {len(results)} out of {num_sites} sites")
     return results
 
 @app.route('/scrape_sites', methods=['GET'])
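
Both retry loops above (the Google search and the per-URL fetch) follow the same shape: exponential backoff when the error message contains "429", a flat base delay otherwise, and a re-raise once max_retries is exhausted. For illustration, the pattern could be factored into a helper along these lines (the helper and its name are hypothetical, not part of the commit):

    import time
    import logging

    logger = logging.getLogger(__name__)

    def with_retries(fn, max_retries=3, base_delay=5):
        # Call fn(); retry on failure, backing off exponentially after rate-limit errors
        for attempt in range(1, max_retries + 1):
            try:
                return fn()
            except Exception as e:
                if attempt == max_retries:
                    raise
                delay = base_delay * (2 ** attempt) if "429" in str(e) else base_delay
                logger.warning(f"Attempt {attempt}/{max_retries} failed ({e}); retrying in {delay}s")
                time.sleep(delay)

    # Hypothetical usage:
    # search_results = with_retries(lambda: list(search(query, num_results=num_sites)))
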
@@ -217,24 +326,46 @@ def api_scrape_sites():
         # Scrape the websites
         results = scrape_site_content(query, num_sites)
 
-
+        response = jsonify({
             'success': True,
             'query': query,
             'results': results
         })
 
+        # Add CORS headers
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        return response
+
     except Exception as e:
-
+        logger.error(f"Error in api_scrape_sites: {str(e)}")
+        response = jsonify({
             'success': False,
             'error': str(e)
         }), 500
-
-
-if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=5000)
-
-
 
+        # Add CORS headers
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        return response
 
+@app.route('/logs/stream')
+def stream_logs():
+    def generate():
+        while True:
+            try:
+                # Get log message from queue, timeout after 1 second
+                log_message = log_queue.get(timeout=1)
+                yield f"data: {log_message}\n\n"
+            except queue.Empty:
+                # Send a heartbeat to keep the connection alive
+                yield "data: heartbeat\n\n"
+            except GeneratorExit:
+                break
+
+    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
+    response.headers['Cache-Control'] = 'no-cache'
+    response.headers['Connection'] = 'keep-alive'
+    return response
 
+if __name__ == '__main__':
+    logger.info("Starting Flask API server...")
+    app.run(host='0.0.0.0', port=5001, debug=True)
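
The new /logs/stream route serves the queued log records as Server-Sent Events: one "data:" line per record, plus a heartbeat whenever the queue has been empty for a second. A rough client sketch for consuming it (assumes the server is running locally on the port 5001 configured above):

    import requests

    # Stream formatted log lines from the running app
    with requests.get("http://localhost:5001/logs/stream", stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if line and line.startswith("data: ") and line != "data: heartbeat":
                print(line[len("data: "):])

A browser client would typically use EventSource("http://localhost:5001/logs/stream") instead, which is why the CORS configuration and the keep-alive heartbeat above matter.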