from flask import Flask, jsonify, request, Response, stream_with_context
from flask_cors import CORS
import requests
from bs4 import BeautifulSoup
import os
import re
import urllib.parse
import time
import random
import base64
from io import BytesIO
from googlesearch import search
import logging
import queue
from huggingface_hub import HfApi

# Create a logging filter to suppress socket warnings
class SocketWarningFilter(logging.Filter):
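    """Logging filter that drops WARNING records mentioning socket.send()."""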
    def filter(self, record):
        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())

# Create a queue for log messages
log_queue = queue.Queue()

# Custom log handler that puts messages in the queue
class QueueHandler(logging.Handler):
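    """Logging handler that pushes formatted records onto log_queue for streaming."""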
    def emit(self, record):
        log_entry = self.format(record)
        log_queue.put(log_entry)

# Set up logging with the custom handler
logger = logging.getLogger()
queue_handler = QueueHandler()
queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
logger.addHandler(queue_handler)
logger.setLevel(logging.INFO)

# Also add the filter to the root logger to catch all socket warnings
logging.getLogger().addFilter(SocketWarningFilter())

app = Flask(__name__)
# Enable CORS with specific settings
CORS(app, resources={
    r"/*": {
        "origins": "*",
        "methods": ["GET", "POST", "OPTIONS"],
        "allow_headers": ["Content-Type", "Authorization"]
    }
})

def search_images(query, num_images=5):
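    """Search Google Images for `query` and return up to `num_images` results.

    Each result is a dict containing the original image URL and the image
    encoded as a base64 data URI.
    """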
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for URL
    formatted_query = urllib.parse.quote(query)

    # Google Images URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Get the HTML content
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Find all image URLs using regex
        image_urls = re.findall(r'https?://[^"\']+?\.(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        # Store results
        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip small thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check if the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Convert image to base64
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                # Add to results
                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })

                downloaded += 1

                # Add a random delay between downloads
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                logger.error(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        return []



HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure you set the HF_TOKEN in your environment

@app.route('/restart_space', methods=['POST'])
def api_restart_space():
    """API route to restart a Hugging Face Space."""
    space_id = 'Pamudu13/web-scraper'
    payload = request.get_json(silent=True) or {}  # Tolerate requests without a JSON body
    factory_reboot = payload.get('factory_reboot', False)  # Optional: set to True for a factory reboot

    if not space_id:
        return jsonify({'error': 'space_id parameter is required'}), 400

    try:
        hfapi = HfApi()

        # Call the restart_space method
        res = hfapi.restart_space(
            space_id,
            token=HF_TOKEN,
            factory_reboot=factory_reboot
        )

        return jsonify({
            'success': True,
            'message': f"Successfully restarted Space: {space_id}",
            'response': res
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500    

@app.route('/get_live_space_status', methods=['GET'])
def get_live_space_status():
    """API route to stream live status of a Hugging Face Space."""
    space_id = request.args.get('space_id', 'Pamudu13/web-scraper')  # Default to 'Pamudu13/web-scraper' if not provided

    def generate():
        while True:
            try:
                # Fetch the current runtime status of the Space
                hfapi = HfApi(token=HF_TOKEN)
                space_runtime = hfapi.get_space_runtime(repo_id=space_id)
                
                # Extract relevant details
                status = space_runtime.stage  # e.g., 'BUILDING', 'RUNNING', etc.
                hardware = space_runtime.hardware  # e.g., 'cpu-basic', 't4-medium', etc.
                
                # Send the status as a Server-Sent Event
                yield f"data: {status}\n\n"
                yield f"data: {hardware}\n\n"

                # Delay before checking the status again
                time.sleep(5)  # Adjust polling interval as needed

            except Exception as e:
                # Handle errors and send an error message
                yield f"data: Error: {str(e)}\n\n"
                break  # Stop the stream in case of an error

    return Response(stream_with_context(generate()), mimetype='text/event-stream')




@app.route('/search_images', methods=['GET'])
def api_search_images():
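    """Handle GET /search_images: validate parameters and return search_images() results as JSON."""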
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        response = jsonify({
            'success': True,
            'query': query,
            'results': results
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response

    except Exception as e:
        logger.error(f"Error in search_images: {str(e)}")
        response = jsonify({
            'success': False,
            'error': str(e)
        })

        # Add CORS headers and return a 500 status
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500

def scrape_site_content(query, num_sites=5):
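    """Google-search `query` and scrape up to `num_sites` of the result pages.

    For each page this returns its URL, title, meta description/keywords, the
    visible text (truncated to 10,000 characters), and up to ten outbound links.
    """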
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    results = []
    scraped = 0
    retries = 2  # Number of retries per URL
    timeout = 5  # Reduced timeout to 5 seconds

    try:
        # Get more URLs than needed to account for failures
        search_results = list(search(query, num_results=num_sites * 2))

        # Process each found URL
        for url in search_results:
            if scraped >= num_sites:
                break

            success = False
            for attempt in range(retries):
                try:
                    # Get the HTML content
                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
                    logger.info(f"Scraping URL: {url}")
                    response = requests.get(
                        url,
                        headers=headers,
                        timeout=timeout,
                        verify=False  # Skip SSL verification
                    )
                    response.raise_for_status()

                    # Verify it's HTML content
                    content_type = response.headers.get('Content-Type', '').lower()
                    if 'text/html' not in content_type:
                        logger.info(f"Skipping {url} - not HTML content")
                        break

                    # Parse the HTML content
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Remove script and style elements
                    for script in soup(["script", "style"]):
                        script.decompose()

                    # Extract text content (limit to first 10000 characters)
                    text_content = soup.get_text(separator='\n', strip=True)[:10000]

                    # Skip if not enough content
                    if len(text_content.split()) < 100:  # Skip if less than 100 words
                        logger.info(f"Skipping {url} - not enough content")
                        break

                    # Extract all links (limit to first 10)
                    links = []
                    for link in soup.find_all('a', href=True)[:10]:
                        href = link['href']
                        if href.startswith('http'):
                            links.append({
                                'text': link.get_text(strip=True),
                                'url': href
                            })

                    # Extract meta information
                    title = soup.title.string if soup.title else ''
                    meta_description = ''
                    meta_keywords = ''

                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                    if meta_desc_tag:
                        meta_description = meta_desc_tag.get('content', '')

                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
                    if meta_keywords_tag:
                        meta_keywords = meta_keywords_tag.get('content', '')

                    results.append({
                        'url': url,
                        'title': title,
                        'meta_description': meta_description,
                        'meta_keywords': meta_keywords,
                        'text_content': text_content,
                        'links': links
                    })

                    scraped += 1
                    success = True
                    # Add a random delay between scrapes
                    time.sleep(random.uniform(0.5, 1))
                    break  # Break retry loop on success

                except requests.Timeout:
                    logger.warning(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
                    if attempt == retries - 1:  # Last attempt
                        logger.warning(f"Skipping {url} after {retries} timeout attempts")
                except requests.RequestException as e:
                    logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
                    if attempt == retries - 1:  # Last attempt
                        logger.warning(f"Skipping {url} after {retries} failed attempts")

                # Add a longer delay between retries
                if not success and attempt < retries - 1:
                    time.sleep(random.uniform(1, 2))

            # The loop moves on to the next URL until enough sites have been scraped

        return results

    except Exception as e:
        print(f"Error in search/scraping process: {str(e)}")
        # Return whatever results we've managed to gather
        return results


@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
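    """Handle GET /scrape_sites: validate parameters and return scrape_site_content() results as JSON."""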
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_sites = int(request.args.get('num_sites', 10))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_sites < 1 or num_sites > 20:
            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400

        # Scrape the websites
        results = scrape_site_content(query, num_sites)

        response = jsonify({
            'success': True,
            'query': query,
            'results': results
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response

    except Exception as e:
        logger.error(f"Error in api_scrape_sites: {str(e)}")
        response = jsonify({
            'success': False,
            'error': str(e)
        })

        # Add CORS headers and return a 500 status
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500

@app.route('/logs/stream')
def stream_logs():
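    """Stream log messages from log_queue to the client as Server-Sent Events."""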
    def generate():
        while True:
            try:
                # Get log message from queue, timeout after 1 second
                log_message = log_queue.get(timeout=1)
                yield f"data: {log_message}\n\n"
            except queue.Empty:
                # Send a heartbeat to keep the connection alive
                yield "data: heartbeat\n\n"
            except GeneratorExit:
                break

    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
    response.headers['Cache-Control'] = 'no-cache'
    response.headers['Connection'] = 'keep-alive'
    return response

if __name__ == '__main__':
    logger.info("Starting Flask API server...")
    app.run(host='0.0.0.0', port=5001, debug=True)
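
# Example requests once the server is running (illustrative values; host and
# port assume the default app.run() settings above):
#
#   curl "http://localhost:5001/search_images?query=sunsets&num_images=3"
#   curl "http://localhost:5001/scrape_sites?query=flask+tutorial&num_sites=5"
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"factory_reboot": false}' http://localhost:5001/restart_space
#   curl -N "http://localhost:5001/logs/stream"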