from flask import Flask, jsonify, request, Response, stream_with_context
from flask_cors import CORS
import requests
from bs4 import BeautifulSoup
import os
import re
import urllib.parse
import time
import random
import base64
from io import BytesIO
from googlesearch import search
import logging
import queue
from huggingface_hub import HfApi


# Create a logging filter to suppress socket warnings
class SocketWarningFilter(logging.Filter):
    def filter(self, record):
        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())


# Create a queue for log messages
log_queue = queue.Queue()


# Custom log handler that puts messages in the queue
class QueueHandler(logging.Handler):
    def emit(self, record):
        log_entry = self.format(record)
        log_queue.put(log_entry)


# Set up logging with the custom handler
logger = logging.getLogger()
queue_handler = QueueHandler()
queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
logger.addHandler(queue_handler)
logger.setLevel(logging.INFO)

# Also add the filter to the root logger to catch all socket warnings
logging.getLogger().addFilter(SocketWarningFilter())

app = Flask(__name__)

# Enable CORS with specific settings
CORS(app, resources={
    r"/*": {
        "origins": "*",
        "methods": ["GET", "POST", "OPTIONS"],
        "allow_headers": ["Content-Type", "Authorization"]
    }
})


def search_images(query, num_images=5):
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for URL
    formatted_query = urllib.parse.quote(query)

    # Google Images URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Get the HTML content
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Find all image URLs using regex
        image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        # Store results
        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip small thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check if the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Convert image to base64
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                # Add to results
                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })

                downloaded += 1

                # Add a random delay between downloads
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                logger.error(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        return []


HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure you set the HF_TOKEN in your environment


@app.route('/restart_space', methods=['POST'])
def api_restart_space():
    """API route to restart a Hugging Face Space."""
    space_id = 'Pamudu13/web-scraper'
    # Optional: set to True if you want a factory reboot.
    # get_json(silent=True) avoids a 400 when the request has no JSON body.
    factory_reboot = (request.get_json(silent=True) or {}).get('factory_reboot', False)

    if not space_id:
        return jsonify({'error': 'space_id parameter is required'}), 400

    try:
        hfapi = HfApi()

        # Call the restart_space method
        res = hfapi.restart_space(
            space_id,
            token=HF_TOKEN,
            factory_reboot=factory_reboot
        )

        return jsonify({
            'success': True,
            'message': f"Successfully restarted Space: {space_id}",
            'response': str(res)  # str() keeps the payload JSON-serializable
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500


@app.route('/get_live_space_status', methods=['GET'])
def get_live_space_status():
    """API route to stream live status of a Hugging Face Space."""
    space_id = request.args.get('space_id', 'Pamudu13/web-scraper')  # Default to 'Pamudu13/web-scraper' if not provided

    def generate():
        while True:
            try:
                # Fetch the current runtime status of the Space
                hfapi = HfApi()
                space_runtime = hfapi.get_space_runtime(repo_id=space_id)

                # Extract relevant details
                status = space_runtime.stage        # e.g., 'BUILDING', 'RUNNING', etc.
                hardware = space_runtime.hardware   # e.g., 'cpu-basic', 't4-medium', etc.

                # Send the status as a Server-Sent Event
                yield f"data: {status}\n\n"
                yield f"data: {hardware}\n\n"

                # Delay before checking the status again
                time.sleep(5)  # Adjust polling interval as needed

            except Exception as e:
                # Handle errors and send an error message
                yield f"data: Error: {str(e)}\n\n"
                break  # Stop the stream in case of an error

    return Response(stream_with_context(generate()), mimetype='text/event-stream')


@app.route('/search_images', methods=['GET'])
def api_search_images():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        response = jsonify({
            'success': True,
            'query': query,
            'results': results
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response

    except Exception as e:
        logger.error(f"Error in search_images: {str(e)}")
        # Build the response first so headers can be set, then return it with a 500 status
        response = jsonify({
            'success': False,
            'error': str(e)
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500


def scrape_site_content(query, num_sites=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    results = []
    scraped = 0
    retries = 2   # Number of retries per URL
    timeout = 5   # Reduced timeout to 5 seconds

    try:
        # Get more URLs than needed to account for failures
        search_results = list(search(query, num_results=num_sites * 2))

        # Process each found URL
        for url in search_results:
            if scraped >= num_sites:
                break

            success = False
            for attempt in range(retries):
                try:
                    # Get the HTML content
                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
                    logger.info(f"Scraping URL: {url}")
                    response = requests.get(
                        url,
                        headers=headers,
                        timeout=timeout,
                        verify=False  # Skip SSL verification
                    )
                    response.raise_for_status()

                    # Verify it's HTML content
                    content_type = response.headers.get('Content-Type', '').lower()
                    if 'text/html' not in content_type:
                        logger.info(f"Skipping {url} - not HTML content")
                        break

                    # Parse the HTML content
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Remove script and style elements
                    for script in soup(["script", "style"]):
                        script.decompose()

                    # Extract text content (limit to first 10000 characters)
                    text_content = soup.get_text(separator='\n', strip=True)[:10000]

                    # Skip if not enough content
                    if len(text_content.split()) < 100:  # Skip if less than 100 words
                        logger.info(f"Skipping {url} - not enough content")
                        break

                    # Extract all links (limit to first 10)
                    links = []
                    for link in soup.find_all('a', href=True)[:10]:
                        href = link['href']
                        if href.startswith('http'):
                            links.append({
                                'text': link.get_text(strip=True),
                                'url': href
                            })

                    # Extract meta information
                    title = soup.title.string if soup.title else ''
                    meta_description = ''
                    meta_keywords = ''

                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                    if meta_desc_tag:
                        meta_description = meta_desc_tag.get('content', '')

                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
                    if meta_keywords_tag:
                        meta_keywords = meta_keywords_tag.get('content', '')

                    results.append({
                        'url': url,
                        'title': title,
                        'meta_description': meta_description,
                        'meta_keywords': meta_keywords,
                        'text_content': text_content,
                        'links': links
                    })

                    scraped += 1
                    success = True

                    # Add a random delay between scrapes
                    time.sleep(random.uniform(0.5, 1))
                    break  # Break retry loop on success

                except requests.Timeout:
                    # Use the module logger so these messages also reach the /logs/stream endpoint
                    logger.warning(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
                    if attempt == retries - 1:  # Last attempt
                        logger.warning(f"Skipping {url} after {retries} timeout attempts")

                except requests.RequestException as e:
                    logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
                    if attempt == retries - 1:  # Last attempt
                        logger.warning(f"Skipping {url} after {retries} failed attempts")

                # Add a longer delay between retries
                if not success and attempt < retries - 1:
                    time.sleep(random.uniform(1, 2))

            # If we haven't found enough valid content and have more URLs, continue
            if scraped < num_sites and len(results) < len(search_results):
                continue

        return results

    except Exception as e:
        logger.error(f"Error in search/scraping process: {str(e)}")
        # Return whatever results we've managed to gather
        return results


@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_sites = int(request.args.get('num_sites', 10))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_sites < 1 or num_sites > 20:
            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400

        # Scrape the websites
        results = scrape_site_content(query, num_sites)

        response = jsonify({
            'success': True,
            'query': query,
            'results': results
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response

    except Exception as e:
        logger.error(f"Error in api_scrape_sites: {str(e)}")
        # Build the response first so headers can be set, then return it with a 500 status
        response = jsonify({
            'success': False,
            'error': str(e)
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500


@app.route('/logs/stream')
def stream_logs():
    def generate():
        while True:
            try:
                # Get log message from queue, timeout after 1 second
                log_message = log_queue.get(timeout=1)
                yield f"data: {log_message}\n\n"
            except queue.Empty:
                # Send a heartbeat to keep the connection alive
                yield "data: heartbeat\n\n"
            except GeneratorExit:
                break

    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
    response.headers['Cache-Control'] = 'no-cache'
    response.headers['Connection'] = 'keep-alive'
    return response


if __name__ == '__main__':
    logger.info("Starting Flask API server...")
    app.run(host='0.0.0.0', port=5001, debug=True)
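
# Illustrative client sketch (kept as comments so it never runs with the server).
# It assumes the defaults configured in app.run() above, i.e. the API is reachable
# at http://localhost:5001; adjust the host/port and query values to your setup.
#
#   import requests
#
#   # Call the image search endpoint
#   resp = requests.get(
#       "http://localhost:5001/search_images",
#       params={"query": "aurora borealis", "num_images": 3},
#   )
#   for item in resp.json()["results"]:
#       print(item["image_url"])
#
#   # Consume the Server-Sent Events log stream
#   with requests.get("http://localhost:5001/logs/stream", stream=True) as r:
#       for line in r.iter_lines():
#           if line:
#               print(line.decode())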