from flask import Flask, jsonify, request
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import time
import random
import base64

app = Flask(__name__)


def search_images(query, num_images=5):
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for the URL
    formatted_query = urllib.parse.quote(query)

    # Google Images URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Get the HTML content
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Find image URLs with a regex (require an actual image file extension)
        image_urls = re.findall(r'https?://[^"\']*?\.(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        # Store results
        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip small thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download the image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check that the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Convert the image to base64
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                # Add to results
                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })

                downloaded += 1

                # Add a random delay between downloads
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []


@app.route('/search_images', methods=['GET'])
def api_search_images():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


def scrape_site_content(query, num_sites=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    results = []
    scraped = 0

    try:
        # Use a more direct search URL format
        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}&num={num_sites}"
        search_response = requests.get(search_url, headers=headers, timeout=30)
        search_response.raise_for_status()

        # Parse the search results
        search_soup = BeautifulSoup(search_response.text, 'html.parser')

        # Look for URLs in multiple possible locations
        search_results = []

        # Method 1: Look for cite elements
        for cite in search_soup.find_all('cite'):
            url = cite.text.strip()
            if url.startswith(('http://', 'https://')):
                search_results.append(url)

        # Method 2: Look for links with specific attributes
        for a in search_soup.find_all('a'):
            href = a.get('href', '')
            if 'url?q=' in href:
                url = href.split('url?q=')[1].split('&')[0]
                if url.startswith(('http://', 'https://')):
                    search_results.append(urllib.parse.unquote(url))

        # Remove duplicates while preserving order
        search_results = list(dict.fromkeys(search_results))

        # Process each found URL
        for url in search_results:
            if scraped >= num_sites:
                break

            try:
                # Get the HTML content
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                # Verify it's HTML content
                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    continue

                # Parse the HTML content
                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()

                # Extract text content (limit to the first 1000 characters)
                text_content = soup.get_text(separator='\n', strip=True)[:1000]

                # Extract links (limit to the first 10)
                links = []
                for link in soup.find_all('a', href=True)[:10]:
                    href = link['href']
                    if href.startswith('http'):
                        links.append({
                            'text': link.get_text(strip=True),
                            'url': href
                        })

                # Extract meta information
                title = soup.title.string if soup.title and soup.title.string else ''
                meta_description = ''
                meta_keywords = ''

                meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                if meta_desc_tag:
                    meta_description = meta_desc_tag.get('content', '')

                meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
                if meta_keywords_tag:
                    meta_keywords = meta_keywords_tag.get('content', '')

                results.append({
                    'url': url,
                    'title': title,
                    'meta_description': meta_description,
                    'meta_keywords': meta_keywords,
                    'text_content': text_content,
                    'links': links
                })

                scraped += 1

                # Add a random delay between scrapes
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error scraping {url}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error in search: {str(e)}")

    return results


@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_sites = int(request.args.get('num_sites', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_sites < 1 or num_sites > 20:
            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400

        # Scrape the websites
        results = scrape_site_content(query, num_sites)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
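
# Example usage (a sketch, not part of the app itself): with the server started as
# configured above (host 0.0.0.0, port 5000), the two endpoints can be exercised from
# the command line. The query strings below are illustrative placeholders.
#
#   curl "http://localhost:5000/search_images?query=golden+retriever&num_images=3"
#   curl "http://localhost:5000/scrape_sites?query=flask+web+scraping&num_sites=2"
#
# Both endpoints return JSON of the form {"success": true, "query": "...", "results": [...]},
# where each result item carries the fields built in search_images() / scrape_site_content(),
# or {"error": "..."} with a 400 status when the query is missing or the count is out of range.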