from flask import Flask, jsonify, request
import requests
from bs4 import BeautifulSoup
import os
import re
import urllib.parse
import time
import random
import base64
from io import BytesIO

app = Flask(__name__)

def search_images(query, num_images=5):
    """Search Google Images for `query` and return up to `num_images` results as base64 data URIs."""
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for URL
    formatted_query = urllib.parse.quote(query)

    # Google Images URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Get the HTML content
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Find candidate image URLs with a regex (requiring a dot before the
        # extension avoids matching arbitrary strings that merely end in "png" etc.)
        image_urls = re.findall(r'https?://[^"\']*?\.(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        # Store results
        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip small thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check if the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Convert image to base64
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                # Add to results
                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })
                downloaded += 1

                # Add a random delay between downloads
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []
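
# Direct-call sketch (illustration only; the query string and count below are
# made-up examples, not values from the original source):
#
#   images = search_images("golden retriever", num_images=3)
#   for item in images:
#       print(item['image_url'], len(item['base64_data']))
#
# Each entry is a dict with the source 'image_url' and a 'base64_data' data URI,
# exactly as assembled in the loop above.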

# NOTE: no route decorator appears in the pasted source; the path below is an
# assumed placeholder so the endpoint is actually registered with Flask.
@app.route('/api/search-images')
def api_search_images():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
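
# HTTP usage sketch for the endpoint above. The '/api/search-images' path is the
# assumed route noted at the decorator, not confirmed by the original source:
#
#   curl "http://localhost:5000/api/search-images?query=sunset&num_images=2"
#
# A successful response has the shape produced above:
#   {"success": true, "query": "sunset", "results": [{"image_url": ..., "base64_data": ...}, ...]}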

def scrape_site_content(query, num_sites=5):
    """Run a Google search for `query` and scrape basic content from up to `num_sites` result pages."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    results = []
    scraped = 0

    try:
        # Use a more direct search URL format
        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}&num={num_sites}"
        search_response = requests.get(search_url, headers=headers, timeout=30)
        search_response.raise_for_status()

        # Parse the search results
        search_soup = BeautifulSoup(search_response.text, 'html.parser')

        # Look for URLs in multiple possible locations
        search_results = []

        # Method 1: Look for cite elements
        for cite in search_soup.find_all('cite'):
            url = cite.text.strip()
            if url.startswith(('http://', 'https://')):
                search_results.append(url)

        # Method 2: Look for links with specific attributes
        for a in search_soup.find_all('a'):
            href = a.get('href', '')
            if 'url?q=' in href:
                url = href.split('url?q=')[1].split('&')[0]
                if url.startswith(('http://', 'https://')):
                    search_results.append(urllib.parse.unquote(url))

        # Remove duplicates while preserving order
        search_results = list(dict.fromkeys(search_results))

        # Process each found URL
        for url in search_results:
            if scraped >= num_sites:
                break

            try:
                # Get the HTML content
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                # Verify it's HTML content
                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    continue

                # Parse the HTML content
                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()

                # Extract text content (limit to first 1000 characters)
                text_content = soup.get_text(separator='\n', strip=True)[:1000]

                # Extract all links (limit to first 10)
                links = []
                for link in soup.find_all('a', href=True)[:10]:
                    href = link['href']
                    if href.startswith('http'):
                        links.append({
                            'text': link.get_text(strip=True),
                            'url': href
                        })

                # Extract meta information (guard against a missing or empty <title>)
                title = soup.title.string.strip() if soup.title and soup.title.string else ''
                meta_description = ''
                meta_keywords = ''

                meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                if meta_desc_tag:
                    meta_description = meta_desc_tag.get('content', '')

                meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
                if meta_keywords_tag:
                    meta_keywords = meta_keywords_tag.get('content', '')

                results.append({
                    'url': url,
                    'title': title,
                    'meta_description': meta_description,
                    'meta_keywords': meta_keywords,
                    'text_content': text_content,
                    'links': links
                })
                scraped += 1

                # Add a random delay between scrapes
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error scraping {url}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error in search: {str(e)}")

    return results
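
# Direct-call sketch (illustration only; the query below is a made-up example):
#
#   pages = scrape_site_content("flask tutorial", num_sites=2)
#   for page in pages:
#       print(page['url'], page['title'])
#
# Each entry carries 'url', 'title', 'meta_description', 'meta_keywords', the
# truncated 'text_content' and up to ten outbound 'links', as built above.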

# NOTE: as with the image endpoint, no route decorator appears in the pasted
# source; the path below is an assumed placeholder.
@app.route('/api/scrape-sites')
def api_scrape_sites():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_sites = int(request.args.get('num_sites', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_sites < 1 or num_sites > 20:
            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400

        # Scrape the websites
        results = scrape_site_content(query, num_sites)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
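
# HTTP usage sketch for the endpoint above (again, '/api/scrape-sites' is the
# assumed route noted at the decorator):
#
#   curl "http://localhost:5000/api/scrape-sites?query=python+web+scraping&num_sites=3"
#
# A missing 'query' or an out-of-range 'num_sites' returns HTTP 400 with an
# 'error' message, mirroring the validation above.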

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)