from flask import Flask, jsonify, request
import requests
from bs4 import BeautifulSoup
import os
import re
import urllib.parse
import time
import random
import base64
from io import BytesIO
from urllib.parse import urlparse
import html2text
import json

app = Flask(__name__)

def get_google_search_results(query, num_results=5):
    """Get search results from Google with rotating User-Agents"""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    ]
    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    # Add search parameters
    params = {
        'q': query,
        'num': num_results + 5,  # Request extra results in case some fail
        'hl': 'en',
        'safe': 'active'
    }
    try:
        response = requests.get(
            'https://www.google.com/search',
            headers=headers,
            params=params,
            timeout=30
        )
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Search error: {str(e)}")
        return None

def search_images(query, num_images=5):
    """Enhanced image search function"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
    }
    # Format the query for URL
    formatted_query = urllib.parse.quote(query)
    # Multiple search URLs to try
    search_urls = [
        f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active",
        f"https://www.bing.com/images/search?q={formatted_query}&safesearch=strict",
    ]
    results = []
    for search_url in search_urls:
        if len(results) >= num_images:
            break
        try:
            response = requests.get(search_url, headers=headers, timeout=30)
            response.raise_for_status()
            # Find image URLs using multiple regex patterns
            patterns = [
                r'https?://[^"\']*?(?:jpg|jpeg|png|gif)',
                r'"ou":"(https?://[^"]*?(?:jpg|jpeg|png|gif))"',
                r'murl":"(.*?)"'
            ]
            image_urls = []
            for pattern in patterns:
                # Each pattern has at most one capturing group, so findall returns plain URL strings
                image_urls.extend(re.findall(pattern, response.text))
            # Remove duplicates while preserving order
            image_urls = list(dict.fromkeys(image_urls))
            for img_url in image_urls:
                if len(results) >= num_images:
                    break
                try:
                    # Skip unwanted URLs
                    if any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com', 'bing.com']):
                        continue
                    # Download image with timeout
                    img_response = requests.get(img_url, headers=headers, timeout=10)
                    img_response.raise_for_status()
                    # Verify content type
                    content_type = img_response.headers.get('Content-Type', '')
                    if not content_type.startswith('image/'):
                        continue
                    # Check minimum image size (1KB)
                    if len(img_response.content) < 1024:
                        continue
                    # Convert to base64
                    image_base64 = base64.b64encode(img_response.content).decode('utf-8')
                    results.append({
                        'image_url': img_url,
                        'base64_data': f"data:{content_type};base64,{image_base64}",
                        'size': len(img_response.content),
                        'content_type': content_type
                    })
                    # Random delay between downloads
                    time.sleep(random.uniform(0.5, 1.5))
                except Exception as e:
                    print(f"Error downloading image {img_url}: {str(e)}")
                    continue
        except Exception as e:
            print(f"Error with search URL {search_url}: {str(e)}")
            continue
    return results

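# Illustrative sketch (not part of the original app): one way to consume the dicts
# returned by search_images(), decoding the image bytes back out of 'base64_data'.
# The helper name and output directory are hypothetical.
def _demo_save_images(query, out_dir="images"):
    """Hypothetical helper: fetch a few images for `query` and write them to disk."""
    os.makedirs(out_dir, exist_ok=True)
    for i, item in enumerate(search_images(query, num_images=3)):
        # 'base64_data' is a data URI; the base64 payload follows the first comma
        payload = item['base64_data'].split(',', 1)[1]
        ext = item['content_type'].split('/')[-1] or 'jpg'
        with open(os.path.join(out_dir, f"{i}.{ext}"), 'wb') as fh:
            fh.write(base64.b64decode(payload))
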
def scrape_website(url, headers):
    """Enhanced website scraping function"""
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        # Fall back to the detected encoding when the server declares no charset
        if 'charset' not in response.headers.get('content-type', '').lower():
            response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove unwanted elements (tags by name, ad containers by class)
        for element in soup.find_all(['script', 'style', 'nav', 'footer', 'iframe']):
            element.decompose()
        for element in soup.find_all(class_=re.compile(r'advertisement')):
            element.decompose()
        # Get meta information
        meta_data = {
            'title': '',
            'description': '',
            'keywords': '',
            'author': '',
            'published_date': ''
        }
        # Title
        if soup.title and soup.title.string:
            meta_data['title'] = soup.title.string
        # Meta tags
        meta_tags = {
            'description': ['description', 'og:description'],
            'keywords': ['keywords'],
            'author': ['author', 'og:author'],
            'published_date': ['article:published_time', 'datePublished']
        }
        for key, meta_names in meta_tags.items():
            for name in meta_names:
                meta_tag = soup.find('meta', attrs={'name': name}) or soup.find('meta', attrs={'property': name})
                if meta_tag and meta_tag.get('content'):
                    meta_data[key] = meta_tag.get('content')
                    break
        # Extract main content
        main_content = ''
        content_tags = soup.find_all(['p', 'article', 'section', 'div'], class_=re.compile(r'(content|article|post|entry)'))
        if content_tags:
            for tag in content_tags:
                main_content += ' ' + tag.get_text()
        else:
            # Fallback to all paragraph tags
            main_content = ' '.join(p.get_text() for p in soup.find_all('p'))
        # Clean the text
        main_content = clean_text(main_content)
        return {
            'title': clean_text(meta_data['title']),
            'meta_description': clean_text(meta_data['description']),
            'keywords': clean_text(meta_data['keywords']),
            'author': clean_text(meta_data['author']),
            'published_date': meta_data['published_date'],
            'content': main_content[:2000],  # First 2000 characters
            'url': url,
            'domain': get_domain(url)
        }
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None

def clean_text(text):
    """Enhanced text cleaning function"""
    if not text:
        return ''
    # Convert to string if not already
    text = str(text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    # Remove multiple punctuation
    text = re.sub(r'([.,!?])\1+', r'\1', text)
    return text.strip()


def get_domain(url):
    """Extract and format domain from URL"""
    try:
        parsed_uri = urlparse(url)
        domain = parsed_uri.netloc
        # Remove 'www.' if present
        domain = re.sub(r'^www\.', '', domain)
        return domain
    except Exception:
        return url

def search_and_scrape(query, num_results=5):
    """Enhanced search and scrape function"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    # Get search results HTML
    search_html = get_google_search_results(query, num_results)
    if not search_html:
        return []
    soup = BeautifulSoup(search_html, 'html.parser')
    search_results = []
    seen_domains = set()
    # Find all search result divs
    for result in soup.find_all('div', class_=['g', 'tF2Cxc']):
        if len(search_results) >= num_results:
            break
        try:
            # Find the link
            link = result.find('a')
            if not link:
                continue
            href = link.get('href', '')
            # Basic URL validation
            if not href.startswith('http') or any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
                continue
            # Check for duplicate domains
            domain = get_domain(href)
            if domain in seen_domains:
                continue
            seen_domains.add(domain)
            # Random delay between requests
            time.sleep(random.uniform(1, 2))
            # Scrape the website
            site_data = scrape_website(href, headers)
            if site_data and site_data['content']:
                search_results.append(site_data)
        except Exception as e:
            print(f"Error processing search result: {str(e)}")
            continue
    return search_results

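# Illustrative sketch (not part of the original app): calling search_and_scrape()
# directly and printing a compact summary of the dicts it returns. The helper
# name is hypothetical.
def _demo_print_results(query):
    """Hypothetical helper: print domain, title, and a content snippet per result."""
    for item in search_and_scrape(query, num_results=3):
        print(f"{item['domain']}: {item['title']}")
        print(f"  {item['content'][:120]}...")
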
# NOTE: the listing registers no route for this handler; '/search_images' is an assumed path
@app.route('/search_images', methods=['GET'])
def api_search_images():
    """API endpoint for image search"""
    try:
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))
        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400
        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400
        results = search_images(query, num_images)
        return jsonify({
            'success': True,
            'query': query,
            'count': len(results),
            'results': results
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

# NOTE: the listing registers no route for this handler; '/scrape_sites' is an assumed path
@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
    """API endpoint for web scraping"""
    try:
        query = request.args.get('query', '')
        num_results = int(request.args.get('num_results', 5))
        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400
        if num_results < 1 or num_results > 10:
            return jsonify({'error': 'Number of results must be between 1 and 10'}), 400
        results = search_and_scrape(query, num_results)
        return jsonify({
            'success': True,
            'query': query,
            'count': len(results),
            'results': results
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
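
# Example usage (a sketch, assuming the app runs locally on port 5000 and the
# assumed route paths above):
#
#   curl "http://localhost:5000/search_images?query=mountains&num_images=3"
#   curl "http://localhost:5000/scrape_sites?query=flask+tutorial&num_results=3"
#
# Both endpoints return JSON of the form
# {"success": true, "query": ..., "count": ..., "results": [...]}.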