from flask import Flask, jsonify, request
import requests
from bs4 import BeautifulSoup
import os
import re
import urllib.parse
import time
import random
import base64
from io import BytesIO
from urllib.parse import urlparse
import html2text
import json
app = Flask(__name__)
def get_google_search_results(query, num_results=5):
    """Get search results from Google with rotating User-Agents"""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    ]
    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    # Add search parameters
    params = {
        'q': query,
        'num': num_results + 5,  # Request extra results in case some fail
        'hl': 'en',
        'safe': 'active'
    }
    try:
        response = requests.get(
            'https://www.google.com/search',
            headers=headers,
            params=params,
            timeout=30
        )
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Search error: {str(e)}")
        return None
def search_images(query, num_images=5):
    """Enhanced image search function"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
    }
    # Format the query for URL
    formatted_query = urllib.parse.quote(query)
    # Multiple search URLs to try
    search_urls = [
        f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active",
        f"https://www.bing.com/images/search?q={formatted_query}&safesearch=strict",
    ]
    results = []
    for search_url in search_urls:
        if len(results) >= num_images:
            break
        try:
            response = requests.get(search_url, headers=headers, timeout=30)
            response.raise_for_status()
            # Find image URLs using multiple regex patterns
            patterns = [
                r'https?://[^"\']*?\.(?:jpg|jpeg|png|gif)',
                r'"ou":"(https?://[^"]*?\.(?:jpg|jpeg|png|gif))"',
                r'murl":"(.*?)"'
            ]
            image_urls = []
            for pattern in patterns:
                found_urls = re.findall(pattern, response.text)
                if found_urls:
                    image_urls.extend(found_urls)
            # Remove duplicates while preserving order
            image_urls = list(dict.fromkeys(image_urls))
            for img_url in image_urls:
                if len(results) >= num_images:
                    break
                try:
                    # Skip unwanted URLs
                    if any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com', 'bing.com']):
                        continue
                    # Download image with timeout
                    img_response = requests.get(img_url, headers=headers, timeout=10)
                    img_response.raise_for_status()
                    # Verify content type
                    content_type = img_response.headers.get('Content-Type', '')
                    if not content_type.startswith('image/'):
                        continue
                    # Check minimum image size (1KB)
                    if len(img_response.content) < 1024:
                        continue
                    # Convert to base64
                    image_base64 = base64.b64encode(img_response.content).decode('utf-8')
                    results.append({
                        'image_url': img_url,
                        'base64_data': f"data:{content_type};base64,{image_base64}",
                        'size': len(img_response.content),
                        'content_type': content_type
                    })
                    # Random delay between downloads
                    time.sleep(random.uniform(0.5, 1.5))
                except Exception as e:
                    print(f"Error downloading image {img_url}: {str(e)}")
                    continue
        except Exception as e:
            print(f"Error with search URL {search_url}: {str(e)}")
            continue
    return results
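# For reference, each entry in the list returned by search_images has this shape
# (taken from the results.append call above):
#   {'image_url': <source URL>, 'base64_data': 'data:image/...;base64,...',
#    'size': <downloaded bytes>, 'content_type': <MIME type>}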
def scrape_website(url, headers):
    """Enhanced website scraping function"""
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        # Detect and handle encoding
        if 'charset' in response.headers.get('content-type', '').lower():
            response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove unwanted elements: scripts, styles, navigation, footers, iframes
        for element in soup.find_all(['script', 'style', 'nav', 'footer', 'iframe']):
            element.decompose()
        # Remove elements whose class suggests an advertisement
        for element in soup.find_all(class_=re.compile(r'advert', re.I)):
            element.decompose()
        # Get meta information
        meta_data = {
            'title': '',
            'description': '',
            'keywords': '',
            'author': '',
            'published_date': ''
        }
        # Title
        if soup.title:
            meta_data['title'] = soup.title.string
        # Meta tags
        meta_tags = {
            'description': ['description', 'og:description'],
            'keywords': ['keywords'],
            'author': ['author', 'og:author'],
            'published_date': ['article:published_time', 'datePublished']
        }
        for key, meta_names in meta_tags.items():
            for name in meta_names:
                meta_tag = soup.find('meta', attrs={'name': name}) or soup.find('meta', attrs={'property': name})
                if meta_tag and meta_tag.get('content'):
                    meta_data[key] = meta_tag.get('content')
                    break
        # Extract main content
        main_content = ''
        content_tags = soup.find_all(['p', 'article', 'section', 'div'], class_=re.compile(r'(content|article|post|entry)'))
        if content_tags:
            for tag in content_tags:
                main_content += ' ' + tag.get_text()
        else:
            # Fallback to all paragraph tags
            main_content = ' '.join(p.get_text() for p in soup.find_all('p'))
        # Clean the text
        main_content = clean_text(main_content)
        return {
            'title': clean_text(meta_data['title']),
            'meta_description': clean_text(meta_data['description']),
            'keywords': clean_text(meta_data['keywords']),
            'author': clean_text(meta_data['author']),
            'published_date': meta_data['published_date'],
            'content': main_content[:2000],  # First 2000 characters
            'url': url,
            'domain': get_domain(url)
        }
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None
def clean_text(text):
    """Enhanced text cleaning function"""
    if not text:
        return ''
    # Convert to string if not already
    text = str(text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    # Remove multiple punctuation
    text = re.sub(r'([.,!?])\1+', r'\1', text)
    return text.strip()
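# clean_text example: clean_text("<p>Hello,,  world!!</p>") returns "Hello, world!"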
def get_domain(url):
    """Extract and format domain from URL"""
    try:
        parsed_uri = urlparse(url)
        domain = parsed_uri.netloc
        # Remove 'www.' if present
        domain = re.sub(r'^www\.', '', domain)
        return domain
    except:
        return url
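# get_domain example: get_domain("https://www.example.com/articles/1") returns "example.com"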
def search_and_scrape(query, num_results=5):
    """Enhanced search and scrape function"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    # Get search results HTML
    search_html = get_google_search_results(query, num_results)
    if not search_html:
        return []
    soup = BeautifulSoup(search_html, 'html.parser')
    search_results = []
    seen_domains = set()
    # Find all search result divs
    for result in soup.find_all('div', class_=['g', 'tF2Cxc']):
        if len(search_results) >= num_results:
            break
        try:
            # Find the link
            link = result.find('a')
            if not link:
                continue
            href = link.get('href', '')
            # Basic URL validation
            if not href.startswith('http') or any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
                continue
            # Check for duplicate domains
            domain = get_domain(href)
            if domain in seen_domains:
                continue
            seen_domains.add(domain)
            # Random delay between requests
            time.sleep(random.uniform(1, 2))
            # Scrape the website
            site_data = scrape_website(href, headers)
            if site_data and site_data['content']:
                search_results.append(site_data)
        except Exception as e:
            print(f"Error processing search result: {str(e)}")
            continue
    return search_results
@app.route('/search_images', methods=['GET'])
def api_search_images():
    """API endpoint for image search"""
    try:
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))
        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400
        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400
        results = search_images(query, num_images)
        return jsonify({
            'success': True,
            'query': query,
            'count': len(results),
            'results': results
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
    """API endpoint for web scraping"""
    try:
        query = request.args.get('query', '')
        num_results = int(request.args.get('num_results', 5))
        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400
        if num_results < 1 or num_results > 10:
            return jsonify({'error': 'Number of results must be between 1 and 10'}), 400
        results = search_and_scrape(query, num_results)
        return jsonify({
            'success': True,
            'query': query,
            'count': len(results),
            'results': results
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
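# Example requests against a locally running instance (assuming the default host/port above):
#   curl "http://localhost:5000/search_images?query=sunset&num_images=3"
#   curl "http://localhost:5000/scrape_sites?query=python+web+scraping&num_results=2"
# Both endpoints return JSON of the form {"success": true, "query": ..., "count": N, "results": [...]}.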