Pamudu13 committed
Commit 4dd1d1c · verified · 1 Parent(s): 4205a9b

Update app.py

Files changed (1)
  1. app.py +160 -220
app.py CHANGED
@@ -1,175 +1,134 @@
 from flask import Flask, jsonify, request
-import requests
 from bs4 import BeautifulSoup
-import os
-import re
-import urllib.parse
 import time
 import random
-import base64
 from io import BytesIO
-from urllib.parse import urlparse
-import html2text
 import json
 
 app = Flask(__name__)
 
-def get_google_search_results(query, num_results=5):
-    """Get search results from Google with rotating User-Agents"""
-    user_agents = [
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
-        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
-    ]
-
-    headers = {
-        'User-Agent': random.choice(user_agents),
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1'
-    }
-
-    # Add search parameters
-    params = {
-        'q': query,
-        'num': num_results + 5, # Request extra results in case some fail
-        'hl': 'en',
-        'safe': 'active'
-    }
-
-    try:
-        response = requests.get(
-            'https://www.google.com/search',
-            headers=headers,
-            params=params,
-            timeout=30
-        )
-        response.raise_for_status()
-        return response.text
-    except Exception as e:
-        print(f"Search error: {str(e)}")
-        return None
-
-def search_images(query, num_images=5):
-    """Enhanced image search function"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-    }
-
-    # Format the query for URL
-    formatted_query = urllib.parse.quote(query)
-
-    # Multiple search URLs to try
-    search_urls = [
-        f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active",
-        f"https://www.bing.com/images/search?q={formatted_query}&safesearch=strict",
-    ]
 
     results = []
-    for search_url in search_urls:
-        if len(results) >= num_images:
-            break
-
-        try:
-            response = requests.get(search_url, headers=headers, timeout=30)
-            response.raise_for_status()
-
-            # Find image URLs using multiple regex patterns
-            patterns = [
-                r'https?://[^"\']*?(?:jpg|jpeg|png|gif)',
-                r'"ou":"(https?://[^"]*?(?:jpg|jpeg|png|gif))"',
-                r'murl":"(.*?)"'
-            ]
-
-            image_urls = []
-            for pattern in patterns:
-                found_urls = re.findall(pattern, response.text)
-                image_urls.extend(found_urls if isinstance(found_urls[0], str) else found_urls[0] for found_urls in [found_urls] if found_urls)
-
-            # Remove duplicates while preserving order
-            image_urls = list(dict.fromkeys(image_urls))
-
-            for img_url in image_urls:
-                if len(results) >= num_images:
-                    break
-
-                try:
-                    # Skip unwanted URLs
-                    if any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com', 'bing.com']):
-                        continue
-
-                    # Download image with timeout
-                    img_response = requests.get(img_url, headers=headers, timeout=10)
-                    img_response.raise_for_status()
-
-                    # Verify content type
-                    content_type = img_response.headers.get('Content-Type', '')
-                    if not content_type.startswith('image/'):
-                        continue
-
-                    # Check minimum image size (1KB)
-                    if len(img_response.content) < 1024:
-                        continue
-
-                    # Convert to base64
-                    image_base64 = base64.b64encode(img_response.content).decode('utf-8')
-
-                    results.append({
-                        'image_url': img_url,
-                        'base64_data': f"data:{content_type};base64,{image_base64}",
-                        'size': len(img_response.content),
-                        'content_type': content_type
-                    })
-
-                    # Random delay between downloads
-                    time.sleep(random.uniform(0.5, 1.5))
-
-                except Exception as e:
-                    print(f"Error downloading image {img_url}: {str(e)}")
                        continue
 
-        except Exception as e:
-            print(f"Error with search URL {search_url}: {str(e)}")
-            continue
 
-    return results
 
-def scrape_website(url, headers):
-    """Enhanced website scraping function"""
-    try:
-        response = requests.get(url, headers=headers, timeout=15)
-        response.raise_for_status()
 
-        # Detect and handle encoding
-        if 'charset' in response.headers.get('content-type', '').lower():
-            response.encoding = response.apparent_encoding
 
-        soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
-        for element in soup.find_all(['script', 'style', 'nav', 'footer', 'iframe', 'ad', '.advertisement']):
-            element.decompose()
 
-        # Get meta information
         meta_data = {
-            'title': '',
             'description': '',
             'keywords': '',
             'author': '',
             'published_date': ''
         }
 
-        # Title
-        if soup.title:
-            meta_data['title'] = soup.title.string
-
         # Meta tags
         meta_tags = {
             'description': ['description', 'og:description'],
@@ -185,19 +144,15 @@ def scrape_website(url, headers):
                     meta_data[key] = meta_tag.get('content')
                     break
 
-        # Extract main content
         main_content = ''
-        content_tags = soup.find_all(['p', 'article', 'section', 'div'], class_=re.compile(r'(content|article|post|entry)'))
 
         if content_tags:
-            for tag in content_tags:
-                main_content += ' ' + tag.get_text()
         else:
-            # Fallback to all paragraph tags
-            main_content = ' '.join(p.get_text() for p in soup.find_all('p'))
-
-        # Clean the text
-        main_content = clean_text(main_content)
 
         return {
            'title': clean_text(meta_data['title']),
@@ -205,7 +160,7 @@ def scrape_website(url, headers):
            'keywords': clean_text(meta_data['keywords']),
            'author': clean_text(meta_data['author']),
            'published_date': meta_data['published_date'],
-           'content': main_content[:2000], # First 2000 characters
            'url': url,
            'domain': get_domain(url)
        }
@@ -214,95 +169,75 @@ def scrape_website(url, headers):
        print(f"Error scraping {url}: {str(e)}")
        return None
 
 def clean_text(text):
-    """Enhanced text cleaning function"""
    if not text:
        return ''
 
-    # Convert to string if not already
    text = str(text)
-
-    # Remove HTML tags
-    text = re.sub(r'<[^>]+>', '', text)
-
-    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
-
-    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)
-
-    # Remove multiple punctuation
-    text = re.sub(r'([.,!?])\1+', r'\1', text)
-
    return text.strip()
 
 def get_domain(url):
-    """Extract and format domain from URL"""
    try:
-        parsed_uri = urlparse(url)
-        domain = parsed_uri.netloc
-        # Remove 'www.' if present
-        domain = re.sub(r'^www\.', '', domain)
-        return domain
    except:
        return url
 
-def search_and_scrape(query, num_results=5):
-    """Enhanced search and scrape function"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    }
-
-    # Get search results HTML
-    search_html = get_google_search_results(query, num_results)
-    if not search_html:
-        return []
-
-    soup = BeautifulSoup(search_html, 'html.parser')
-    search_results = []
-    seen_domains = set()
-
-    # Find all search result divs
-    for result in soup.find_all('div', class_=['g', 'tF2Cxc']):
-        if len(search_results) >= num_results:
-            break
-
-        try:
-            # Find the link
-            link = result.find('a')
-            if not link:
-                continue
-
-            href = link.get('href', '')
-
-            # Basic URL validation
-            if not href.startswith('http') or any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
-                continue
-
-            # Check for duplicate domains
-            domain = get_domain(href)
-            if domain in seen_domains:
-                continue
-            seen_domains.add(domain)
-
-            # Random delay between requests
-            time.sleep(random.uniform(1, 2))
-
-            # Scrape the website
-            site_data = scrape_website(href, headers)
-            if site_data and site_data['content']:
-                search_results.append(site_data)
-
-        except Exception as e:
-            print(f"Error processing search result: {str(e)}")
-            continue
-
-    return search_results
-
 @app.route('/search_images', methods=['GET'])
 def api_search_images():
-    """API endpoint for image search"""
    try:
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))
@@ -330,7 +265,6 @@ def api_search_images():
 
 @app.route('/scrape_sites', methods=['GET'])
 def api_scrape_sites():
-    """API endpoint for web scraping"""
    try:
        query = request.args.get('query', '')
        num_results = int(request.args.get('num_results', 5))
@@ -356,6 +290,12 @@ def api_scrape_sites():
            'error': str(e)
        }), 500
 
 if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
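
Both the removed implementation above and the replacement listed below return each image as a base64 data: URL in the base64_data field of the results list. For reference, a minimal consumer-side sketch for turning that field back into a PIL image; only the field name comes from the diff, the helper name and sample usage are hypothetical.

import base64
from io import BytesIO
from PIL import Image

def decode_result_image(result):
    """Decode the 'base64_data' data: URL from a search_images result into a PIL Image."""
    data_url = result['base64_data']              # e.g. "data:image/jpeg;base64,/9j/4AAQ..."
    header, b64_payload = data_url.split(',', 1)  # split the "data:...;base64" prefix from the payload
    raw_bytes = base64.b64decode(b64_payload)     # undo the base64 encoding
    return Image.open(BytesIO(raw_bytes))         # reopen the original image bytes

# Hypothetical usage with one entry from the results list:
# img = decode_result_image(results[0])
# print(img.format, img.size)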
 
 
 from flask import Flask, jsonify, request
+import undetected_chromedriver as uc
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
+import base64
 import time
 import random
+import re
+import requests
 from io import BytesIO
+from PIL import Image
 import json
+import threading
+from urllib.parse import quote, urlparse
+import html2text
 
 app = Flask(__name__)
 
+# Thread-local storage for the browser instance
+thread_local = threading.local()
+
+def get_browser():
+    """Get or create thread-local browser instance"""
+    if not hasattr(thread_local, "browser"):
+        chrome_options = uc.ChromeOptions()
+        chrome_options.add_argument('--headless')
+        chrome_options.add_argument('--no-sandbox')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument('--disable-gpu')
+        chrome_options.add_argument('--window-size=1920,1080')
+
+        thread_local.browser = uc.Chrome(options=chrome_options)
+    return thread_local.browser
+
+def search_images(query, num_images=5):
+    """Enhanced image search using selenium"""
+    browser = get_browser()
     results = []
+
+    try:
+        # Google Images search
+        search_url = f"https://www.google.com/search?q={quote(query)}&tbm=isch"
+        browser.get(search_url)
+
+        # Wait for images to load
+        time.sleep(2)
+
+        # Scroll to load more images
+        for _ in range(3):
+            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(1)
+
+        # Find image elements
+        image_elements = browser.find_elements(By.CSS_SELECTOR, 'img.rg_i')
+
+        for img in image_elements[:num_images]:
+            try:
+                # Click image to get full resolution
+                img.click()
+                time.sleep(1)
+
+                # Wait for full resolution image
+                wait = WebDriverWait(browser, 10)
+                full_img = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb')))
+
+                img_url = full_img.get_attribute('src')
+
+                # Skip data URLs and unwanted domains
+                if (img_url.startswith('data:') or
+                    any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com'])):
                    continue
 
+                # Download and process image
+                response = requests.get(img_url, timeout=10)
+                img_content = response.content
+
+                # Verify it's an image and get format
+                img = Image.open(BytesIO(img_content))
+                img_format = img.format.lower()
+
+                # Convert to base64
+                buffered = BytesIO()
+                img.save(buffered, format=img_format)
+                img_base64 = base64.b64encode(buffered.getvalue()).decode()
+
+                results.append({
+                    'image_url': img_url,
+                    'base64_data': f"data:image/{img_format};base64,{img_base64}",
+                    'size': len(img_content),
+                    'dimensions': img.size,
+                    'format': img_format
+                })
+
+                time.sleep(random.uniform(0.5, 1.0))
+
+            except Exception as e:
+                print(f"Error processing image: {str(e)}")
+                continue
+
+            if len(results) >= num_images:
+                break
+
+    except Exception as e:
+        print(f"Search error: {str(e)}")
+
+    return results
 
+def scrape_website(url):
+    """Enhanced website scraping using selenium"""
+    browser = get_browser()
+
+    try:
+        browser.get(url)
+        time.sleep(2) # Wait for dynamic content
+
+        # Get page source after JavaScript execution
+        page_source = browser.page_source
+        soup = BeautifulSoup(page_source, 'html.parser')
+
+        # Extract metadata and content
         meta_data = {
+            'title': soup.title.string if soup.title else '',
            'description': '',
            'keywords': '',
            'author': '',
            'published_date': ''
        }
 
        # Meta tags
        meta_tags = {
            'description': ['description', 'og:description'],
[...]
                    meta_data[key] = meta_tag.get('content')
                    break
 
+        # Get main content
        main_content = ''
+        content_tags = soup.find_all(['article', 'main', 'div'],
+                                     class_=re.compile(r'(content|article|post|entry)'))
 
        if content_tags:
+            main_content = ' '.join(tag.get_text(strip=True) for tag in content_tags)
        else:
+            main_content = ' '.join(p.get_text(strip=True) for p in soup.find_all('p'))
 
        return {
            'title': clean_text(meta_data['title']),
[...]
            'keywords': clean_text(meta_data['keywords']),
            'author': clean_text(meta_data['author']),
            'published_date': meta_data['published_date'],
+            'content': clean_text(main_content)[:2000],
            'url': url,
            'domain': get_domain(url)
        }
[...]
        print(f"Error scraping {url}: {str(e)}")
        return None
 
+def search_and_scrape(query, num_results=5):
+    """Enhanced search and scrape using selenium"""
+    browser = get_browser()
+    results = []
+
+    try:
+        # Perform Google search
+        search_url = f"https://www.google.com/search?q={quote(query)}&num={num_results + 5}"
+        browser.get(search_url)
+        time.sleep(2)
+
+        # Get search results
+        search_results = browser.find_elements(By.CSS_SELECTOR, 'div.g')
+        seen_domains = set()
+
+        for result in search_results:
+            if len(results) >= num_results:
+                break
+
+            try:
+                link = result.find_element(By.CSS_SELECTOR, 'a')
+                href = link.get_attribute('href')
+
+                # Skip unwanted URLs
+                if not href or not href.startswith('http') or \
+                   any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
+                    continue
+
+                # Check for duplicate domains
+                domain = get_domain(href)
+                if domain in seen_domains:
+                    continue
+                seen_domains.add(domain)
+
+                # Scrape website
+                site_data = scrape_website(href)
+                if site_data and site_data['content']:
+                    results.append(site_data)
+
+                time.sleep(random.uniform(1, 2))
+
+            except Exception as e:
+                print(f"Error processing result: {str(e)}")
+                continue
+
+    except Exception as e:
+        print(f"Search error: {str(e)}")
+
+    return results
+
 def clean_text(text):
+    """Clean extracted text"""
    if not text:
        return ''
 
    text = str(text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()
 
 def get_domain(url):
+    """Extract domain from URL"""
    try:
+        return urlparse(url).netloc.replace('www.', '')
    except:
        return url
 
 @app.route('/search_images', methods=['GET'])
 def api_search_images():
    try:
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))
[...]
 
 @app.route('/scrape_sites', methods=['GET'])
 def api_scrape_sites():
    try:
        query = request.args.get('query', '')
        num_results = int(request.args.get('num_results', 5))
[...]
            'error': str(e)
        }), 500
 
+@app.teardown_appcontext
+def cleanup(exception=None):
+    """Clean up browser instances"""
+    if hasattr(thread_local, "browser"):
+        thread_local.browser.quit()
+
 if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
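
The two routes kept by this commit, /search_images and /scrape_sites, read a query parameter plus num_images or num_results, and the app listens on port 5000 per app.run() above. A minimal client sketch against a locally running instance; the base URL, example queries, and the shape of the printed JSON are assumptions, only the route and parameter names come from the diff.

import requests

BASE_URL = "http://localhost:5000"  # assumes app.py is running locally on the port from app.run()

# Image search endpoint: 'query' and 'num_images' are the parameters read in api_search_images
images = requests.get(f"{BASE_URL}/search_images",
                      params={"query": "mountain landscape", "num_images": 3},
                      timeout=120)
print(images.status_code, images.json())

# Search-and-scrape endpoint: 'query' and 'num_results' are the parameters read in api_scrape_sites
sites = requests.get(f"{BASE_URL}/scrape_sites",
                     params={"query": "open source web scraping", "num_results": 3},
                     timeout=120)
print(sites.status_code, sites.json())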