Pamudu13 committed on
Commit b868160 · verified · 1 Parent(s): b1a8325

Update app.py

Files changed (1):
  1. app.py +57 -240

app.py CHANGED
@@ -1,244 +1,94 @@
  from flask import Flask, jsonify, request
- import undetected_chromedriver as uc
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.chrome.options import Options
  from bs4 import BeautifulSoup
- import base64
  import time
  import random
- import re
- import requests
  from io import BytesIO
- from PIL import Image
- import json
- import threading
- from urllib.parse import quote, urlparse
- import html2text

  app = Flask(__name__)

- # Thread-local storage for the browser instance
- thread_local = threading.local()
-
- def get_browser():
-     """Get or create thread-local browser instance"""
-     if not hasattr(thread_local, "browser"):
-         chrome_options = uc.ChromeOptions()
-         chrome_options.add_argument('--headless')
-         chrome_options.add_argument('--no-sandbox')
-         chrome_options.add_argument('--disable-dev-shm-usage')
-         chrome_options.add_argument('--disable-gpu')
-         chrome_options.add_argument('--window-size=1920,1080')
-
-         thread_local.browser = uc.Chrome(options=chrome_options)
-     return thread_local.browser
-
  def search_images(query, num_images=5):
-     """Enhanced image search using selenium"""
-     browser = get_browser()
-     results = []

      try:
-         # Google Images search
-         search_url = f"https://www.google.com/search?q={quote(query)}&tbm=isch"
-         browser.get(search_url)

-         # Wait for images to load
-         time.sleep(2)

-         # Scroll to load more images
-         for _ in range(3):
-             browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-             time.sleep(1)

-         # Find image elements
-         image_elements = browser.find_elements(By.CSS_SELECTOR, 'img.rg_i')

-         for img in image_elements[:num_images]:
-             try:
-                 # Click image to get full resolution
-                 img.click()
-                 time.sleep(1)
-
-                 # Wait for full resolution image
-                 wait = WebDriverWait(browser, 10)
-                 full_img = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb')))
-
-                 img_url = full_img.get_attribute('src')

-                 # Skip data URLs and unwanted domains
-                 if (img_url.startswith('data:') or
-                     any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com'])):
                      continue

-                 # Download and process image
-                 response = requests.get(img_url, timeout=10)
-                 img_content = response.content

-                 # Verify it's an image and get format
-                 img = Image.open(BytesIO(img_content))
-                 img_format = img.format.lower()

-                 # Convert to base64
-                 buffered = BytesIO()
-                 img.save(buffered, format=img_format)
-                 img_base64 = base64.b64encode(buffered.getvalue()).decode()

                  results.append({
                      'image_url': img_url,
-                     'base64_data': f"data:image/{img_format};base64,{img_base64}",
-                     'size': len(img_content),
-                     'dimensions': img.size,
-                     'format': img_format
                  })

-                 time.sleep(random.uniform(0.5, 1.0))
-
-             except Exception as e:
-                 print(f"Error processing image: {str(e)}")
-                 continue
-
-             if len(results) >= num_images:
-                 break
-
-     except Exception as e:
-         print(f"Search error: {str(e)}")
-
-     return results
-
- def scrape_website(url):
-     """Enhanced website scraping using selenium"""
-     browser = get_browser()
-
-     try:
-         browser.get(url)
-         time.sleep(2) # Wait for dynamic content
-
-         # Get page source after JavaScript execution
-         page_source = browser.page_source
-         soup = BeautifulSoup(page_source, 'html.parser')
-
-         # Extract metadata and content
-         meta_data = {
-             'title': soup.title.string if soup.title else '',
-             'description': '',
-             'keywords': '',
-             'author': '',
-             'published_date': ''
-         }
-
-         # Meta tags
-         meta_tags = {
-             'description': ['description', 'og:description'],
-             'keywords': ['keywords'],
-             'author': ['author', 'og:author'],
-             'published_date': ['article:published_time', 'datePublished']
-         }
-
-         for key, meta_names in meta_tags.items():
-             for name in meta_names:
-                 meta_tag = soup.find('meta', attrs={'name': name}) or soup.find('meta', attrs={'property': name})
-                 if meta_tag and meta_tag.get('content'):
-                     meta_data[key] = meta_tag.get('content')
-                     break
-
-         # Get main content
-         main_content = ''
-         content_tags = soup.find_all(['article', 'main', 'div'],
-                                      class_=re.compile(r'(content|article|post|entry)'))
-
-         if content_tags:
-             main_content = ' '.join(tag.get_text(strip=True) for tag in content_tags)
-         else:
-             main_content = ' '.join(p.get_text(strip=True) for p in soup.find_all('p'))
-
-         return {
-             'title': clean_text(meta_data['title']),
-             'meta_description': clean_text(meta_data['description']),
-             'keywords': clean_text(meta_data['keywords']),
-             'author': clean_text(meta_data['author']),
-             'published_date': meta_data['published_date'],
-             'content': clean_text(main_content)[:2000],
-             'url': url,
-             'domain': get_domain(url)
-         }
-
-     except Exception as e:
-         print(f"Error scraping {url}: {str(e)}")
-         return None
-
- def search_and_scrape(query, num_results=5):
-     """Enhanced search and scrape using selenium"""
-     browser = get_browser()
-     results = []
-
-     try:
-         # Perform Google search
-         search_url = f"https://www.google.com/search?q={quote(query)}&num={num_results + 5}"
-         browser.get(search_url)
-         time.sleep(2)
-
-         # Get search results
-         search_results = browser.find_elements(By.CSS_SELECTOR, 'div.g')
-         seen_domains = set()
-
-         for result in search_results:
-             if len(results) >= num_results:
-                 break
-
-             try:
-                 link = result.find_element(By.CSS_SELECTOR, 'a')
-                 href = link.get_attribute('href')

-                 # Skip unwanted URLs
-                 if not href or not href.startswith('http') or \
-                    any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
-                     continue
-
-                 # Check for duplicate domains
-                 domain = get_domain(href)
-                 if domain in seen_domains:
-                     continue
-                 seen_domains.add(domain)
-
-                 # Scrape website
-                 site_data = scrape_website(href)
-                 if site_data and site_data['content']:
-                     results.append(site_data)
-
-                 time.sleep(random.uniform(1, 2))

              except Exception as e:
-                 print(f"Error processing result: {str(e)}")
                  continue

-     except Exception as e:
-         print(f"Search error: {str(e)}")
-
-     return results

- def clean_text(text):
-     """Clean extracted text"""
-     if not text:
-         return ''
-
-     text = str(text)
-     text = re.sub(r'\s+', ' ', text)
-     text = re.sub(r'[^\w\s.,!?-]', '', text)
-     return text.strip()
-
- def get_domain(url):
-     """Extract domain from URL"""
-     try:
-         return urlparse(url).netloc.replace('www.', '')
-     except:
-         return url

  @app.route('/search_images', methods=['GET'])
  def api_search_images():
      try:
          query = request.args.get('query', '')
          num_images = int(request.args.get('num_images', 5))

@@ -248,39 +98,12 @@ def api_search_images():
          if num_images < 1 or num_images > 20:
              return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

          results = search_images(query, num_images)

          return jsonify({
              'success': True,
              'query': query,
-             'count': len(results),
-             'results': results
-         })
-
-     except Exception as e:
-         return jsonify({
-             'success': False,
-             'error': str(e)
-         }), 500
-
- @app.route('/scrape_sites', methods=['GET'])
- def api_scrape_sites():
-     try:
-         query = request.args.get('query', '')
-         num_results = int(request.args.get('num_results', 5))
-
-         if not query:
-             return jsonify({'error': 'Query parameter is required'}), 400
-
-         if num_results < 1 or num_results > 10:
-             return jsonify({'error': 'Number of results must be between 1 and 10'}), 400
-
-         results = search_and_scrape(query, num_results)
-
-         return jsonify({
-             'success': True,
-             'query': query,
-             'count': len(results),
              'results': results
          })

@@ -290,12 +113,6 @@ def api_scrape_sites():
              'error': str(e)
          }), 500

- @app.teardown_appcontext
- def cleanup(exception=None):
-     """Clean up browser instances"""
-     if hasattr(thread_local, "browser"):
-         thread_local.browser.quit()
-
  if __name__ == '__main__':
      app.run(host='0.0.0.0', port=5000)

  from flask import Flask, jsonify, request
+ import requests
  from bs4 import BeautifulSoup
+ import os
+ import re
+ import urllib.parse
  import time
  import random
+ import base64
  from io import BytesIO

  app = Flask(__name__)

  def search_images(query, num_images=5):
+     # Headers to mimic a browser request
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+         'Accept-Language': 'en-US,en;q=0.5',
+         'Accept-Encoding': 'gzip, deflate',
+         'DNT': '1',
+         'Connection': 'keep-alive',
+     }
+
+     # Format the query for URL
+     formatted_query = urllib.parse.quote(query)
+
+     # Google Images URL
+     url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

      try:
+         # Get the HTML content
+         response = requests.get(url, headers=headers, timeout=30)
+         response.raise_for_status()

+         # Find all image URLs using regex
+         image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)

+         # Remove duplicates while preserving order
+         image_urls = list(dict.fromkeys(image_urls))

+         # Store results
+         results = []
+         downloaded = 0

+         for img_url in image_urls:
+             if downloaded >= num_images:
+                 break

+             try:
+                 # Skip small thumbnails and icons
+                 if 'gstatic.com' in img_url or 'google.com' in img_url:
                      continue

+                 # Download image
+                 img_response = requests.get(img_url, headers=headers, timeout=10)
+                 img_response.raise_for_status()

+                 # Check if the response is actually an image
+                 content_type = img_response.headers.get('Content-Type', '')
+                 if not content_type.startswith('image/'):
+                     continue

+                 # Convert image to base64
+                 image_base64 = base64.b64encode(img_response.content).decode('utf-8')

+                 # Add to results
                  results.append({
                      'image_url': img_url,
+                     'base64_data': f"data:{content_type};base64,{image_base64}"
                  })

+                 downloaded += 1

+                 # Add a random delay between downloads
+                 time.sleep(random.uniform(0.5, 1))

              except Exception as e:
+                 print(f"Error downloading image: {str(e)}")
                  continue

+         return results

+     except Exception as e:
+         print(f"An error occurred: {str(e)}")
+         return []

  @app.route('/search_images', methods=['GET'])
  def api_search_images():
      try:
+         # Get query parameters
          query = request.args.get('query', '')
          num_images = int(request.args.get('num_images', 5))

          if num_images < 1 or num_images > 20:
              return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

+         # Search for images
          results = search_images(query, num_images)

          return jsonify({
              'success': True,
              'query': query,
              'results': results
          })

              'error': str(e)
          }), 500

  if __name__ == '__main__':
      app.run(host='0.0.0.0', port=5000)
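
In short, the commit swaps the Selenium/undetected-chromedriver pipeline for a plain requests + regex scraper, removes the /scrape_sites endpoint and the browser teardown hook, and keeps only /search_images. A minimal sketch of how the updated endpoint could be exercised against a locally running copy of the app (the query value and localhost URL below are illustrative; the app binds to 0.0.0.0:5000 as in app.run):

import requests

# Call the simplified /search_images endpoint of the updated app.py.
# Assumes the Flask app from this commit is running locally on port 5000;
# "sunset" is just an example query.
resp = requests.get(
    "http://localhost:5000/search_images",
    params={"query": "sunset", "num_images": 3},
    timeout=60,
)
resp.raise_for_status()
payload = resp.json()

print(payload["success"], payload["query"], len(payload["results"]))
for item in payload["results"]:
    # Each result holds the source image URL and a base64 data URI.
    print(item["image_url"], item["base64_data"][:40] + "...")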