Pamudu13 committed
Commit 0c771ba · verified · 1 Parent(s): a099086

Update app.py

Files changed (1):
  1. app.py +252 -149
app.py CHANGED
@@ -10,225 +10,328 @@ import base64
  from io import BytesIO
  from urllib.parse import urlparse
  import html2text

  app = Flask(__name__)

- def search_images(query, num_images=5):
-     # Headers to mimic a browser request
      headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate',
          'DNT': '1',
          'Connection': 'keep-alive',
      }

-     # Format the query for URL
-     formatted_query = urllib.parse.quote(query)
-
-     # Google Images URL
-     url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

      try:
-         # Get the HTML content
-         response = requests.get(url, headers=headers, timeout=30)
          response.raise_for_status()

-         # Find all image URLs using regex
-         image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)
-
-         # Remove duplicates while preserving order
-         image_urls = list(dict.fromkeys(image_urls))
-
-         # Store results
-         results = []
-         downloaded = 0

-         for img_url in image_urls:
-             if downloaded >= num_images:
-                 break

-             try:
-                 # Skip small thumbnails and icons
-                 if 'gstatic.com' in img_url or 'google.com' in img_url:
                      continue

-                 # Download image
-                 img_response = requests.get(img_url, headers=headers, timeout=10)
-                 img_response.raise_for_status()

-                 # Check if the response is actually an image
-                 content_type = img_response.headers.get('Content-Type', '')
-                 if not content_type.startswith('image/'):
-                     continue

-                 # Convert image to base64
-                 image_base64 = base64.b64encode(img_response.content).decode('utf-8')

-                 # Add to results
-                 results.append({
-                     'image_url': img_url,
-                     'base64_data': f"data:{content_type};base64,{image_base64}"
-                 })

-                 downloaded += 1

-                 # Add a random delay between downloads
-                 time.sleep(random.uniform(0.5, 1))

-             except Exception as e:
-                 print(f"Error downloading image: {str(e)}")
-                 continue

-         return results

-     except Exception as e:
-         print(f"An error occurred: {str(e)}")
-         return []

- @app.route('/search_images', methods=['GET'])
- def api_search_images():
-     try:
-         # Get query parameters
-         query = request.args.get('query', '')
-         num_images = int(request.args.get('num_images', 5))

-         if not query:
-             return jsonify({'error': 'Query parameter is required'}), 400

-         if num_images < 1 or num_images > 20:
-             return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

-         # Search for images
-         results = search_images(query, num_images)

-         return jsonify({
-             'success': True,
-             'query': query,
-             'results': results
-         })

      except Exception as e:
-         return jsonify({
-             'success': False,
-             'error': str(e)
-         }), 500
-
- def get_domain(url):
-     """Extract domain from URL"""
-     parsed_uri = urlparse(url)
-     return parsed_uri.netloc

  def clean_text(text):
-     """Clean scraped text"""
-     # Remove extra whitespace
-     text = re.sub(r'\s+', ' ', text)
-     # Remove special characters
-     text = re.sub(r'[^\w\s.,!?-]', '', text)
-     return text.strip()
-
- def scrape_website(url, headers):
-     """Scrape content from a single website"""
-     try:
-         response = requests.get(url, headers=headers, timeout=10)
-         response.raise_for_status()
-
-         soup = BeautifulSoup(response.text, 'html.parser')

-         # Remove unwanted elements
-         for element in soup(['script', 'style', 'nav', 'footer', 'iframe']):
-             element.decompose()

-         # Convert HTML to text
-         h = html2text.HTML2Text()
-         h.ignore_links = True
-         h.ignore_images = True
-         text = h.handle(str(soup))

-         # Clean the text
-         text = clean_text(text)

-         # Get meta description
-         meta_desc = ''
-         meta_tag = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
-         if meta_tag:
-             meta_desc = meta_tag.get('content', '')

-         # Get title
-         title = soup.title.string if soup.title else ''

-         return {
-             'title': clean_text(title),
-             'meta_description': clean_text(meta_desc),
-             'content': text[:1000],  # Limit content length
-             'url': url
-         }

-     except Exception as e:
-         print(f"Error scraping {url}: {str(e)}")
-         return None

  def search_and_scrape(query, num_results=5):
      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-         'Accept-Language': 'en-US,en;q=0.5',
-         'Accept-Encoding': 'gzip, deflate',
-         'DNT': '1',
-         'Connection': 'keep-alive',
      }

-     # Format the query for URL
-     formatted_query = urllib.parse.quote(query)
-
-     # Google Search URL
-     url = f"https://www.google.com/search?q={formatted_query}&num={num_results}"
-
-     try:
-         # Get Google search results
-         response = requests.get(url, headers=headers, timeout=30)
-         response.raise_for_status()

-         soup = BeautifulSoup(response.text, 'html.parser')

-         # Find all search result divs
-         search_results = []
-         result_divs = soup.find_all('div', class_='g')

-         for div in result_divs:
              # Find the link
-             link = div.find('a')
              if not link:
                  continue

              href = link.get('href', '')

-             # Skip if not a valid URL or if it's a Google-related URL
-             if not href.startswith('http') or 'google.' in href:
                  continue

-             # Add random delay between requests
              time.sleep(random.uniform(1, 2))

              # Scrape the website
              site_data = scrape_website(href, headers)
-             if site_data:
                  search_results.append(site_data)

-             if len(search_results) >= num_results:
-                 break

-         return search_results

      except Exception as e:
-         print(f"An error occurred: {str(e)}")
-         return []

- @app.route('/', methods=['GET'])
  def api_scrape_sites():
      try:
-         # Get query parameters
          query = request.args.get('query', '')
          num_results = int(request.args.get('num_results', 5))

@@ -238,12 +341,12 @@ def api_scrape_sites():
          if num_results < 1 or num_results > 10:
              return jsonify({'error': 'Number of results must be between 1 and 10'}), 400

-         # Search and scrape sites
          results = search_and_scrape(query, num_results)

          return jsonify({
              'success': True,
              'query': query,
              'results': results
          })

@@ -252,7 +355,7 @@ def api_scrape_sites():
              'success': False,
              'error': str(e)
          }), 500
-
  if __name__ == '__main__':
      app.run(host='0.0.0.0', port=5000)

  from io import BytesIO
  from urllib.parse import urlparse
  import html2text
+ import json

  app = Flask(__name__)

+ def get_google_search_results(query, num_results=5):
+     """Get search results from Google with rotating User-Agents"""
+     user_agents = [
+         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
+     ]
+
      headers = {
+         'User-Agent': random.choice(user_agents),
+         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate',
          'DNT': '1',
          'Connection': 'keep-alive',
+         'Upgrade-Insecure-Requests': '1'
      }

+     # Add search parameters
+     params = {
+         'q': query,
+         'num': num_results + 5,  # Request extra results in case some fail
+         'hl': 'en',
+         'safe': 'active'
+     }

      try:
+         response = requests.get(
+             'https://www.google.com/search',
+             headers=headers,
+             params=params,
+             timeout=30
+         )
          response.raise_for_status()
+         return response.text
+     except Exception as e:
+         print(f"Search error: {str(e)}")
+         return None

+ def search_images(query, num_images=5):
+     """Enhanced image search function"""
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+         'Accept-Language': 'en-US,en;q=0.5',
+         'Accept-Encoding': 'gzip, deflate',
+     }

+     # Format the query for URL
+     formatted_query = urllib.parse.quote(query)

+     # Multiple search URLs to try
+     search_urls = [
+         f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active",
+         f"https://www.bing.com/images/search?q={formatted_query}&safesearch=strict",
+     ]
+
+     results = []
+     for search_url in search_urls:
+         if len(results) >= num_images:
+             break
+
+         try:
+             response = requests.get(search_url, headers=headers, timeout=30)
+             response.raise_for_status()
+
+             # Find image URLs using multiple regex patterns
+             patterns = [
+                 r'https?://[^"\']*?(?:jpg|jpeg|png|gif)',
+                 r'"ou":"(https?://[^"]*?(?:jpg|jpeg|png|gif))"',
+                 r'murl&quot;:&quot;(.*?)&quot;'
+             ]
+
+             image_urls = []
+             for pattern in patterns:
+                 found_urls = re.findall(pattern, response.text)
+                 # re.findall returns a flat list of URL strings for each of these patterns
+                 image_urls.extend(found_urls)
+
+             # Remove duplicates while preserving order
+             image_urls = list(dict.fromkeys(image_urls))
+
+             for img_url in image_urls:
+                 if len(results) >= num_images:
+                     break
+
+                 try:
+                     # Skip unwanted URLs
+                     if any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com', 'bing.com']):
+                         continue
+
+                     # Download image with timeout
+                     img_response = requests.get(img_url, headers=headers, timeout=10)
+                     img_response.raise_for_status()
+
+                     # Verify content type
+                     content_type = img_response.headers.get('Content-Type', '')
+                     if not content_type.startswith('image/'):
+                         continue
+
+                     # Check minimum image size (1KB)
+                     if len(img_response.content) < 1024:
+                         continue
+
+                     # Convert to base64
+                     image_base64 = base64.b64encode(img_response.content).decode('utf-8')
+
+                     results.append({
+                         'image_url': img_url,
+                         'base64_data': f"data:{content_type};base64,{image_base64}",
+                         'size': len(img_response.content),
+                         'content_type': content_type
+                     })
+
+                     # Random delay between downloads
+                     time.sleep(random.uniform(0.5, 1.5))
+
+                 except Exception as e:
+                     print(f"Error downloading image {img_url}: {str(e)}")
                      continue

+         except Exception as e:
+             print(f"Error with search URL {search_url}: {str(e)}")
+             continue

+     return results

+ def scrape_website(url, headers):
+     """Enhanced website scraping function"""
+     try:
+         response = requests.get(url, headers=headers, timeout=15)
+         response.raise_for_status()

+         # Detect and handle encoding
+         if 'charset' in response.headers.get('content-type', '').lower():
+             response.encoding = response.apparent_encoding

+         soup = BeautifulSoup(response.text, 'html.parser')

+         # Remove unwanted elements
+         for element in soup.find_all(['script', 'style', 'nav', 'footer', 'iframe', 'ad', '.advertisement']):
+             element.decompose()

+         # Get meta information
+         meta_data = {
+             'title': '',
+             'description': '',
+             'keywords': '',
+             'author': '',
+             'published_date': ''
+         }

+         # Title
+         if soup.title:
+             meta_data['title'] = soup.title.string

+         # Meta tags
+         meta_tags = {
+             'description': ['description', 'og:description'],
+             'keywords': ['keywords'],
+             'author': ['author', 'og:author'],
+             'published_date': ['article:published_time', 'datePublished']
+         }

+         for key, meta_names in meta_tags.items():
+             for name in meta_names:
+                 meta_tag = soup.find('meta', attrs={'name': name}) or soup.find('meta', attrs={'property': name})
+                 if meta_tag and meta_tag.get('content'):
+                     meta_data[key] = meta_tag.get('content')
+                     break

+         # Extract main content
+         main_content = ''
+         content_tags = soup.find_all(['p', 'article', 'section', 'div'], class_=re.compile(r'(content|article|post|entry)'))

+         if content_tags:
+             for tag in content_tags:
+                 main_content += ' ' + tag.get_text()
+         else:
+             # Fallback to all paragraph tags
+             main_content = ' '.join(p.get_text() for p in soup.find_all('p'))

+         # Clean the text
+         main_content = clean_text(main_content)

+         return {
+             'title': clean_text(meta_data['title']),
+             'meta_description': clean_text(meta_data['description']),
+             'keywords': clean_text(meta_data['keywords']),
+             'author': clean_text(meta_data['author']),
+             'published_date': meta_data['published_date'],
+             'content': main_content[:2000],  # First 2000 characters
+             'url': url,
+             'domain': get_domain(url)
+         }

      except Exception as e:
+         print(f"Error scraping {url}: {str(e)}")
+         return None

  def clean_text(text):
+     """Enhanced text cleaning function"""
+     if not text:
+         return ''

+     # Convert to string if not already
+     text = str(text)

+     # Remove HTML tags
+     text = re.sub(r'<[^>]+>', '', text)

+     # Remove extra whitespace
+     text = re.sub(r'\s+', ' ', text)

+     # Remove special characters but keep basic punctuation
+     text = re.sub(r'[^\w\s.,!?-]', '', text)

+     # Remove multiple punctuation
+     text = re.sub(r'([.,!?])\1+', r'\1', text)

+     return text.strip()

+ def get_domain(url):
+     """Extract and format domain from URL"""
+     try:
+         parsed_uri = urlparse(url)
+         domain = parsed_uri.netloc
+         # Remove 'www.' if present
+         domain = re.sub(r'^www\.', '', domain)
+         return domain
+     except:
+         return url

  def search_and_scrape(query, num_results=5):
+     """Enhanced search and scrape function"""
      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      }

+     # Get search results HTML
+     search_html = get_google_search_results(query, num_results)
+     if not search_html:
+         return []

+     soup = BeautifulSoup(search_html, 'html.parser')
+     search_results = []
+     seen_domains = set()

+     # Find all search result divs
+     for result in soup.find_all('div', class_=['g', 'tF2Cxc']):
+         if len(search_results) >= num_results:
+             break

+         try:
              # Find the link
+             link = result.find('a')
              if not link:
                  continue

              href = link.get('href', '')

+             # Basic URL validation
+             if not href.startswith('http') or any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
+                 continue
+
+             # Check for duplicate domains
+             domain = get_domain(href)
+             if domain in seen_domains:
                  continue
+             seen_domains.add(domain)

+             # Random delay between requests
              time.sleep(random.uniform(1, 2))

              # Scrape the website
              site_data = scrape_website(href, headers)
+             if site_data and site_data['content']:
                  search_results.append(site_data)

+         except Exception as e:
+             print(f"Error processing search result: {str(e)}")
+             continue
+
+     return search_results
+
+ @app.route('/search_images', methods=['GET'])
+ def api_search_images():
+     """API endpoint for image search"""
+     try:
+         query = request.args.get('query', '')
+         num_images = int(request.args.get('num_images', 5))
+
+         if not query:
+             return jsonify({'error': 'Query parameter is required'}), 400
+
+         if num_images < 1 or num_images > 20:
+             return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

+         results = search_images(query, num_images)
+
+         return jsonify({
+             'success': True,
+             'query': query,
+             'count': len(results),
+             'results': results
+         })

      except Exception as e:
+         return jsonify({
+             'success': False,
+             'error': str(e)
+         }), 500

+ @app.route('/scrape_sites', methods=['GET'])
  def api_scrape_sites():
+     """API endpoint for web scraping"""
      try:
          query = request.args.get('query', '')
          num_results = int(request.args.get('num_results', 5))

          if num_results < 1 or num_results > 10:
              return jsonify({'error': 'Number of results must be between 1 and 10'}), 400

          results = search_and_scrape(query, num_results)

          return jsonify({
              'success': True,
              'query': query,
+             'count': len(results),
              'results': results
          })

              'success': False,
              'error': str(e)
          }), 500
+
  if __name__ == '__main__':
      app.run(host='0.0.0.0', port=5000)
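
A quick way to exercise the two endpoints added in this commit, sketched with the requests library and assuming the app is running locally on port 5000 as configured in app.run; the base URL and the example queries are illustrative, not part of the commit:

import requests

BASE_URL = 'http://localhost:5000'  # assumed local address; adjust for your deployment

# GET /scrape_sites: search Google, scrape the top results, and return cleaned text plus metadata
resp = requests.get(f'{BASE_URL}/scrape_sites', params={'query': 'flask tutorial', 'num_results': 3})
data = resp.json()
print(data['count'], 'sites scraped')

# GET /search_images: fetch a few images and return them as base64 data URIs
resp = requests.get(f'{BASE_URL}/search_images', params={'query': 'sunset', 'num_images': 2})
for item in resp.json()['results']:
    print(item['image_url'], item['content_type'], item['size'])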