Pamudu13 committed
Commit 4ce93da · verified · 1 Parent(s): 536e1c8

Update app.py

Files changed (1)
  1. app.py +153 -22
app.py CHANGED
@@ -1,4 +1,5 @@
-from flask import Flask, jsonify, request
+from flask import Flask, jsonify, request, Response, stream_with_context
+from flask_cors import CORS
 import requests
 from bs4 import BeautifulSoup
 import os
@@ -9,8 +10,43 @@ import random
 import base64
 from io import BytesIO
 from googlesearch import search
+import logging
+import queue
+
+# Create a logging filter to suppress socket warnings
+class SocketWarningFilter(logging.Filter):
+    def filter(self, record):
+        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())
+
+# Create a queue for log messages
+log_queue = queue.Queue()
+
+# Custom log handler that puts messages in the queue
+class QueueHandler(logging.Handler):
+    def emit(self, record):
+        log_entry = self.format(record)
+        log_queue.put(log_entry)
+
+# Set up logging with the custom handler
+logger = logging.getLogger()
+queue_handler = QueueHandler()
+queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
+logger.addHandler(queue_handler)
+logger.setLevel(logging.INFO)
+
+# Also add the filter to the root logger to catch all socket warnings
+logging.getLogger().addFilter(SocketWarningFilter())
 
 app = Flask(__name__)
+# Enable CORS with specific settings
+CORS(app, resources={
+    r"/*": {
+        "origins": "*",
+        "methods": ["GET", "POST", "OPTIONS"],
+        "allow_headers": ["Content-Type", "Authorization"]
+    }
+})
 
 def search_images(query, num_images=5):
     # Headers to mimic a browser request
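The queue-backed handler added above is what feeds the new /logs/stream endpoint at the bottom of this diff: every record the root logger handles is formatted and pushed onto log_queue, while the filter drops the noisy socket.send() warnings the dev server can emit. A minimal, self-contained sketch of that flow (illustrative only, not part of the commit):

import logging
import queue

log_queue = queue.Queue()

class SocketWarningFilter(logging.Filter):
    def filter(self, record):
        # Drop WARNING records that mention socket.send(); keep everything else
        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())

class QueueHandler(logging.Handler):
    def emit(self, record):
        # Format the record and park it in the queue for later streaming
        log_queue.put(self.format(record))

logger = logging.getLogger("demo")
handler = QueueHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
handler.addFilter(SocketWarningFilter())
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info("scraping started")
logger.warning("socket.send() raised an exception")  # filtered out by SocketWarningFilter

print(log_queue.get_nowait())  # the formatted INFO line
print(log_queue.empty())       # True - the warning never reached the queue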
@@ -77,13 +113,13 @@ def search_images(query, num_images=5):
                 time.sleep(random.uniform(0.5, 1))
 
             except Exception as e:
-                print(f"Error downloading image: {str(e)}")
+                logger.error(f"Error downloading image: {str(e)}")
                 continue
 
         return results
 
     except Exception as e:
-        print(f"An error occurred: {str(e)}")
+        logger.error(f"An error occurred: {str(e)}")
         return []
 
 @app.route('/search_images', methods=['GET'])
@@ -102,18 +138,27 @@ def api_search_images():
         # Search for images
         results = search_images(query, num_images)
 
-        return jsonify({
+        response = jsonify({
             'success': True,
             'query': query,
             'results': results
         })
 
+        # Add CORS headers
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        return response
+
     except Exception as e:
-        return jsonify({
+        logger.error(f"Error in search_images: {str(e)}")
+        response = jsonify({
             'success': False,
             'error': str(e)
-        }), 500
+        })
 
+        # Add CORS headers
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        return response, 500
+
 def scrape_site_content(query, num_sites=5):
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
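For reference, both JSON endpoints can be exercised over plain HTTP once the server is running; per the new __main__ block at the end of this diff it binds to 0.0.0.0:5001. The query-string parameter names below (query, num_images) are assumptions based on the handler variables, since the request-parsing code lies outside the changed hunks:

import requests

# Hypothetical client call; parameter names are assumed, not shown in this diff.
resp = requests.get(
    "http://localhost:5001/search_images",
    params={"query": "flask tutorial", "num_images": 3},
    timeout=60,
)
data = resp.json()
print(resp.status_code, data.get("success"), len(data.get("results", [])))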
@@ -126,10 +171,38 @@ def scrape_site_content(query, num_sites=5):
 
     results = []
     scraped = 0
+    max_retries = 3
+    base_delay = 5  # Base delay in seconds
 
     try:
-        # Use googlesearch-python to get URLs
-        search_results = search(query, num_results=num_sites)
+        logger.info(f"Starting to scrape content for query: {query}")
+
+        # Add initial delay before starting searches
+        initial_delay = random.uniform(2, 4)
+        logger.info(f"Initial delay of {initial_delay:.2f} seconds before starting searches...")
+        time.sleep(initial_delay)
+
+        # Use googlesearch-python to get URLs with retry logic
+        search_results = []
+        retry_count = 0
+
+        while retry_count < max_retries:
+            try:
+                search_results = list(search(query, num_results=num_sites))
+                break
+            except Exception as e:
+                retry_count += 1
+                if "429" in str(e):
+                    delay = base_delay * (2 ** retry_count)  # Exponential backoff
+                    logger.warning(f"Rate limited by Google. Waiting {delay} seconds before retry {retry_count}/{max_retries}")
+                    time.sleep(delay)
+                else:
+                    logger.error(f"Error during search (attempt {retry_count}/{max_retries}): {str(e)}")
+                    if retry_count == max_retries:
+                        raise
+                    time.sleep(base_delay)
+
+        logger.info(f"Found {len(search_results)} URLs to scrape for query: {query}")
 
         # Process each found URL
         for url in search_results:
@@ -137,17 +210,43 @@ def scrape_site_content(query, num_sites=5):
                 break
 
             try:
-                # Get the HTML content
-                response = requests.get(url, headers=headers, timeout=10)
-                response.raise_for_status()
+                logger.info(f"Attempting to scrape URL: {url}")
+
+                # Add random delay before each request
+                delay = random.uniform(1, 3)
+                logger.info(f"Waiting {delay:.2f} seconds before request...")
+                time.sleep(delay)
+
+                # Get the HTML content with retry logic
+                retry_count = 0
+                while retry_count < max_retries:
+                    try:
+                        response = requests.get(url, headers=headers, timeout=10)
+                        response.raise_for_status()
+                        break
+                    except requests.exceptions.RequestException as e:
+                        retry_count += 1
+                        if "429" in str(e):
+                            delay = base_delay * (2 ** retry_count)
+                            logger.warning(f"Rate limited. Waiting {delay} seconds before retry {retry_count}/{max_retries}")
+                            time.sleep(delay)
+                        else:
+                            logger.error(f"Request failed (attempt {retry_count}/{max_retries}): {str(e)}")
+                            if retry_count == max_retries:
+                                raise
+                            time.sleep(base_delay)
+
+                logger.info(f"Successfully retrieved content from: {url}")
 
                 # Verify it's HTML content
                 content_type = response.headers.get('Content-Type', '').lower()
                 if 'text/html' not in content_type:
+                    logger.info(f"Skipping {url} - not HTML content (Content-Type: {content_type})")
                     continue
 
                 # Parse the HTML content
                 soup = BeautifulSoup(response.text, 'html.parser')
+                logger.info(f"Successfully parsed HTML from: {url}")
 
                 # Remove script and style elements
                 for script in soup(["script", "style"]):
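The retry-with-exponential-backoff pattern above appears twice in this commit (once around search(), once around requests.get()). Sketched as a standalone helper it reads as below; this is an illustration of the pattern only, and the name retry_with_backoff is not from the commit:

import random
import time

def retry_with_backoff(fn, max_retries=3, base_delay=5):
    # Call fn(); on failure retry, doubling the wait after rate-limit ("429") errors.
    for attempt in range(1, max_retries + 1):
        try:
            return fn()
        except Exception as exc:
            if attempt == max_retries:
                raise
            if "429" in str(exc):
                delay = base_delay * (2 ** attempt)  # exponential backoff
            else:
                delay = base_delay
            time.sleep(delay + random.uniform(0, 1))  # jitter so retries do not land in lockstep

# Example: wrapping the Google search call from this diff
# search_results = retry_with_backoff(lambda: list(search(query, num_results=num_sites)))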
@@ -155,6 +254,7 @@ def scrape_site_content(query, num_sites=5):
 
                 # Extract text content (limit to the first 10,000 characters)
                 text_content = soup.get_text(separator='\n', strip=True)[:10000]
+                logger.info(f"Extracted {len(text_content)} characters of text from: {url}")
 
                 # Extract all links (limit to first 10)
                 links = []
@@ -165,6 +265,7 @@ def scrape_site_content(query, num_sites=5):
                             'text': link.get_text(strip=True),
                             'url': href
                         })
+                logger.info(f"Found {len(links)} valid links on: {url}")
 
                 # Extract meta information
                 title = soup.title.string if soup.title else ''
@@ -189,16 +290,24 @@ def scrape_site_content(query, num_sites=5):
                 })
 
                 scraped += 1
-                # Add a random delay between scrapes
-                time.sleep(random.uniform(0.5, 1))
+                logger.info(f"Successfully scraped {scraped}/{num_sites} sites. Current URL: {url}")
 
+                # Add a random delay between successful scrapes
+                delay = random.uniform(2, 4)
+                logger.info(f"Waiting {delay:.2f} seconds before next scrape...")
+                time.sleep(delay)
+
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Request failed for URL {url}: {str(e)}")
+                continue
             except Exception as e:
-                print(f"Error scraping {url}: {str(e)}")
+                logger.error(f"Error scraping {url}: {str(e)}")
                 continue
 
     except Exception as e:
-        print(f"Error in search: {str(e)}")
+        logger.error(f"Error in search: {str(e)}")
 
+    logger.info(f"Completed scraping. Successfully scraped {len(results)} out of {num_sites} sites")
     return results
 
 @app.route('/scrape_sites', methods=['GET'])
@@ -217,24 +326,46 @@ def api_scrape_sites():
         # Scrape the websites
         results = scrape_site_content(query, num_sites)
 
-        return jsonify({
+        response = jsonify({
             'success': True,
             'query': query,
             'results': results
         })
 
+        # Add CORS headers
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        return response
+
     except Exception as e:
-        return jsonify({
+        logger.error(f"Error in api_scrape_sites: {str(e)}")
+        response = jsonify({
             'success': False,
             'error': str(e)
-        }), 500
-
-
-if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=5000)
+        })
 
+        # Add CORS headers
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        return response, 500
+
+@app.route('/logs/stream')
+def stream_logs():
+    def generate():
+        while True:
+            try:
+                # Get log message from queue, timeout after 1 second
+                log_message = log_queue.get(timeout=1)
+                yield f"data: {log_message}\n\n"
+            except queue.Empty:
+                # Send a heartbeat to keep the connection alive
+                yield "data: heartbeat\n\n"
+            except GeneratorExit:
+                break
+
+    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
+    response.headers['Cache-Control'] = 'no-cache'
+    response.headers['Connection'] = 'keep-alive'
+    return response
+
+if __name__ == '__main__':
+    logger.info("Starting Flask API server...")
+    app.run(host='0.0.0.0', port=5001, debug=True)
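The new /logs/stream route is a Server-Sent Events feed: each queued log line is emitted as a data: frame, with a heartbeat frame roughly once a second while the queue is idle. A browser can subscribe with the standard EventSource API; below is a small Python client sketch (not part of the commit; host and port taken from the __main__ block above):

import requests

# Stream the live log feed; each SSE frame arrives as a "data: ..." line.
with requests.get("http://localhost:5001/logs/stream", stream=True, timeout=(5, None)) as resp:
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith("data: "):
            continue  # skip blank separators between frames
        payload = raw[len("data: "):]
        if payload == "heartbeat":
            continue  # keep-alive, no log entry
        print(payload)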