Pamudu13 committed
Commit 61467ea · verified · 1 Parent(s): 83f0301

Update app.py

Files changed (1)
  1. app.py +0 -316

app.py CHANGED
@@ -14,30 +14,6 @@ import logging
 import queue
 from huggingface_hub import HfApi
 
-# Create a logging filter to suppress socket warnings
-class SocketWarningFilter(logging.Filter):
-    def filter(self, record):
-        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())
-
-# Create a queue for log messages
-log_queue = queue.Queue()
-
-# Custom log handler that puts messages in the queue
-class QueueHandler(logging.Handler):
-    def emit(self, record):
-        log_entry = self.format(record)
-        log_queue.put(log_entry)
-
-# Set up logging with the custom handler
-logger = logging.getLogger()
-queue_handler = QueueHandler()
-queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
-logger.addHandler(queue_handler)
-logger.setLevel(logging.INFO)
-
-# Also add the filter to the root logger to catch all socket warnings
-logging.getLogger().addFilter(SocketWarningFilter())
 
 app = Flask(__name__)
 # Enable CORS with specific settings
@@ -49,81 +25,6 @@ CORS(app, resources={
     }
 })
 
-def search_images(query, num_images=5):
-    # Headers to mimic a browser request
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    # Format the query for URL
-    formatted_query = urllib.parse.quote(query)
-
-    # Google Images URL
-    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
-
-    try:
-        # Get the HTML content
-        response = requests.get(url, headers=headers, timeout=30)
-        response.raise_for_status()
-
-        # Find all image URLs using regex
-        image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)
-
-        # Remove duplicates while preserving order
-        image_urls = list(dict.fromkeys(image_urls))
-
-        # Store results
-        results = []
-        downloaded = 0
-
-        for img_url in image_urls:
-            if downloaded >= num_images:
-                break
-
-            try:
-                # Skip small thumbnails and icons
-                if 'gstatic.com' in img_url or 'google.com' in img_url:
-                    continue
-
-                # Download image
-                img_response = requests.get(img_url, headers=headers, timeout=10)
-                img_response.raise_for_status()
-
-                # Check if the response is actually an image
-                content_type = img_response.headers.get('Content-Type', '')
-                if not content_type.startswith('image/'):
-                    continue
-
-                # Convert image to base64
-                image_base64 = base64.b64encode(img_response.content).decode('utf-8')
-
-                # Add to results
-                results.append({
-                    'image_url': img_url,
-                    'base64_data': f"data:{content_type};base64,{image_base64}"
-                })
-
-                downloaded += 1
-
-                # Add a random delay between downloads
-                time.sleep(random.uniform(0.5, 1))
-
-            except Exception as e:
-                logger.error(f"Error downloading image: {str(e)}")
-                continue
-
-        return results
-
-    except Exception as e:
-        logger.error(f"An error occurred: {str(e)}")
-        return []
-
-
 
 HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure you set the HF_TOKEN in your environment
 
@@ -189,223 +90,6 @@ def get_live_space_status():
     return Response(stream_with_context(generate()), mimetype='text/event-stream')
 
 
-
-
-@app.route('/search_images', methods=['GET'])
-def api_search_images():
-    try:
-        # Get query parameters
-        query = request.args.get('query', '')
-        num_images = int(request.args.get('num_images', 5))
-
-        if not query:
-            return jsonify({'error': 'Query parameter is required'}), 400
-
-        if num_images < 1 or num_images > 20:
-            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400
-
-        # Search for images
-        results = search_images(query, num_images)
-
-        response = jsonify({
-            'success': True,
-            'query': query,
-            'results': results
-        })
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-    except Exception as e:
-        logger.error(f"Error in search_images: {str(e)}")
-        response = jsonify({
-            'success': False,
-            'error': str(e)
-        }), 500
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-def scrape_site_content(query, num_sites=5):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    results = []
-    scraped = 0
-    retries = 2  # Number of retries per URL
-    timeout = 5  # Reduced timeout to 5 seconds
-
-    try:
-        # Get more URLs than needed to account for failures
-        search_results = list(search(query, num_results=num_sites * 2))
-
-        # Process each found URL
-        for url in search_results:
-            if scraped >= num_sites:
-                break
-
-            success = False
-            for attempt in range(retries):
-                try:
-                    # Get the HTML content
-                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
-                    logger.info(f"Scraping URL: {url}")
-                    response = requests.get(
-                        url,
-                        headers=headers,
-                        timeout=timeout,
-                        verify=False  # Skip SSL verification
-                    )
-                    response.raise_for_status()
-
-                    # Verify it's HTML content
-                    content_type = response.headers.get('Content-Type', '').lower()
-                    if 'text/html' not in content_type:
-                        logger.info(f"Skipping {url} - not HTML content")
-                        break
-
-                    # Parse the HTML content
-                    soup = BeautifulSoup(response.text, 'html.parser')
-
-                    # Remove script and style elements
-                    for script in soup(["script", "style"]):
-                        script.decompose()
-
-                    # Extract text content (limit to first 10000 characters)
-                    text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
-                    # Skip if not enough content
-                    if len(text_content.split()) < 100:  # Skip if less than 100 words
-                        logger.info(f"Skipping {url} - not enough content")
-                        break
-
-                    # Extract all links (limit to first 10)
-                    links = []
-                    for link in soup.find_all('a', href=True)[:10]:
-                        href = link['href']
-                        if href.startswith('http'):
-                            links.append({
-                                'text': link.get_text(strip=True),
-                                'url': href
-                            })
-
-                    # Extract meta information
-                    title = soup.title.string if soup.title else ''
-                    meta_description = ''
-                    meta_keywords = ''
-
-                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-                    if meta_desc_tag:
-                        meta_description = meta_desc_tag.get('content', '')
-
-                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-                    if meta_keywords_tag:
-                        meta_keywords = meta_keywords_tag.get('content', '')
-
-                    results.append({
-                        'url': url,
-                        'title': title,
-                        'meta_description': meta_description,
-                        'meta_keywords': meta_keywords,
-                        'text_content': text_content,
-                        'links': links
-                    })
-
-                    scraped += 1
-                    success = True
-                    # Add a random delay between scrapes
-                    time.sleep(random.uniform(0.5, 1))
-                    break  # Break retry loop on success
-
-                except requests.Timeout:
-                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
-                    if attempt == retries - 1:  # Last attempt
-                        print(f"Skipping {url} after {retries} timeout attempts")
-                except requests.RequestException as e:
-                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
-                    if attempt == retries - 1:  # Last attempt
-                        print(f"Skipping {url} after {retries} failed attempts")
-
-                # Add a longer delay between retries
-                if not success and attempt < retries - 1:
-                    time.sleep(random.uniform(1, 2))
-
-            # If we haven't found enough valid content and have more URLs, continue
-            if scraped < num_sites and len(results) < len(search_results):
-                continue
-
-        return results
-
-    except Exception as e:
-        print(f"Error in search/scraping process: {str(e)}")
-        # Return whatever results we've managed to gather
-        return results
-
-
-@app.route('/scrape_sites', methods=['GET'])
-def api_scrape_sites():
-    try:
-        # Get query parameters
-        query = request.args.get('query', '')
-        num_sites = int(request.args.get('num_sites', 10))
-
-        if not query:
-            return jsonify({'error': 'Query parameter is required'}), 400
-
-        if num_sites < 1 or num_sites > 20:
-            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400
-
-        # Scrape the websites
-        results = scrape_site_content(query, num_sites)
-
-        response = jsonify({
-            'success': True,
-            'query': query,
-            'results': results
-        })
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-    except Exception as e:
-        logger.error(f"Error in api_scrape_sites: {str(e)}")
-        response = jsonify({
-            'success': False,
-            'error': str(e)
-        }), 500
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-@app.route('/logs/stream')
-def stream_logs():
-    def generate():
-        while True:
-            try:
-                # Get log message from queue, timeout after 1 second
-                log_message = log_queue.get(timeout=1)
-                yield f"data: {log_message}\n\n"
-            except queue.Empty:
-                # Send a heartbeat to keep the connection alive
-                yield "data: heartbeat\n\n"
-            except GeneratorExit:
-                break
-
-    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
-    response.headers['Cache-Control'] = 'no-cache'
-    response.headers['Connection'] = 'keep-alive'
-    return response
-
 if __name__ == '__main__':
     logger.info("Starting Flask API server...")
     app.run(host='0.0.0.0', port=5001, debug=True)
 