NanobotzAI committed on
Commit 13a6515 · verified · 1 Parent(s): 83f0301

Update app.py

Files changed (1)
  1. app.py +295 -383
app.py CHANGED
@@ -1,411 +1,323 @@
- from flask import Flask, jsonify, request, Response, stream_with_context
- from flask_cors import CORS
- import requests
- from bs4 import BeautifulSoup
- import os
- import re
- import urllib.parse
  import time
- import random
- import base64
- from io import BytesIO
- from googlesearch import search
  import logging
- import queue
- from huggingface_hub import HfApi
-
- # Create a logging filter to suppress socket warnings
- class SocketWarningFilter(logging.Filter):
-     def filter(self, record):
-         return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())
-
- # Create a queue for log messages
- log_queue = queue.Queue()
-
- # Custom log handler that puts messages in the queue
- class QueueHandler(logging.Handler):
-     def emit(self, record):
-         log_entry = self.format(record)
-         log_queue.put(log_entry)
-
- # Set up logging with the custom handler
- logger = logging.getLogger()
- queue_handler = QueueHandler()
- queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
- queue_handler.addFilter(SocketWarningFilter()) # Add the filter to the handler
- logger.addHandler(queue_handler)
- logger.setLevel(logging.INFO)
-
- # Also add the filter to the root logger to catch all socket warnings
- logging.getLogger().addFilter(SocketWarningFilter())

  app = Flask(__name__)
- # Enable CORS with specific settings
- CORS(app, resources={
-     r"/*": {
-         "origins": "*",
-         "methods": ["GET", "POST", "OPTIONS"],
-         "allow_headers": ["Content-Type", "Authorization"]
-     }
- })
-
- def search_images(query, num_images=5):
-     # Headers to mimic a browser request
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-         'Accept-Language': 'en-US,en;q=0.5',
-         'Accept-Encoding': 'gzip, deflate',
-         'DNT': '1',
-         'Connection': 'keep-alive',
-     }
-
-     # Format the query for URL
-     formatted_query = urllib.parse.quote(query)
-
-     # Google Images URL
-     url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
      try:
-         # Get the HTML content
-         response = requests.get(url, headers=headers, timeout=30)
-         response.raise_for_status()
-
-         # Find all image URLs using regex
-         image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)
-
-         # Remove duplicates while preserving order
-         image_urls = list(dict.fromkeys(image_urls))

-         # Store results
-         results = []
-         downloaded = 0

-         for img_url in image_urls:
-             if downloaded >= num_images:
-                 break

              try:
-                 # Skip small thumbnails and icons
-                 if 'gstatic.com' in img_url or 'google.com' in img_url:
-                     continue
-
-                 # Download image
-                 img_response = requests.get(img_url, headers=headers, timeout=10)
-                 img_response.raise_for_status()
-
-                 # Check if the response is actually an image
-                 content_type = img_response.headers.get('Content-Type', '')
-                 if not content_type.startswith('image/'):
-                     continue
-
-                 # Convert image to base64
-                 image_base64 = base64.b64encode(img_response.content).decode('utf-8')
-
-                 # Add to results
-                 results.append({
-                     'image_url': img_url,
-                     'base64_data': f"data:{content_type};base64,{image_base64}"
                  })
-
-                 downloaded += 1
-
-                 # Add a random delay between downloads
-                 time.sleep(random.uniform(0.5, 1))
-
              except Exception as e:
-                 logger.error(f"Error downloading image: {str(e)}")
-                 continue
-
-         return results
-
      except Exception as e:
-         logger.error(f"An error occurred: {str(e)}")
-         return []
-
-
-
- HF_TOKEN = os.getenv("HF_TOKEN") # Make sure you set the HF_TOKEN in your environment
-
- @app.route('/restart_space', methods=['POST'])
- def api_restart_space():
-     """API route to restart a Hugging Face Space."""
-     space_id = 'Pamudu13/web-scraper'
-     factory_reboot = request.json.get('factory_reboot', False) # Optional: Set to True if you want a factory reboot
-
-     if not space_id:
-         return jsonify({'error': 'space_id parameter is required'}), 400

      try:
-         hfapi = HfApi()
-
-         # Call the restart_space method
-         res = hfapi.restart_space(
-             space_id,
-             token=HF_TOKEN,
-             factory_reboot=factory_reboot
-         )
-
-         return jsonify({
-             'success': True,
-             'message': f"Successfully restarted Space: {space_id}",
-             'response': res
-         }), 200

      except Exception as e:
-         return jsonify({
-             'success': False,
-             'message': f"Error: {str(e)}"
-         }), 500
-
- @app.route('/get_live_space_status', methods=['GET'])
- def get_live_space_status():
-     """API route to stream live status of a Hugging Face Space."""
-     space_id = request.args.get('space_id', 'Pamudu13/web-scraper') # Default to 'Pamudu13/web-scraper' if not provided
-
-     def generate():
-         while True:
-             try:
-                 # Fetch the current runtime status of the Space
-                 hf_api = HfApi()
-                 space_runtime = hf_api.get_space_runtime(repo_id=space_id)
-
-                 # Extract relevant details
-                 status = space_runtime.stage # e.g., 'BUILDING', 'RUNNING', etc.
-                 hardware = space_runtime.hardware # e.g., 'cpu-basic', 't4-medium', etc.
-
-                 # Send the status as a Server-Sent Event
-                 yield f"data: {status}\n\n"
-                 yield f"data: {hardware}\n\n"
-
-                 # Delay before checking the status again
-                 time.sleep(5) # Adjust polling interval as needed
-
-             except Exception as e:
-                 # Handle errors and send an error message
-                 yield f"data: Error: {str(e)}\n\n"
-                 break # Stop the stream in case of an error
-
-     return Response(stream_with_context(generate()), mimetype='text/event-stream')
-
-
-
- @app.route('/search_images', methods=['GET'])
- def api_search_images():
      try:
-         # Get query parameters
-         query = request.args.get('query', '')
-         num_images = int(request.args.get('num_images', 5))
-
-         if not query:
-             return jsonify({'error': 'Query parameter is required'}), 400
-
-         if num_images < 1 or num_images > 20:
-             return jsonify({'error': 'Number of images must be between 1 and 20'}), 400
-
-         # Search for images
-         results = search_images(query, num_images)
-
-         response = jsonify({
-             'success': True,
-             'query': query,
-             'results': results
          })

-         # Add CORS headers
-         response.headers['Access-Control-Allow-Origin'] = '*'
-         return response
-
-     except Exception as e:
-         logger.error(f"Error in search_images: {str(e)}")
-         response = jsonify({
-             'success': False,
-             'error': str(e)
-         }), 500
-
-         # Add CORS headers
-         response.headers['Access-Control-Allow-Origin'] = '*'
-         return response
-
- def scrape_site_content(query, num_sites=5):
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-         'Accept-Language': 'en-US,en;q=0.5',
-         'Accept-Encoding': 'gzip, deflate',
-         'DNT': '1',
-         'Connection': 'keep-alive',
-     }
-
-     results = []
-     scraped = 0
-     retries = 2 # Number of retries per URL
-     timeout = 5 # Reduced timeout to 5 seconds
-
-     try:
-         # Get more URLs than needed to account for failures
-         search_results = list(search(query, num_results=num_sites * 2))
-
-         # Process each found URL
-         for url in search_results:
-             if scraped >= num_sites:
-                 break
-
-             success = False
-             for attempt in range(retries):
-                 try:
-                     # Get the HTML content
-                     logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
-                     logger.info(f"Scraping URL: {url}")
-                     response = requests.get(
-                         url,
-                         headers=headers,
-                         timeout=timeout,
-                         verify=False # Skip SSL verification
-                     )
-                     response.raise_for_status()
-
-                     # Verify it's HTML content
-                     content_type = response.headers.get('Content-Type', '').lower()
-                     if 'text/html' not in content_type:
-                         logger.info(f"Skipping {url} - not HTML content")
-                         break
-
-                     # Parse the HTML content
-                     soup = BeautifulSoup(response.text, 'html.parser')
-
-                     # Remove script and style elements
-                     for script in soup(["script", "style"]):
-                         script.decompose()
-
-                     # Extract text content (limit to first 10000 characters)
-                     text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
-                     # Skip if not enough content
-                     if len(text_content.split()) < 100: # Skip if less than 100 words
-                         logger.info(f"Skipping {url} - not enough content")
-                         break
-
-                     # Extract all links (limit to first 10)
-                     links = []
-                     for link in soup.find_all('a', href=True)[:10]:
-                         href = link['href']
-                         if href.startswith('http'):
-                             links.append({
-                                 'text': link.get_text(strip=True),
-                                 'url': href
-                             })
-
-                     # Extract meta information
-                     title = soup.title.string if soup.title else ''
-                     meta_description = ''
-                     meta_keywords = ''
-
-                     meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-                     if meta_desc_tag:
-                         meta_description = meta_desc_tag.get('content', '')
-
-                     meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-                     if meta_keywords_tag:
-                         meta_keywords = meta_keywords_tag.get('content', '')
-
-                     results.append({
-                         'url': url,
-                         'title': title,
-                         'meta_description': meta_description,
-                         'meta_keywords': meta_keywords,
-                         'text_content': text_content,
-                         'links': links
-                     })
-
-                     scraped += 1
-                     success = True
-                     # Add a random delay between scrapes
-                     time.sleep(random.uniform(0.5, 1))
-                     break # Break retry loop on success
-
-                 except requests.Timeout:
-                     print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
-                     if attempt == retries - 1: # Last attempt
-                         print(f"Skipping {url} after {retries} timeout attempts")
-                 except requests.RequestException as e:
-                     print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
-                     if attempt == retries - 1: # Last attempt
-                         print(f"Skipping {url} after {retries} failed attempts")
-
-                 # Add a longer delay between retries
-                 if not success and attempt < retries - 1:
-                     time.sleep(random.uniform(1, 2))
-
-             # If we haven't found enough valid content and have more URLs, continue
-             if scraped < num_sites and len(results) < len(search_results):
-                 continue
-
-         return results

      except Exception as e:
-         print(f"Error in search/scraping process: {str(e)}")
-         # Return whatever results we've managed to gather
-         return results

-
- @app.route('/scrape_sites', methods=['GET'])
- def api_scrape_sites():
      try:
-         # Get query parameters
-         query = request.args.get('query', '')
-         num_sites = int(request.args.get('num_sites', 10))
-
-         if not query:
-             return jsonify({'error': 'Query parameter is required'}), 400
-
-         if num_sites < 1 or num_sites > 20:
-             return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400
-
-         # Scrape the websites
-         results = scrape_site_content(query, num_sites)
-
-         response = jsonify({
-             'success': True,
-             'query': query,
-             'results': results
-         })
-
-         # Add CORS headers
-         response.headers['Access-Control-Allow-Origin'] = '*'
-         return response

      except Exception as e:
-         logger.error(f"Error in api_scrape_sites: {str(e)}")
-         response = jsonify({
-             'success': False,
-             'error': str(e)
-         }), 500
-
-         # Add CORS headers
-         response.headers['Access-Control-Allow-Origin'] = '*'
-         return response
-
- @app.route('/logs/stream')
- def stream_logs():
-     def generate():
-         while True:
-             try:
-                 # Get log message from queue, timeout after 1 second
-                 log_message = log_queue.get(timeout=1)
-                 yield f"data: {log_message}\n\n"
-             except queue.Empty:
-                 # Send a heartbeat to keep the connection alive
-                 yield "data: heartbeat\n\n"
-             except GeneratorExit:
-                 break
-
-     response = Response(stream_with_context(generate()), mimetype='text/event-stream')
-     response.headers['Cache-Control'] = 'no-cache'
-     response.headers['Connection'] = 'keep-alive'
-     return response

  if __name__ == '__main__':
-     logger.info("Starting Flask API server...")
-     app.run(host='0.0.0.0', port=5001, debug=True)
+ from flask import Flask, request, jsonify
+ from scrapy import Spider, Request
+ from scrapy.crawler import CrawlerRunner
+ from scrapy.utils.project import get_project_settings
+ from twisted.internet import reactor
+ from twisted.internet.defer import inlineCallbacks, returnValue, Deferred
+ from urllib.parse import urljoin, urlparse
+ import json
+ import threading
  import time
  import logging
+ import traceback
+ from queue import Queue
+ from functools import wraps
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S'
+ )
+ logger = logging.getLogger(__name__)

  app = Flask(__name__)

+ # Thread-safe queue for results
+ result_queue = Queue()
+
+ class URLSpider(Spider):
+     name = 'url_spider'
+     found_urls = set() # Class variable to store all found URLs
+
+     def __init__(self, start_url=None, max_urls=10, *args, **kwargs):
+         super(URLSpider, self).__init__(*args, **kwargs)
+         self.start_urls = [start_url]
+         self.allowed_domain = urlparse(start_url).netloc
+         self.max_urls = max_urls
+         self.url_count = 0
+         logger.info(f"Starting spider for URL: {start_url} with max_urls={max_urls}")
+
+     def start_requests(self):
+         for url in self.start_urls:
+             yield Request(url, callback=self.parse, dont_filter=True, errback=self.handle_error)
+
+     def handle_error(self, failure):
+         logger.error(f"Request failed: {failure.value}")
+         return None
+
+     def parse(self, response):
+         try:
+             if self.url_count >= self.max_urls:
+                 logger.info(f"Reached maximum URL limit ({self.max_urls}). Stopping crawl.")
+                 return
+
+             links = response.css('a::attr(href)').getall()
+             logger.info(f"Found {len(links)} links on {response.url}")
+
+             for link in links:
+                 if self.url_count >= self.max_urls:
+                     return
+
+                 absolute_url = urljoin(response.url, link)
+                 parsed_url = urlparse(absolute_url)
+
+                 if parsed_url.netloc == self.allowed_domain and absolute_url not in self.found_urls:
+                     self.found_urls.add(absolute_url)
+                     self.url_count += 1
+                     logger.info(f"Found URL ({self.url_count}/{self.max_urls}): {absolute_url}")
+
+                     if self.url_count < self.max_urls:
+                         logger.info(f"Following link: {absolute_url}")
+                         yield Request(absolute_url, callback=self.parse, errback=self.handle_error)
+         except Exception as e:
+             logger.error(f"Error in parse method: {str(e)}")
+             traceback.print_exc()
+
+ def run_spider(url, max_urls):
      try:
+         settings = get_project_settings()
+         settings.update({
+             'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'LOG_ENABLED': True,
+             'LOG_LEVEL': 'INFO',
+             'ROBOTSTXT_OBEY': True,
+             'CONCURRENT_REQUESTS': 16,
+             'DOWNLOAD_TIMEOUT': 30,
+             'RETRY_TIMES': 3,
+         })

+         runner = CrawlerRunner(settings)

+         # Create a deferred to store the results
+         results = {'urls': set()}

+         def crawler_callback(result):
              try:
+                 # Get URLs from the spider's class variable
+                 urls = list(URLSpider.found_urls)
+                 logger.info(f"Crawling completed. Found {len(urls)} URLs.")
+
+                 # Put the results in the queue
+                 result_queue.put({
+                     'status': 'success',
+                     'urls': urls,
+                     'count': len(urls)
                  })
+                 return result
              except Exception as e:
+                 logger.error(f"Error in crawler_callback: {str(e)}")
+                 traceback.print_exc()
+                 result_queue.put({
+                     'status': 'error',
+                     'error': str(e),
+                     'urls': []
+                 })
+                 return result
+
+         # Run the spider
+         logger.info("Starting crawler...")
+         try:
+             deferred = runner.crawl(URLSpider, start_url=url, max_urls=max_urls)
+             deferred.addCallback(crawler_callback)
+             deferred.addErrback(lambda f: result_queue.put({
+                 'status': 'error',
+                 'error': str(f.value),
+                 'urls': []
+             }))
+             return deferred
+         except Exception as e:
+             logger.error(f"Error starting crawler: {str(e)}")
+             traceback.print_exc()
+             result_queue.put({
+                 'status': 'error',
+                 'error': str(e),
+                 'urls': []
+             })
+             return None
      except Exception as e:
+         logger.error(f"Error in run_spider: {str(e)}")
+         traceback.print_exc()
+         result_queue.put({'error': str(e)})

+ @app.route('/scrape', methods=['POST'])
+ def scrape_url():
      try:
+         data = request.get_json()
+         if not data:
+             logger.error("No JSON data provided in request")
+             return jsonify({'error': 'No JSON data provided'}), 400
+
+         url = data.get('url')
+         max_urls = data.get('max_urls', 50)
+
+         if not url:
+             logger.error("No URL provided in request")
+             return jsonify({'error': 'URL is required'}), 400
+
+         logger.info(f"Received scrape request for URL: {url} with max_urls={max_urls}")
+
+         # Run the spider in the reactor thread
+         reactor.callFromThread(run_spider, url, max_urls)
+
+         # Wait for results with timeout
+         try:
+             result = result_queue.get(timeout=60)
+             if 'error' in result:
+                 logger.error(f"Scraping error: {result['error']}")
+                 return jsonify({'error': 'Failed to scrape URL', 'details': {'error': result['error']}}), 500
+             return jsonify(result)
+         except Exception as e:
+             logger.error(f"Timeout waiting for results: {str(e)}")
+             return jsonify({'error': 'Scraping timed out'}), 500

      except Exception as e:
+         logger.error(f"Error during scraping: {str(e)}")
+         traceback.print_exc()
+         return jsonify({'error': str(e)}), 500
+
+ @app.route('/health', methods=['GET'])
+ def health_check():
+     return jsonify({'status': 'ok'})
+
+ def run_reactor():
+     reactor.run(installSignalHandlers=False)
+
+ # Start reactor in a separate thread when the app starts
+ if not reactor.running:
+     reactor_thread = threading.Thread(target=run_reactor, daemon=True)
+     reactor_thread.start()
+
+ class ContentSpider(Spider):
+     name = 'content_spider'
+     content_results = {} # Class variable to store content results
+
+     def __init__(self, urls=None, *args, **kwargs):
+         super(ContentSpider, self).__init__(*args, **kwargs)
+         self.start_urls = urls if urls else []
+         logger.info(f"Starting content spider for {len(self.start_urls)} URLs")
+
+     def parse(self, response):
+         try:
+             # Extract title
+             title = response.css('title::text').get() or ''
+
+             # Extract main content (this is a simple example, adjust selectors as needed)
+             content = ' '.join(response.css('p::text, h1::text, h2::text, h3::text, h4::text, h5::text, h6::text').getall())
+
+             # Store the result
+             self.content_results[response.url] = {
+                 'title': title,
+                 'content': content[:2000] + '...' if len(content) > 2000 else content, # Limit content length
+                 'status': 'success'
+             }
+
+             logger.info(f"Scraped content from {response.url}")
+         except Exception as e:
+             logger.error(f"Error scraping content from {response.url}: {str(e)}")
+             self.content_results[response.url] = {
+                 'title': '',
+                 'content': '',
+                 'status': 'error',
+                 'error': str(e)
+             }
+
+ def run_content_spider(urls):
      try:
+         settings = get_project_settings()
+         settings.update({
+             'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'LOG_ENABLED': True,
+             'LOG_LEVEL': 'INFO',
+             'ROBOTSTXT_OBEY': True,
+             'CONCURRENT_REQUESTS': 16,
+             'DOWNLOAD_TIMEOUT': 30,
+             'RETRY_TIMES': 3,
          })

+         runner = CrawlerRunner(settings)

+         def content_crawler_callback(result):
+             try:
+                 # Get content results from the spider's class variable
+                 content_results = ContentSpider.content_results
+                 logger.info(f"Content scraping completed for {len(content_results)} URLs.")
+
+                 # Put the results in the queue
+                 result_queue.put({
+                     'status': 'success',
+                     'results': content_results
+                 })
+                 return result
+             except Exception as e:
+                 logger.error(f"Error in content_crawler_callback: {str(e)}")
+                 traceback.print_exc()
+                 result_queue.put({
+                     'status': 'error',
+                     'error': str(e),
+                     'results': {}
+                 })
+                 return result
+
+         # Run the spider
+         logger.info("Starting content crawler...")
+         try:
+             deferred = runner.crawl(ContentSpider, urls=urls)
+             deferred.addCallback(content_crawler_callback)
+             deferred.addErrback(lambda f: result_queue.put({
+                 'status': 'error',
+                 'error': str(f.value),
+                 'results': {}
+             }))
+             return deferred
+         except Exception as e:
+             logger.error(f"Error starting content crawler: {str(e)}")
+             traceback.print_exc()
+             result_queue.put({
+                 'status': 'error',
+                 'error': str(e),
+                 'results': {}
+             })
+             return None
      except Exception as e:
+         logger.error(f"Error in run_content_spider: {str(e)}")
+         traceback.print_exc()
+         result_queue.put({'error': str(e)})

+ @app.route('/scrape-content', methods=['POST'])
+ def scrape_content():
      try:
+         data = request.get_json()
+         if not data:
+             logger.error("No JSON data provided in request")
+             return jsonify({'error': 'No JSON data provided'}), 400
+
+         urls = data.get('urls', [])
+         if not urls:
+             logger.error("No URLs provided in request")
+             return jsonify({'error': 'URLs are required'}), 400
+
+         logger.info(f"Received content scrape request for {len(urls)} URLs")
+
+         # Run the content spider in the reactor thread
+         reactor.callFromThread(run_content_spider, urls)
+
+         # Wait for results with timeout
+         try:
+             result = result_queue.get(timeout=60)
+             if 'error' in result:
+                 logger.error(f"Content scraping error: {result['error']}")
+                 return jsonify({'error': 'Failed to scrape content', 'details': {'error': result['error']}}), 500
+             return jsonify(result)
+         except Exception as e:
+             logger.error(f"Timeout waiting for content results: {str(e)}")
+             return jsonify({'error': 'Content scraping timed out'}), 500

      except Exception as e:
+         logger.error(f"Error during content scraping: {str(e)}")
+         traceback.print_exc()
+         return jsonify({'error': str(e)}), 500

  if __name__ == '__main__':
+     logger.info("Starting URL Scraper API on port 5001")
+     app.run(host='0.0.0.0', port=5001, threaded=True, use_reloader=False)
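
For reference, a minimal client sketch (not part of the commit) that exercises the endpoints added above: /health, /scrape, and /scrape-content. It assumes the service from this app.py is running locally on port 5001; the base URL and the use of the requests library are illustrative assumptions.

# Illustrative client sketch, assuming the server above is reachable locally.
import requests

BASE = "http://localhost:5001"  # assumed address for local testing

# Liveness check against the /health route
print(requests.get(f"{BASE}/health", timeout=10).json())  # expected: {'status': 'ok'}

# Crawl a site for same-domain URLs via POST /scrape
crawl = requests.post(
    f"{BASE}/scrape",
    json={"url": "https://example.com", "max_urls": 10},  # example target, illustrative
    timeout=90,  # server side waits up to 60 s for crawl results
).json()
urls = crawl.get("urls", [])
print(f"Discovered {len(urls)} URLs")

# Fetch title and text for the discovered URLs via POST /scrape-content
content = requests.post(f"{BASE}/scrape-content", json={"urls": urls}, timeout=90).json()
for url, page in content.get("results", {}).items():
    print(url, page.get("title", ""))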