Update app.py
app.py
CHANGED
@@ -1,411 +1,323 @@
-from flask import Flask,
 import time
-import random
-import base64
-from io import BytesIO
-from googlesearch import search
 import logging
-
-# Custom log handler that puts messages in the queue
-class QueueHandler(logging.Handler):
-    def emit(self, record):
-        log_entry = self.format(record)
-        log_queue.put(log_entry)
-
-# Set up logging with the custom handler
-logger = logging.getLogger()
-queue_handler = QueueHandler()
-queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
-logger.addHandler(queue_handler)
-logger.setLevel(logging.INFO)
-
-# Also add the filter to the root logger to catch all socket warnings
-logging.getLogger().addFilter(SocketWarningFilter())

 app = Flask(__name__)
-# Enable CORS with specific settings
-CORS(app, resources={
-    r"/*": {
-        "origins": "*",
-        "methods": ["GET", "POST", "OPTIONS"],
-        "allow_headers": ["Content-Type", "Authorization"]
-    }
-})
-
-def search_images(query, num_images=5):
-    # Headers to mimic a browser request
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    # Format the query for URL
-    formatted_query = urllib.parse.quote(query)
-
-    # Google Images URL
-    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

     try:
-        results = []
-        downloaded = 0
-                break
         try:
-                content_type = img_response.headers.get('Content-Type', '')
-                if not content_type.startswith('image/'):
-                    continue
-
-                # Convert image to base64
-                image_base64 = base64.b64encode(img_response.content).decode('utf-8')
-
-                # Add to results
-                results.append({
-                    'image_url': img_url,
-                    'base64_data': f"data:{content_type};base64,{image_base64}"
                 })
-
-                downloaded += 1
-
-                # Add a random delay between downloads
-                time.sleep(random.uniform(0.5, 1))
-
         except Exception as e:
-            logger.error(f"Error
     except Exception as e:
-        logger.error(f"
-
-HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure you set the HF_TOKEN in your environment
-
-@app.route('/restart_space', methods=['POST'])
-def api_restart_space():
-    """API route to restart a Hugging Face Space."""
-    space_id = 'Pamudu13/web-scraper'
-    factory_reboot = request.json.get('factory_reboot', False)  # Optional: Set to True if you want a factory reboot
-
-    if not space_id:
-        return jsonify({'error': 'space_id parameter is required'}), 400

     try:
     except Exception as e:

     try:
-        # Search for images
-        results = search_images(query, num_images)
-
-        response = jsonify({
-            'success': True,
-            'query': query,
-            'results': results
         })

-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-    except Exception as e:
-        logger.error(f"Error in search_images: {str(e)}")
-        response = jsonify({
-            'success': False,
-            'error': str(e)
-        }), 500
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-def scrape_site_content(query, num_sites=5):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    results = []
-    scraped = 0
-    retries = 2  # Number of retries per URL
-    timeout = 5  # Reduced timeout to 5 seconds
-
-    try:
-        # Get more URLs than needed to account for failures
-        search_results = list(search(query, num_results=num_sites * 2))
-
-        # Process each found URL
-        for url in search_results:
-            if scraped >= num_sites:
-                break
-
-            success = False
-            for attempt in range(retries):
-                try:
-                    # Get the HTML content
-                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
-                    logger.info(f"Scraping URL: {url}")
-                    response = requests.get(
-                        url,
-                        headers=headers,
-                        timeout=timeout,
-                        verify=False  # Skip SSL verification
-                    )
-                    response.raise_for_status()
-
-                    # Verify it's HTML content
-                    content_type = response.headers.get('Content-Type', '').lower()
-                    if 'text/html' not in content_type:
-                        logger.info(f"Skipping {url} - not HTML content")
-                        break
-
-                    # Parse the HTML content
-                    soup = BeautifulSoup(response.text, 'html.parser')
-
-                    # Remove script and style elements
-                    for script in soup(["script", "style"]):
-                        script.decompose()
-
-                    # Extract text content (limit to first 10000 characters)
-                    text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
-                    # Skip if not enough content
-                    if len(text_content.split()) < 100:  # Skip if less than 100 words
-                        logger.info(f"Skipping {url} - not enough content")
-                        break
-
-                    # Extract all links (limit to first 10)
-                    links = []
-                    for link in soup.find_all('a', href=True)[:10]:
-                        href = link['href']
-                        if href.startswith('http'):
-                            links.append({
-                                'text': link.get_text(strip=True),
-                                'url': href
-                            })
-
-                    # Extract meta information
-                    title = soup.title.string if soup.title else ''
-                    meta_description = ''
-                    meta_keywords = ''
-
-                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-                    if meta_desc_tag:
-                        meta_description = meta_desc_tag.get('content', '')
-
-                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-                    if meta_keywords_tag:
-                        meta_keywords = meta_keywords_tag.get('content', '')
-
-                    results.append({
-                        'url': url,
-                        'title': title,
-                        'meta_description': meta_description,
-                        'meta_keywords': meta_keywords,
-                        'text_content': text_content,
-                        'links': links
-                    })
-
-                    scraped += 1
-                    success = True
-                    # Add a random delay between scrapes
-                    time.sleep(random.uniform(0.5, 1))
-                    break  # Break retry loop on success
-
-                except requests.Timeout:
-                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
-                    if attempt == retries - 1:  # Last attempt
-                        print(f"Skipping {url} after {retries} timeout attempts")
-                except requests.RequestException as e:
-                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
-                    if attempt == retries - 1:  # Last attempt
-                        print(f"Skipping {url} after {retries} failed attempts")
-
-                # Add a longer delay between retries
-                if not success and attempt < retries - 1:
-                    time.sleep(random.uniform(1, 2))
-
-            # If we haven't found enough valid content and have more URLs, continue
-            if scraped < num_sites and len(results) < len(search_results):
-                continue
-
-        return results

     except Exception as e:

-def api_scrape_sites():
     try:
-        return jsonify({'error': '

     except Exception as e:
-        logger.error(f"Error
-            'error': str(e)
-        }), 500
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-@app.route('/logs/stream')
-def stream_logs():
-    def generate():
-        while True:
-            try:
-                # Get log message from queue, timeout after 1 second
-                log_message = log_queue.get(timeout=1)
-                yield f"data: {log_message}\n\n"
-            except queue.Empty:
-                # Send a heartbeat to keep the connection alive
-                yield "data: heartbeat\n\n"
-            except GeneratorExit:
-                break
-
-    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
-    response.headers['Cache-Control'] = 'no-cache'
-    response.headers['Connection'] = 'keep-alive'
-    return response

 if __name__ == '__main__':
-    logger.info("Starting
-    app.run(host='0.0.0.0', port=5001,
+from flask import Flask, request, jsonify
+from scrapy import Spider, Request
+from scrapy.crawler import CrawlerRunner
+from scrapy.utils.project import get_project_settings
+from twisted.internet import reactor
+from twisted.internet.defer import inlineCallbacks, returnValue, Deferred
+from urllib.parse import urljoin, urlparse
+import json
+import threading
 import time
 import logging
+import traceback
+from queue import Queue
+from functools import wraps
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger(__name__)

 app = Flask(__name__)

+# Thread-safe queue for results
+result_queue = Queue()
+
+class URLSpider(Spider):
+    name = 'url_spider'
+    found_urls = set()  # Class variable to store all found URLs
+
+    def __init__(self, start_url=None, max_urls=10, *args, **kwargs):
+        super(URLSpider, self).__init__(*args, **kwargs)
+        self.start_urls = [start_url]
+        self.allowed_domain = urlparse(start_url).netloc
+        self.max_urls = max_urls
+        self.url_count = 0
+        logger.info(f"Starting spider for URL: {start_url} with max_urls={max_urls}")
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield Request(url, callback=self.parse, dont_filter=True, errback=self.handle_error)
+
+    def handle_error(self, failure):
+        logger.error(f"Request failed: {failure.value}")
+        return None
+
+    def parse(self, response):
+        try:
+            if self.url_count >= self.max_urls:
+                logger.info(f"Reached maximum URL limit ({self.max_urls}). Stopping crawl.")
+                return
+
+            links = response.css('a::attr(href)').getall()
+            logger.info(f"Found {len(links)} links on {response.url}")
+
+            for link in links:
+                if self.url_count >= self.max_urls:
+                    return
+
+                absolute_url = urljoin(response.url, link)
+                parsed_url = urlparse(absolute_url)
+
+                if parsed_url.netloc == self.allowed_domain and absolute_url not in self.found_urls:
+                    self.found_urls.add(absolute_url)
+                    self.url_count += 1
+                    logger.info(f"Found URL ({self.url_count}/{self.max_urls}): {absolute_url}")
+
+                    if self.url_count < self.max_urls:
+                        logger.info(f"Following link: {absolute_url}")
+                        yield Request(absolute_url, callback=self.parse, errback=self.handle_error)
+        except Exception as e:
+            logger.error(f"Error in parse method: {str(e)}")
+            traceback.print_exc()
+
+def run_spider(url, max_urls):
     try:
+        settings = get_project_settings()
+        settings.update({
+            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'LOG_ENABLED': True,
+            'LOG_LEVEL': 'INFO',
+            'ROBOTSTXT_OBEY': True,
+            'CONCURRENT_REQUESTS': 16,
+            'DOWNLOAD_TIMEOUT': 30,
+            'RETRY_TIMES': 3,
+        })

+        runner = CrawlerRunner(settings)

+        # Create a deferred to store the results
+        results = {'urls': set()}

+        def crawler_callback(result):
             try:
+                # Get URLs from the spider's class variable
+                urls = list(URLSpider.found_urls)
+                logger.info(f"Crawling completed. Found {len(urls)} URLs.")
+
+                # Put the results in the queue
+                result_queue.put({
+                    'status': 'success',
+                    'urls': urls,
+                    'count': len(urls)
                 })
+                return result
             except Exception as e:
+                logger.error(f"Error in crawler_callback: {str(e)}")
+                traceback.print_exc()
+                result_queue.put({
+                    'status': 'error',
+                    'error': str(e),
+                    'urls': []
+                })
+                return result
+
+        # Run the spider
+        logger.info("Starting crawler...")
+        try:
+            deferred = runner.crawl(URLSpider, start_url=url, max_urls=max_urls)
+            deferred.addCallback(crawler_callback)
+            deferred.addErrback(lambda f: result_queue.put({
+                'status': 'error',
+                'error': str(f.value),
+                'urls': []
+            }))
+            return deferred
+        except Exception as e:
+            logger.error(f"Error starting crawler: {str(e)}")
+            traceback.print_exc()
+            result_queue.put({
+                'status': 'error',
+                'error': str(e),
+                'urls': []
+            })
+            return None
     except Exception as e:
+        logger.error(f"Error in run_spider: {str(e)}")
+        traceback.print_exc()
+        result_queue.put({'error': str(e)})

+@app.route('/scrape', methods=['POST'])
+def scrape_url():
     try:
+        data = request.get_json()
+        if not data:
+            logger.error("No JSON data provided in request")
+            return jsonify({'error': 'No JSON data provided'}), 400
+
+        url = data.get('url')
+        max_urls = data.get('max_urls', 50)
+
+        if not url:
+            logger.error("No URL provided in request")
+            return jsonify({'error': 'URL is required'}), 400
+
+        logger.info(f"Received scrape request for URL: {url} with max_urls={max_urls}")
+
+        # Run the spider in the reactor thread
+        reactor.callFromThread(run_spider, url, max_urls)
+
+        # Wait for results with timeout
+        try:
+            result = result_queue.get(timeout=60)
+            if 'error' in result:
+                logger.error(f"Scraping error: {result['error']}")
+                return jsonify({'error': 'Failed to scrape URL', 'details': {'error': result['error']}}), 500
+            return jsonify(result)
+        except Exception as e:
+            logger.error(f"Timeout waiting for results: {str(e)}")
+            return jsonify({'error': 'Scraping timed out'}), 500

     except Exception as e:
+        logger.error(f"Error during scraping: {str(e)}")
+        traceback.print_exc()
+        return jsonify({'error': str(e)}), 500
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    return jsonify({'status': 'ok'})
+
+def run_reactor():
+    reactor.run(installSignalHandlers=False)
+
+# Start reactor in a separate thread when the app starts
+if not reactor.running:
+    reactor_thread = threading.Thread(target=run_reactor, daemon=True)
+    reactor_thread.start()
+
+class ContentSpider(Spider):
+    name = 'content_spider'
+    content_results = {}  # Class variable to store content results
+
+    def __init__(self, urls=None, *args, **kwargs):
+        super(ContentSpider, self).__init__(*args, **kwargs)
+        self.start_urls = urls if urls else []
+        logger.info(f"Starting content spider for {len(self.start_urls)} URLs")
+
+    def parse(self, response):
+        try:
+            # Extract title
+            title = response.css('title::text').get() or ''
+
+            # Extract main content (this is a simple example, adjust selectors as needed)
+            content = ' '.join(response.css('p::text, h1::text, h2::text, h3::text, h4::text, h5::text, h6::text').getall())
+
+            # Store the result
+            self.content_results[response.url] = {
+                'title': title,
+                'content': content[:2000] + '...' if len(content) > 2000 else content,  # Limit content length
+                'status': 'success'
+            }
+
+            logger.info(f"Scraped content from {response.url}")
+        except Exception as e:
+            logger.error(f"Error scraping content from {response.url}: {str(e)}")
+            self.content_results[response.url] = {
+                'title': '',
+                'content': '',
+                'status': 'error',
+                'error': str(e)
+            }
+
+def run_content_spider(urls):
     try:
+        settings = get_project_settings()
+        settings.update({
+            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'LOG_ENABLED': True,
+            'LOG_LEVEL': 'INFO',
+            'ROBOTSTXT_OBEY': True,
+            'CONCURRENT_REQUESTS': 16,
+            'DOWNLOAD_TIMEOUT': 30,
+            'RETRY_TIMES': 3,
         })

+        runner = CrawlerRunner(settings)

+        def content_crawler_callback(result):
+            try:
+                # Get content results from the spider's class variable
+                content_results = ContentSpider.content_results
+                logger.info(f"Content scraping completed for {len(content_results)} URLs.")
+
+                # Put the results in the queue
+                result_queue.put({
+                    'status': 'success',
+                    'results': content_results
+                })
+                return result
+            except Exception as e:
+                logger.error(f"Error in content_crawler_callback: {str(e)}")
+                traceback.print_exc()
+                result_queue.put({
+                    'status': 'error',
+                    'error': str(e),
+                    'results': {}
+                })
+                return result
+
+        # Run the spider
+        logger.info("Starting content crawler...")
+        try:
+            deferred = runner.crawl(ContentSpider, urls=urls)
+            deferred.addCallback(content_crawler_callback)
+            deferred.addErrback(lambda f: result_queue.put({
+                'status': 'error',
+                'error': str(f.value),
+                'results': {}
+            }))
+            return deferred
+        except Exception as e:
+            logger.error(f"Error starting content crawler: {str(e)}")
+            traceback.print_exc()
+            result_queue.put({
+                'status': 'error',
+                'error': str(e),
+                'results': {}
+            })
+            return None
     except Exception as e:
+        logger.error(f"Error in run_content_spider: {str(e)}")
+        traceback.print_exc()
+        result_queue.put({'error': str(e)})

+@app.route('/scrape-content', methods=['POST'])
+def scrape_content():
     try:
+        data = request.get_json()
+        if not data:
+            logger.error("No JSON data provided in request")
+            return jsonify({'error': 'No JSON data provided'}), 400
+
+        urls = data.get('urls', [])
+        if not urls:
+            logger.error("No URLs provided in request")
+            return jsonify({'error': 'URLs are required'}), 400
+
+        logger.info(f"Received content scrape request for {len(urls)} URLs")
+
+        # Run the content spider in the reactor thread
+        reactor.callFromThread(run_content_spider, urls)
+
+        # Wait for results with timeout
+        try:
+            result = result_queue.get(timeout=60)
+            if 'error' in result:
+                logger.error(f"Content scraping error: {result['error']}")
+                return jsonify({'error': 'Failed to scrape content', 'details': {'error': result['error']}}), 500
+            return jsonify(result)
+        except Exception as e:
+            logger.error(f"Timeout waiting for content results: {str(e)}")
+            return jsonify({'error': 'Content scraping timed out'}), 500

     except Exception as e:
+        logger.error(f"Error during content scraping: {str(e)}")
+        traceback.print_exc()
+        return jsonify({'error': str(e)}), 500

 if __name__ == '__main__':
+    logger.info("Starting URL Scraper API on port 5001")
+    app.run(host='0.0.0.0', port=5001, threaded=True, use_reloader=False)
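
A minimal client sketch for exercising the new endpoints, assuming the service is reachable at http://localhost:5001 (the host/port from app.run above) and that the `requests` package is available on the client side; the target URL is a placeholder, not something from this Space:

# client_example.py - hedged sketch, not part of the Space itself
import requests

BASE = 'http://localhost:5001'  # assumption: default host/port from app.run

# Liveness check against the /health route
print(requests.get(f'{BASE}/health').json())

# Crawl up to 10 same-domain URLs starting from a placeholder site (/scrape route)
resp = requests.post(f'{BASE}/scrape', json={'url': 'https://example.com', 'max_urls': 10})
urls = resp.json().get('urls', [])
print(f"Found {len(urls)} URLs")

# Fetch title and text content for the discovered URLs (/scrape-content route)
resp = requests.post(f'{BASE}/scrape-content', json={'urls': urls})
for url, item in resp.json().get('results', {}).items():
    print(url, item.get('title', ''))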