Hemang Thakur committed on
Commit e0ab505 · 1 Parent(s): 83e870c

commented out crawl4ai

Files changed (1)
  1. src/crawl/crawler.py +566 -566
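
The diff below comments out the entire crawl4ai-based Crawler class and the imports it needed, leaving the CustomCrawler class defined further down in the file as the active implementation. For reference, here is a minimal usage sketch of the class being disabled, reconstructed from the removed code; the URL and argument values are illustrative assumptions, not part of the commit:

    import asyncio

    # Hypothetical driver for the (now commented-out) crawl4ai-based Crawler.
    # Constructor and method names mirror the removed code shown in the diff.
    async def main():
        crawler = Crawler(user_dir=None, rate_limit=1, headless=True, verbose=True)
        session_id = crawler.create_session()
        content = await crawler.crawl_with_retry(
            "https://example.com",  # illustrative URL
            depth=1,
            max_pages=1,
            session_id=session_id,
            return_html=False,
            timeout=30.0,
        )
        print(content)
        crawler.cleanup_session(session_id)

    asyncio.run(main())
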
src/crawl/crawler.py CHANGED
@@ -1,17 +1,17 @@
- from crawl4ai import AsyncWebCrawler
- from urllib.parse import urlparse
  import aiohttp
  import asyncio
- from asyncio.exceptions import TimeoutError as async_timeout
  from fast_async import make_async
  from bs4 import BeautifulSoup, NavigableString
- import secrets
- from datetime import datetime
- import random
  import os
  import re
  import uuid
- from typing import List, Dict, Tuple, Optional
  from io import BytesIO
  import PyPDF2
  from fake_useragent import FakeUserAgent
@@ -20,597 +20,597 @@ from transformers import AutoTokenizer, AutoConfig
  import torch
  import time

- class Crawler:
- def __init__(self, user_dir=None, rate_limit=1, headless=True, verbose=False):
- self.session_pool = {} # Track active sessions
- self.verbose = verbose
- self.rate_limit = rate_limit
- self.user_dir = user_dir
- self.headless = headless
- self.crawler = AsyncWebCrawler(
- context_options={"userDataDir": self.user_dir},
- headless=self.headless,
- verbose=self.verbose
- )
-
- # Browser context management
- self._browser_contexts = {}
- self._context_locks = {}
-
- async def get_browser_context(self, session_id):
- """Get or create a browser context with proper locking"""
- if session_id not in self._context_locks:
- self._context_locks[session_id] = asyncio.Lock()

- async with self._context_locks[session_id]:
- if session_id not in self._browser_contexts:
- context = await self.crawler.new_context()
- self._browser_contexts[session_id] = context
- return self._browser_contexts[session_id]

- async def cleanup_browser_context(self, session_id):
- """Safely cleanup browser context"""
- if session_id in self._context_locks:
- async with self._context_locks[session_id]:
- if session_id in self._browser_contexts:
- try:
- await asyncio.shield(
- self._browser_contexts[session_id].close()
- )
- except Exception as e:
- print(f"Error cleaning up browser context: {e}")
- finally:
- del self._browser_contexts[session_id]
-
- def create_session(self):
- """Create a new session with secure ID"""
- session_id = secrets.token_urlsafe(32) # Secure session ID
- self.session_pool[session_id] = {
- 'created_at': datetime.now(),
- 'last_used': datetime.now(),
- 'requests_count': 0
- }
- return session_id
-
- def rotate_session(self, session_id):
- """Implement session rotation logic"""
- if self.session_pool[session_id]['requests_count'] > 100:
- self.cleanup_session(session_id)
- return self.create_session()
- return session_id
-
- def is_dynamic_page(self, html_content: str) -> Tuple[bool, Optional[str]]:
- """Analyzes HTML content to determine if a webpage is dynamically loaded"""
- def _check_structural_indicators(soup: BeautifulSoup) -> Dict[str, int]:
- """Check structural indicators of dynamic content loading."""
- scores = {
- 'empty_containers': 0,
- 'repeated_structures': 0,
- 'api_endpoints': 0,
- 'state_management': 0
- }

- # 1. Check for empty content containers
- main_containers = soup.find_all(['main', 'div', 'section'],
- class_=lambda x: x and any(term in str(x).lower()
- for term in ['content', 'main', 'feed', 'list', 'container']))

- for container in main_containers:
- # Check if container is empty or has minimal content
- if len(container.find_all()) < 3:
- scores['empty_containers'] += 1

- # Check for repeated similar structures (common in dynamic lists)
- children = container.find_all(recursive=False)
- if children:
- first_child_class = children[0].get('class', [])
- similar_siblings = [c for c in children[1:]
- if c.get('class', []) == first_child_class]
- if len(similar_siblings) > 0:
- scores['repeated_structures'] += 1
-
- # 2. Check for API endpoints in scripts
- scripts = soup.find_all('script', {'src': True})
- api_patterns = ['/api/', '/graphql', '/rest/', '/v1/', '/v2/']
- for script in scripts:
- if any(pattern in script['src'] for pattern in api_patterns):
- scores['api_endpoints'] += 1
-
- # 3. Look for state management setup
- state_patterns = [
- r'window\.__INITIAL_STATE__',
- r'window\.__PRELOADED_STATE__',
- r'__REDUX_STATE__',
- r'__NUXT__',
- r'__NEXT_DATA__',
- r'window\.__data'
- ]

- inline_scripts = soup.find_all('script')
- for script in inline_scripts:
- if script.string:
- for pattern in state_patterns:
- if re.search(pattern, script.string):
- scores['state_management'] += 1
-
- return scores
-
- def _check_modern_framework_indicators(soup: BeautifulSoup) -> Dict[str, int]:
- """Check for indicators of modern web frameworks and dynamic loading patterns."""
- scores = {
- 'framework_roots': 0,
- 'hydration': 0,
- 'routing': 0
- }

- # 1. Framework-specific root elements
- framework_roots = {
- 'react': ['react-root', 'react-app', 'root', '__next'],
- 'angular': ['ng-version', 'ng-app'],
- 'vue': ['v-app', '#app', 'nuxt-app'],
- 'modern': ['app-root', 'application', 'spa-root']
- }

- for framework, identifiers in framework_roots.items():
- for id_value in identifiers:
- if (soup.find(attrs={'id': re.compile(id_value, re.I)}) or
- soup.find(attrs={'class': re.compile(id_value, re.I)}) or
- soup.find(attrs={'data-': re.compile(id_value, re.I)})):
- scores['framework_roots'] += 1
-
- # 2. Check for hydration indicators
- hydration_patterns = [
- r'hydrate',
- r'createRoot',
- r'reactive',
- r'observable'
- ]

- scripts = soup.find_all('script')
- for script in scripts:
- if script.string:
- for pattern in hydration_patterns:
- if re.search(pattern, script.string):
- scores['hydration'] += 1
-
- # 3. Check for dynamic routing setup
- router_patterns = [
- 'router-view',
- 'router-link',
- 'route-link',
- 'history.push',
- 'navigation'
- ]

- for pattern in router_patterns:
- if soup.find(class_=re.compile(pattern, re.I)) or \
- soup.find(id=re.compile(pattern, re.I)):
- scores['routing'] += 1
-
- return scores
-
- def _check_dynamic_loading_patterns(soup: BeautifulSoup) -> Dict[str, int]:
- """Check for various dynamic content loading patterns."""
- scores = {
- 'infinite_scroll': 0,
- 'load_more_buttons': 0,
- 'pagination': 0,
- 'lazy_loading': 0,
- 'loading_indicators': 0
- }

- # 1. Check for infinite scroll indicators
- scroll_indicators = [
- 'infinite-scroll',
- 'data-infinite',
- 'data-virtualized',
- 'virtual-scroll',
- 'scroll-container',
- 'scroll-viewport'
- ]

- for indicator in scroll_indicators:
- elements = soup.find_all(
- lambda tag: any(indicator.lower() in str(v).lower()
- for v in tag.attrs.values())
- )
- if elements:
- scores['infinite_scroll'] += len(elements)
-
- # 2. Check for load more buttons
- button_patterns = [
- r'load[_-]?more',
- r'show[_-]?more',
- r'view[_-]?more',
- r'see[_-]?more',
- r'more[_-]?posts',
- r'more[_-]?results'
- ]

- for pattern in button_patterns:
- elements = soup.find_all(
- ['button', 'a', 'div', 'span'],
- text=re.compile(pattern, re.I)
- )
- if elements:
- scores['load_more_buttons'] += len(elements)
-
- # 3. Check for pagination
- pagination_patterns = [
- 'pagination',
- 'page-numbers',
- 'page-nav',
- 'page-links'
- ]

- for pattern in pagination_patterns:
- elements = soup.find_all(class_=re.compile(pattern, re.I))
- if elements:
- scores['pagination'] += len(elements)
-
- # 4. Check for lazy loading
- lazy_patterns = ['lazy', 'data-src', 'data-lazy']
- for pattern in lazy_patterns:
- elements = soup.find_all(
- lambda tag: any(pattern.lower() in str(v).lower()
- for v in tag.attrs.values())
- )
- if elements:
- scores['lazy_loading'] += len(elements)
-
- # 5. Check for loading indicators
- loading_patterns = [
- 'loading',
- 'spinner',
- 'skeleton',
- 'placeholder',
- 'shimmer'
- ]

- for pattern in loading_patterns:
- elements = soup.find_all(class_=re.compile(pattern, re.I))
- if elements:
- scores['loading_indicators'] += len(elements)
-
- return scores
-
- def _evaluate_dynamic_indicators(
- structural: Dict[str, int],
- framework: Dict[str, int],
- loading: Dict[str, int]
- ) -> Tuple[bool, Optional[str]]:
- """Evaluate dynamic indicators and return JavaScript instructions."""
- methods = []
- js_snippets = []
-
- # Infinite Scroll
- if loading['infinite_scroll'] > 0:
- methods.append("scroll")
- js_snippets.append(
- """
- window.scrollTo(0, document.body.scrollHeight);
- await new Promise(resolve => setTimeout(resolve, 1000));
- """.strip().replace('\n', '')
- )
-
- # Load More Buttons
- if loading['load_more_buttons'] > 0:
- methods.append("button")
- js_snippets.append(
- """
- const button = Array.from(document.querySelectorAll('button, a, div, span')).find(
- el => /load[_-]?more|show[_-]?more/i.test(el.textContent)
- );
- if (button) {
- button.click();
- await new Promise(resolve => setTimeout(resolve, 1000));
- } else {
- console.warn("No 'Load More' button found.");
- }
- """.strip().replace('\n', '')
- )
-
- # Paginated Interfaces
- if loading.get('pagination', 0) > 0:
- methods.append("pagination")
- js_snippets.append(
- """
- const nextPage = document.querySelector('a[rel="next"], .pagination-next, .page-next');
- if (nextPage) {
- nextPage.click();
- await new Promise(resolve => setTimeout(resolve, 1000));
- } else {
- console.warn("No pagination link found.");
- }
- """.strip().replace('\n', '')
- )
-
- # Lazy Loading
- if loading.get('lazy_loading', 0) > 0:
- methods.append("lazy")
- js_snippets.append(
- """
- if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
- console.log('Framework state detected. Consider monitoring network requests for further actions.');
- }
- """.strip().replace('\n', '')
- )
-
- # Framework and State Management Indicators
- if framework['framework_roots'] > 0 or structural['state_management'] > 0:
- methods.append("stateful")
- js_snippets.append(
- """
- if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
- console.log('Detected stateful framework data loading.');
- }
- """.strip().replace('\n', '')
- )
-
- # API-Driven Content
- if structural['api_endpoints'] > 0:
- methods.append("api")
- js_snippets.append(
- """
- console.log('API requests detected. Use browser devtools to inspect network activity for specific endpoints.');
- """.strip().replace('\n', '')
- )
-
- # Aggregate and finalize
- if methods:
- js_code = "\n".join(js_snippets)
- return True, js_code

- return False, None
- # Main execution
- soup = BeautifulSoup(html_content, 'html.parser')

- # Run all checks
- structural_scores = _check_structural_indicators(soup)
- framework_scores = _check_modern_framework_indicators(soup)
- loading_scores = _check_dynamic_loading_patterns(soup)

- # Evaluate results
- return _evaluate_dynamic_indicators(structural_scores, framework_scores, loading_scores)
-
- async def crawl(
- self,
- url,
- depth=2,
- max_pages=5,
- session_id=None,
- human_simulation=True,
- rotate_user_agent=True,
- rotate_proxy=True,
- return_html=False
- ):
- if not session_id:
- session_id = self.create_session()
-
- session_id = self.rotate_session(session_id)
-
- # List of rotating user agents
- user_agents = [
- 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
- 'Chrome/115.0.0.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
- 'Chrome/115.0.0.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
- 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
- 'Chrome/115.0.0.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
- ]
-
- # List of rotating proxies
- proxies = [
- "http://50.62.183.123:80",
- "http://104.129.60.84:6516",
- "http://156.228.118.163:3128",
- "http://142.111.104.97:6107",
- "http://156.228.99.99:3128"
- ]
-
- try:
- async with self.crawler as crawler:
- # Rotate user agent and optimize headers for each attempt
- headers = {
- "User-Agent": random.choice(user_agents) if rotate_user_agent else user_agents[0],
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
- "Accept-Language": "en-US,en;q=0.5",
- "Accept-Encoding": "gzip, deflate",
- "Connection": "keep-alive",
- "Upgrade-Insecure-Requests": "1",
- "Sec-Fetch-Dest": "document",
- "Sec-Fetch-Mode": "navigate",
- "Sec-Fetch-Site": "none",
- "Sec-Fetch-User": "?1",
- "Cache-Control": "max-age=0"
- }

- # Update crawler headers for rotation
- crawler.crawler_strategy.headers = headers
-
- if rotate_proxy:
- # Update crawler proxy for rotation
- crawler.crawler_strategy.proxy = random.choice(proxies)
-
- result_1 = await crawler.arun(
- session_id=session_id,
- url=url,
- magic=True if human_simulation else False,
- simulate_user=True if human_simulation else False,
- override_navigator=True if human_simulation else False,
- depth=depth,
- max_pages=max_pages,
- bypass_cache=True,
- remove_overlay_elements=True,
- delay_before_retrieve_html=1.0,
- verbose=self.verbose
- )
-
- # Update session metrics
- self.session_pool[session_id]['requests_count'] += 1
- self.session_pool[session_id]['last_used'] = datetime.now()
-
- if result_1.success:
- if hasattr(result_1, 'html'):
- success, js_code = self.is_dynamic_page(result_1.html)
-
- if success:
- async with crawler as crawler:
- # Update crawler headers for rotation
- crawler.crawler_strategy.headers = headers
-
- if rotate_proxy:
- # Update crawler proxy for rotation
- crawler.crawler_strategy.proxy = random.choice(proxies)
-
- print(f"Executing JS code: {js_code}")
- result_2 = await crawler.arun(
- session_id=session_id,
- url=url,
- magic=True if human_simulation else False,
- simulate_user=True if human_simulation else False,
- override_navigator=True if human_simulation else False,
- depth=depth,
- max_pages=max_pages,
- js_code=js_code,
- bypass_cache=True,
- remove_overlay_elements=True,
- delay_before_retrieve_html=1.0,
- verbose=self.verbose
- )
-
- if result_2.success:
- result = result_2
- else:
- result = result_1
-
- # Update session metrics
- self.session_pool[session_id]['requests_count'] += 1
- self.session_pool[session_id]['last_used'] = datetime.now()
-
- else:
- result = result_1

- if return_html and hasattr(result, 'html'):
- return result.html
- elif hasattr(result, 'fit_markdown'):
- return result.fit_markdown
- elif hasattr(result, 'markdown'):
- return self.extract_content(result.markdown)

- except Exception as e:
- print(f"Error crawling {url}: {str(e)}")

- return None

- async def crawl_with_retry(
- self,
- url,
- depth=2,
- max_pages=5,
- max_retries=3,
- backoff_factor=1,
- session_id=None,
- human_simulation=True,
- rotate_user_agent=True,
- rotate_proxy=True,
- return_html=False,
- timeout=10.0
- ):
- """Crawl with retry logic and anti-blocking measures"""
-
- async def attempt_crawl(attempt):
- try:
- async with async_timeout.timeout(timeout):
- context = await self.get_browser_context(session_id)
- return await self.crawl(
- context,
- url,
- depth,
- max_pages,
- session_id,
- human_simulation,
- rotate_user_agent,
- rotate_proxy,
- return_html
- )
- except asyncio.TimeoutError:
- print(f"Timeout on attempt {attempt} for {url}")
- raise
- except Exception as e:
- print(f"Error on attempt {attempt} for {url}: {e}")
- raise
-
- if not self.is_valid_url(url) and not self.is_html_url(url):
- print(f"Invalid URL: {url}")
- return f"No web results found for query: {url}"
-
- for attempt in range(max_retries):
- try:
- if attempt > 0:
- # Add delay between retries with exponential backoff
- delay = backoff_factor * (2 ** (attempt - 1))
- await asyncio.sleep(delay)

- return await attempt_crawl(attempt + 1)
- except Exception as e:
- if attempt == max_retries - 1:
- print(f"Max retries ({max_retries}) reached for {url}")
- return f"Failed to crawl after {max_retries} attempts: {url}"
- continue
-
- return f"No content found after {max_retries} attempts for: {url}"
-
- def extract_content(self, html_content):
- soup = BeautifulSoup(html_content, 'html.parser')
- for script in soup(["script", "style"]):
- script.decompose()
- text = soup.get_text()
- lines = (line.strip() for line in text.splitlines())
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
- text = '\n'.join(chunk for chunk in chunks if chunk)
- return text

- def cleanup_session(self, session_id):
- """Clean up a session"""
- print(f"Cleaning up session {session_id}")
- if session_id in self.session_pool:
- self.crawler.crawler_strategy.kill_session(session_id)
- del self.session_pool[session_id]
-
- def cleanup_expired_sessions(self):
- """Regular cleanup of expired sessions using proper time calculation"""
- try:
- current_time = datetime.now()
- expired_sessions = []

- for sid, data in self.session_pool.items():
- # Calculate time difference in seconds
- time_diff = (current_time - data['last_used']).total_seconds()

- # Check if more than 1 hour (3600 seconds)
- if time_diff > 3600:
- expired_sessions.append(sid)

- # Cleanup expired sessions
- for session_id in expired_sessions:
- self.cleanup_session(session_id)

- except Exception as e:
- if self.verbose:
- print(f"Error during session cleanup: {str(e)}")

- @staticmethod
- def is_valid_url(url):
- try:
- result = urlparse(url)
- return all([result.scheme, result.netloc])
- except ValueError:
- return False

- @staticmethod
- def is_html_url(url):
- return url.endswith(".html") or url.endswith(".htm")

  class CustomCrawler:
  def __init__(
 
+ # from crawl4ai import AsyncWebCrawler
+ # from urllib.parse import urlparse
  import aiohttp
  import asyncio
+ # from asyncio.exceptions import TimeoutError as async_timeout
  from fast_async import make_async
  from bs4 import BeautifulSoup, NavigableString
+ # import secrets
+ # from datetime import datetime
+ # import random
  import os
  import re
  import uuid
+ from typing import List, Dict, Optional #, Tuple
  from io import BytesIO
  import PyPDF2
  from fake_useragent import FakeUserAgent
 
  import torch
  import time

+ # class Crawler:
+ # def __init__(self, user_dir=None, rate_limit=1, headless=True, verbose=False):
+ # self.session_pool = {} # Track active sessions
+ # self.verbose = verbose
+ # self.rate_limit = rate_limit
+ # self.user_dir = user_dir
+ # self.headless = headless
+ # self.crawler = AsyncWebCrawler(
+ # context_options={"userDataDir": self.user_dir},
+ # headless=self.headless,
+ # verbose=self.verbose
+ # )
+
+ # # Browser context management
+ # self._browser_contexts = {}
+ # self._context_locks = {}
+
+ # async def get_browser_context(self, session_id):
+ # """Get or create a browser context with proper locking"""
+ # if session_id not in self._context_locks:
+ # self._context_locks[session_id] = asyncio.Lock()

+ # async with self._context_locks[session_id]:
+ # if session_id not in self._browser_contexts:
+ # context = await self.crawler.new_context()
+ # self._browser_contexts[session_id] = context
+ # return self._browser_contexts[session_id]

+ # async def cleanup_browser_context(self, session_id):
+ # """Safely cleanup browser context"""
+ # if session_id in self._context_locks:
+ # async with self._context_locks[session_id]:
+ # if session_id in self._browser_contexts:
+ # try:
+ # await asyncio.shield(
+ # self._browser_contexts[session_id].close()
+ # )
+ # except Exception as e:
+ # print(f"Error cleaning up browser context: {e}")
+ # finally:
+ # del self._browser_contexts[session_id]
+
+ # def create_session(self):
+ # """Create a new session with secure ID"""
+ # session_id = secrets.token_urlsafe(32) # Secure session ID
+ # self.session_pool[session_id] = {
+ # 'created_at': datetime.now(),
+ # 'last_used': datetime.now(),
+ # 'requests_count': 0
+ # }
+ # return session_id
+
+ # def rotate_session(self, session_id):
+ # """Implement session rotation logic"""
+ # if self.session_pool[session_id]['requests_count'] > 100:
+ # self.cleanup_session(session_id)
+ # return self.create_session()
+ # return session_id
+
+ # def is_dynamic_page(self, html_content: str) -> Tuple[bool, Optional[str]]:
+ # """Analyzes HTML content to determine if a webpage is dynamically loaded"""
+ # def _check_structural_indicators(soup: BeautifulSoup) -> Dict[str, int]:
+ # """Check structural indicators of dynamic content loading."""
+ # scores = {
+ # 'empty_containers': 0,
+ # 'repeated_structures': 0,
+ # 'api_endpoints': 0,
+ # 'state_management': 0
+ # }

+ # # 1. Check for empty content containers
+ # main_containers = soup.find_all(['main', 'div', 'section'],
+ # class_=lambda x: x and any(term in str(x).lower()
+ # for term in ['content', 'main', 'feed', 'list', 'container']))

+ # for container in main_containers:
+ # # Check if container is empty or has minimal content
+ # if len(container.find_all()) < 3:
+ # scores['empty_containers'] += 1

+ # # Check for repeated similar structures (common in dynamic lists)
+ # children = container.find_all(recursive=False)
+ # if children:
+ # first_child_class = children[0].get('class', [])
+ # similar_siblings = [c for c in children[1:]
+ # if c.get('class', []) == first_child_class]
+ # if len(similar_siblings) > 0:
+ # scores['repeated_structures'] += 1
+
+ # # 2. Check for API endpoints in scripts
+ # scripts = soup.find_all('script', {'src': True})
+ # api_patterns = ['/api/', '/graphql', '/rest/', '/v1/', '/v2/']
+ # for script in scripts:
+ # if any(pattern in script['src'] for pattern in api_patterns):
+ # scores['api_endpoints'] += 1
+
+ # # 3. Look for state management setup
+ # state_patterns = [
+ # r'window\.__INITIAL_STATE__',
+ # r'window\.__PRELOADED_STATE__',
+ # r'__REDUX_STATE__',
+ # r'__NUXT__',
+ # r'__NEXT_DATA__',
+ # r'window\.__data'
+ # ]

+ # inline_scripts = soup.find_all('script')
+ # for script in inline_scripts:
+ # if script.string:
+ # for pattern in state_patterns:
+ # if re.search(pattern, script.string):
+ # scores['state_management'] += 1
+
+ # return scores
+
+ # def _check_modern_framework_indicators(soup: BeautifulSoup) -> Dict[str, int]:
+ # """Check for indicators of modern web frameworks and dynamic loading patterns."""
+ # scores = {
+ # 'framework_roots': 0,
+ # 'hydration': 0,
+ # 'routing': 0
+ # }

+ # # 1. Framework-specific root elements
+ # framework_roots = {
+ # 'react': ['react-root', 'react-app', 'root', '__next'],
+ # 'angular': ['ng-version', 'ng-app'],
+ # 'vue': ['v-app', '#app', 'nuxt-app'],
+ # 'modern': ['app-root', 'application', 'spa-root']
+ # }

+ # for framework, identifiers in framework_roots.items():
+ # for id_value in identifiers:
+ # if (soup.find(attrs={'id': re.compile(id_value, re.I)}) or
+ # soup.find(attrs={'class': re.compile(id_value, re.I)}) or
+ # soup.find(attrs={'data-': re.compile(id_value, re.I)})):
+ # scores['framework_roots'] += 1
+
+ # # 2. Check for hydration indicators
+ # hydration_patterns = [
+ # r'hydrate',
+ # r'createRoot',
+ # r'reactive',
+ # r'observable'
+ # ]

+ # scripts = soup.find_all('script')
+ # for script in scripts:
+ # if script.string:
+ # for pattern in hydration_patterns:
+ # if re.search(pattern, script.string):
+ # scores['hydration'] += 1
+
+ # # 3. Check for dynamic routing setup
+ # router_patterns = [
+ # 'router-view',
+ # 'router-link',
+ # 'route-link',
+ # 'history.push',
+ # 'navigation'
+ # ]

+ # for pattern in router_patterns:
+ # if soup.find(class_=re.compile(pattern, re.I)) or \
+ # soup.find(id=re.compile(pattern, re.I)):
+ # scores['routing'] += 1
+
+ # return scores
+
+ # def _check_dynamic_loading_patterns(soup: BeautifulSoup) -> Dict[str, int]:
+ # """Check for various dynamic content loading patterns."""
+ # scores = {
+ # 'infinite_scroll': 0,
+ # 'load_more_buttons': 0,
+ # 'pagination': 0,
+ # 'lazy_loading': 0,
+ # 'loading_indicators': 0
+ # }

+ # # 1. Check for infinite scroll indicators
+ # scroll_indicators = [
+ # 'infinite-scroll',
+ # 'data-infinite',
+ # 'data-virtualized',
+ # 'virtual-scroll',
+ # 'scroll-container',
+ # 'scroll-viewport'
+ # ]

+ # for indicator in scroll_indicators:
+ # elements = soup.find_all(
+ # lambda tag: any(indicator.lower() in str(v).lower()
+ # for v in tag.attrs.values())
+ # )
+ # if elements:
+ # scores['infinite_scroll'] += len(elements)
+
+ # # 2. Check for load more buttons
+ # button_patterns = [
+ # r'load[_-]?more',
+ # r'show[_-]?more',
+ # r'view[_-]?more',
+ # r'see[_-]?more',
+ # r'more[_-]?posts',
+ # r'more[_-]?results'
+ # ]

+ # for pattern in button_patterns:
+ # elements = soup.find_all(
+ # ['button', 'a', 'div', 'span'],
+ # text=re.compile(pattern, re.I)
+ # )
+ # if elements:
+ # scores['load_more_buttons'] += len(elements)
+
+ # # 3. Check for pagination
+ # pagination_patterns = [
+ # 'pagination',
+ # 'page-numbers',
+ # 'page-nav',
+ # 'page-links'
+ # ]

+ # for pattern in pagination_patterns:
+ # elements = soup.find_all(class_=re.compile(pattern, re.I))
+ # if elements:
+ # scores['pagination'] += len(elements)
+
+ # # 4. Check for lazy loading
+ # lazy_patterns = ['lazy', 'data-src', 'data-lazy']
+ # for pattern in lazy_patterns:
+ # elements = soup.find_all(
+ # lambda tag: any(pattern.lower() in str(v).lower()
+ # for v in tag.attrs.values())
+ # )
+ # if elements:
+ # scores['lazy_loading'] += len(elements)
+
+ # # 5. Check for loading indicators
+ # loading_patterns = [
+ # 'loading',
+ # 'spinner',
+ # 'skeleton',
+ # 'placeholder',
+ # 'shimmer'
+ # ]

+ # for pattern in loading_patterns:
+ # elements = soup.find_all(class_=re.compile(pattern, re.I))
+ # if elements:
+ # scores['loading_indicators'] += len(elements)
+
+ # return scores
+
+ # def _evaluate_dynamic_indicators(
+ # structural: Dict[str, int],
+ # framework: Dict[str, int],
+ # loading: Dict[str, int]
+ # ) -> Tuple[bool, Optional[str]]:
+ # """Evaluate dynamic indicators and return JavaScript instructions."""
+ # methods = []
+ # js_snippets = []
+
+ # # Infinite Scroll
+ # if loading['infinite_scroll'] > 0:
+ # methods.append("scroll")
+ # js_snippets.append(
+ # """
+ # window.scrollTo(0, document.body.scrollHeight);
+ # await new Promise(resolve => setTimeout(resolve, 1000));
+ # """.strip().replace('\n', '')
+ # )
+
+ # # Load More Buttons
+ # if loading['load_more_buttons'] > 0:
+ # methods.append("button")
+ # js_snippets.append(
+ # """
+ # const button = Array.from(document.querySelectorAll('button, a, div, span')).find(
+ # el => /load[_-]?more|show[_-]?more/i.test(el.textContent)
+ # );
+ # if (button) {
+ # button.click();
+ # await new Promise(resolve => setTimeout(resolve, 1000));
+ # } else {
+ # console.warn("No 'Load More' button found.");
+ # }
+ # """.strip().replace('\n', '')
+ # )
+
+ # # Paginated Interfaces
+ # if loading.get('pagination', 0) > 0:
+ # methods.append("pagination")
+ # js_snippets.append(
+ # """
+ # const nextPage = document.querySelector('a[rel="next"], .pagination-next, .page-next');
+ # if (nextPage) {
+ # nextPage.click();
+ # await new Promise(resolve => setTimeout(resolve, 1000));
+ # } else {
+ # console.warn("No pagination link found.");
+ # }
+ # """.strip().replace('\n', '')
+ # )
+
+ # # Lazy Loading
+ # if loading.get('lazy_loading', 0) > 0:
+ # methods.append("lazy")
+ # js_snippets.append(
+ # """
+ # if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
+ # console.log('Framework state detected. Consider monitoring network requests for further actions.');
+ # }
+ # """.strip().replace('\n', '')
+ # )
+
+ # # Framework and State Management Indicators
+ # if framework['framework_roots'] > 0 or structural['state_management'] > 0:
+ # methods.append("stateful")
+ # js_snippets.append(
+ # """
+ # if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
+ # console.log('Detected stateful framework data loading.');
+ # }
+ # """.strip().replace('\n', '')
+ # )
+
+ # # API-Driven Content
+ # if structural['api_endpoints'] > 0:
+ # methods.append("api")
+ # js_snippets.append(
+ # """
+ # console.log('API requests detected. Use browser devtools to inspect network activity for specific endpoints.');
+ # """.strip().replace('\n', '')
+ # )
+
+ # # Aggregate and finalize
+ # if methods:
+ # js_code = "\n".join(js_snippets)
+ # return True, js_code

+ # return False, None

+ # # Main execution
+ # soup = BeautifulSoup(html_content, 'html.parser')

+ # # Run all checks
+ # structural_scores = _check_structural_indicators(soup)
+ # framework_scores = _check_modern_framework_indicators(soup)
+ # loading_scores = _check_dynamic_loading_patterns(soup)

+ # # Evaluate results
+ # return _evaluate_dynamic_indicators(structural_scores, framework_scores, loading_scores)
+ #
+ # async def crawl(
+ # self,
+ # url,
+ # depth=2,
+ # max_pages=5,
+ # session_id=None,
+ # human_simulation=True,
+ # rotate_user_agent=True,
+ # rotate_proxy=True,
+ # return_html=False
+ # ):
+ # if not session_id:
+ # session_id = self.create_session()
+
+ # session_id = self.rotate_session(session_id)
+
+ # # List of rotating user agents
+ # user_agents = [
+ # 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+ # 'Chrome/115.0.0.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+ # 'Chrome/115.0.0.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+ # 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+ # 'Chrome/115.0.0.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
+ # ]
+
+ # # List of rotating proxies
+ # proxies = [
+ # "http://50.62.183.123:80",
+ # "http://104.129.60.84:6516",
+ # "http://156.228.118.163:3128",
+ # "http://142.111.104.97:6107",
+ # "http://156.228.99.99:3128"
+ # ]
+
+ # try:
+ # async with self.crawler as crawler:
+ # # Rotate user agent and optimize headers for each attempt
+ # headers = {
+ # "User-Agent": random.choice(user_agents) if rotate_user_agent else user_agents[0],
+ # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ # "Accept-Language": "en-US,en;q=0.5",
+ # "Accept-Encoding": "gzip, deflate",
+ # "Connection": "keep-alive",
+ # "Upgrade-Insecure-Requests": "1",
+ # "Sec-Fetch-Dest": "document",
+ # "Sec-Fetch-Mode": "navigate",
+ # "Sec-Fetch-Site": "none",
+ # "Sec-Fetch-User": "?1",
+ # "Cache-Control": "max-age=0"
+ # }

+ # # Update crawler headers for rotation
+ # crawler.crawler_strategy.headers = headers
+
+ # if rotate_proxy:
+ # # Update crawler proxy for rotation
+ # crawler.crawler_strategy.proxy = random.choice(proxies)
+
+ # result_1 = await crawler.arun(
+ # session_id=session_id,
+ # url=url,
+ # magic=True if human_simulation else False,
+ # simulate_user=True if human_simulation else False,
+ # override_navigator=True if human_simulation else False,
+ # depth=depth,
+ # max_pages=max_pages,
+ # bypass_cache=True,
+ # remove_overlay_elements=True,
+ # delay_before_retrieve_html=1.0,
+ # verbose=self.verbose
+ # )
+
+ # # Update session metrics
+ # self.session_pool[session_id]['requests_count'] += 1
+ # self.session_pool[session_id]['last_used'] = datetime.now()
+
+ # if result_1.success:
+ # if hasattr(result_1, 'html'):
+ # success, js_code = self.is_dynamic_page(result_1.html)
+
+ # if success:
+ # async with crawler as crawler:
+ # # Update crawler headers for rotation
+ # crawler.crawler_strategy.headers = headers
+
+ # if rotate_proxy:
+ # # Update crawler proxy for rotation
+ # crawler.crawler_strategy.proxy = random.choice(proxies)
+
+ # print(f"Executing JS code: {js_code}")
+ # result_2 = await crawler.arun(
+ # session_id=session_id,
+ # url=url,
+ # magic=True if human_simulation else False,
+ # simulate_user=True if human_simulation else False,
+ # override_navigator=True if human_simulation else False,
+ # depth=depth,
+ # max_pages=max_pages,
+ # js_code=js_code,
+ # bypass_cache=True,
+ # remove_overlay_elements=True,
+ # delay_before_retrieve_html=1.0,
+ # verbose=self.verbose
+ # )
+
+ # if result_2.success:
+ # result = result_2
+ # else:
+ # result = result_1
+
+ # # Update session metrics
+ # self.session_pool[session_id]['requests_count'] += 1
+ # self.session_pool[session_id]['last_used'] = datetime.now()
+
+ # else:
+ # result = result_1

+ # if return_html and hasattr(result, 'html'):
+ # return result.html
+ # elif hasattr(result, 'fit_markdown'):
+ # return result.fit_markdown
+ # elif hasattr(result, 'markdown'):
+ # return self.extract_content(result.markdown)

+ # except Exception as e:
+ # print(f"Error crawling {url}: {str(e)}")

+ # return None

+ # async def crawl_with_retry(
+ # self,
+ # url,
+ # depth=2,
+ # max_pages=5,
+ # max_retries=3,
+ # backoff_factor=1,
+ # session_id=None,
+ # human_simulation=True,
+ # rotate_user_agent=True,
+ # rotate_proxy=True,
+ # return_html=False,
+ # timeout=10.0
+ # ):
+ # """Crawl with retry logic and anti-blocking measures"""
+
+ # async def attempt_crawl(attempt):
+ # try:
+ # async with async_timeout.timeout(timeout):
+ # context = await self.get_browser_context(session_id)
+ # return await self.crawl(
+ # context,
+ # url,
+ # depth,
+ # max_pages,
+ # session_id,
+ # human_simulation,
+ # rotate_user_agent,
+ # rotate_proxy,
+ # return_html
+ # )
+ # except asyncio.TimeoutError:
+ # print(f"Timeout on attempt {attempt} for {url}")
+ # raise
+ # except Exception as e:
+ # print(f"Error on attempt {attempt} for {url}: {e}")
+ # raise
+
+ # if not self.is_valid_url(url) and not self.is_html_url(url):
+ # print(f"Invalid URL: {url}")
+ # return f"No web results found for query: {url}"
+
+ # for attempt in range(max_retries):
+ # try:
+ # if attempt > 0:
+ # # Add delay between retries with exponential backoff
+ # delay = backoff_factor * (2 ** (attempt - 1))
+ # await asyncio.sleep(delay)

+ # return await attempt_crawl(attempt + 1)
+ # except Exception as e:
+ # if attempt == max_retries - 1:
+ # print(f"Max retries ({max_retries}) reached for {url}")
+ # return f"Failed to crawl after {max_retries} attempts: {url}"
+ # continue
+
+ # return f"No content found after {max_retries} attempts for: {url}"
+
+ # def extract_content(self, html_content):
+ # soup = BeautifulSoup(html_content, 'html.parser')
+ # for script in soup(["script", "style"]):
+ # script.decompose()
+ # text = soup.get_text()
+ # lines = (line.strip() for line in text.splitlines())
+ # chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+ # text = '\n'.join(chunk for chunk in chunks if chunk)
+ # return text

+ # def cleanup_session(self, session_id):
+ # """Clean up a session"""
+ # print(f"Cleaning up session {session_id}")
+ # if session_id in self.session_pool:
+ # self.crawler.crawler_strategy.kill_session(session_id)
+ # del self.session_pool[session_id]
+
+ # def cleanup_expired_sessions(self):
+ # """Regular cleanup of expired sessions using proper time calculation"""
+ # try:
+ # current_time = datetime.now()
+ # expired_sessions = []

+ # for sid, data in self.session_pool.items():
+ # # Calculate time difference in seconds
+ # time_diff = (current_time - data['last_used']).total_seconds()

+ # # Check if more than 1 hour (3600 seconds)
+ # if time_diff > 3600:
+ # expired_sessions.append(sid)

+ # # Cleanup expired sessions
+ # for session_id in expired_sessions:
+ # self.cleanup_session(session_id)

+ # except Exception as e:
+ # if self.verbose:
+ # print(f"Error during session cleanup: {str(e)}")

+ # @staticmethod
+ # def is_valid_url(url):
+ # try:
+ # result = urlparse(url)
+ # return all([result.scheme, result.netloc])
+ # except ValueError:
+ # return False

+ # @staticmethod
+ # def is_html_url(url):
+ # return url.endswith(".html") or url.endswith(".htm")

  class CustomCrawler:
  def __init__(