Hemang Thakur committed
Commit 83e870c · 1 Parent(s): b0bc4d2

commented out crawl4ai in neo4j file

Files changed (2)
  1. src/crawl/crawler.py +566 -566
  2. src/rag/neo4j_graphrag.py +1 -1
src/crawl/crawler.py CHANGED
@@ -1,17 +1,17 @@
1
- # from crawl4ai import AsyncWebCrawler
2
- # from urllib.parse import urlparse
3
  import aiohttp
4
  import asyncio
5
- # from asyncio.exceptions import TimeoutError as async_timeout
6
  from fast_async import make_async
7
  from bs4 import BeautifulSoup, NavigableString
8
- # import secrets
9
- # from datetime import datetime
10
- # import random
11
  import os
12
  import re
13
  import uuid
14
- from typing import List, Dict, Optional #, Tuple
15
  from io import BytesIO
16
  import PyPDF2
17
  from fake_useragent import FakeUserAgent
@@ -20,597 +20,597 @@ from transformers import AutoTokenizer, AutoConfig
20
  import torch
21
  import time
22
 
23
- # class Crawler:
24
- # def __init__(self, user_dir=None, rate_limit=1, headless=True, verbose=False):
25
- # self.session_pool = {} # Track active sessions
26
- # self.verbose = verbose
27
- # self.rate_limit = rate_limit
28
- # self.user_dir = user_dir
29
- # self.headless = headless
30
- # self.crawler = AsyncWebCrawler(
31
- # context_options={"userDataDir": self.user_dir},
32
- # headless=self.headless,
33
- # verbose=self.verbose
34
- # )
35
-
36
- # # Browser context management
37
- # self._browser_contexts = {}
38
- # self._context_locks = {}
39
-
40
- # async def get_browser_context(self, session_id):
41
- # """Get or create a browser context with proper locking"""
42
- # if session_id not in self._context_locks:
43
- # self._context_locks[session_id] = asyncio.Lock()
44
 
45
- # async with self._context_locks[session_id]:
46
- # if session_id not in self._browser_contexts:
47
- # context = await self.crawler.new_context()
48
- # self._browser_contexts[session_id] = context
49
- # return self._browser_contexts[session_id]
50
 
51
- # async def cleanup_browser_context(self, session_id):
52
- # """Safely cleanup browser context"""
53
- # if session_id in self._context_locks:
54
- # async with self._context_locks[session_id]:
55
- # if session_id in self._browser_contexts:
56
- # try:
57
- # await asyncio.shield(
58
- # self._browser_contexts[session_id].close()
59
- # )
60
- # except Exception as e:
61
- # print(f"Error cleaning up browser context: {e}")
62
- # finally:
63
- # del self._browser_contexts[session_id]
64
-
65
- # def create_session(self):
66
- # """Create a new session with secure ID"""
67
- # session_id = secrets.token_urlsafe(32) # Secure session ID
68
- # self.session_pool[session_id] = {
69
- # 'created_at': datetime.now(),
70
- # 'last_used': datetime.now(),
71
- # 'requests_count': 0
72
- # }
73
- # return session_id
74
-
75
- # def rotate_session(self, session_id):
76
- # """Implement session rotation logic"""
77
- # if self.session_pool[session_id]['requests_count'] > 100:
78
- # self.cleanup_session(session_id)
79
- # return self.create_session()
80
- # return session_id
81
-
82
- # def is_dynamic_page(self, html_content: str) -> Tuple[bool, Optional[str]]:
83
- # """Analyzes HTML content to determine if a webpage is dynamically loaded"""
84
- # def _check_structural_indicators(soup: BeautifulSoup) -> Dict[str, int]:
85
- # """Check structural indicators of dynamic content loading."""
86
- # scores = {
87
- # 'empty_containers': 0,
88
- # 'repeated_structures': 0,
89
- # 'api_endpoints': 0,
90
- # 'state_management': 0
91
- # }
92
 
93
- # # 1. Check for empty content containers
94
- # main_containers = soup.find_all(['main', 'div', 'section'],
95
- # class_=lambda x: x and any(term in str(x).lower()
96
- # for term in ['content', 'main', 'feed', 'list', 'container']))
97
 
98
- # for container in main_containers:
99
- # # Check if container is empty or has minimal content
100
- # if len(container.find_all()) < 3:
101
- # scores['empty_containers'] += 1
102
 
103
- # # Check for repeated similar structures (common in dynamic lists)
104
- # children = container.find_all(recursive=False)
105
- # if children:
106
- # first_child_class = children[0].get('class', [])
107
- # similar_siblings = [c for c in children[1:]
108
- # if c.get('class', []) == first_child_class]
109
- # if len(similar_siblings) > 0:
110
- # scores['repeated_structures'] += 1
111
-
112
- # # 2. Check for API endpoints in scripts
113
- # scripts = soup.find_all('script', {'src': True})
114
- # api_patterns = ['/api/', '/graphql', '/rest/', '/v1/', '/v2/']
115
- # for script in scripts:
116
- # if any(pattern in script['src'] for pattern in api_patterns):
117
- # scores['api_endpoints'] += 1
118
-
119
- # # 3. Look for state management setup
120
- # state_patterns = [
121
- # r'window\.__INITIAL_STATE__',
122
- # r'window\.__PRELOADED_STATE__',
123
- # r'__REDUX_STATE__',
124
- # r'__NUXT__',
125
- # r'__NEXT_DATA__',
126
- # r'window\.__data'
127
- # ]
128
 
129
- # inline_scripts = soup.find_all('script')
130
- # for script in inline_scripts:
131
- # if script.string:
132
- # for pattern in state_patterns:
133
- # if re.search(pattern, script.string):
134
- # scores['state_management'] += 1
135
-
136
- # return scores
137
-
138
- # def _check_modern_framework_indicators(soup: BeautifulSoup) -> Dict[str, int]:
139
- # """Check for indicators of modern web frameworks and dynamic loading patterns."""
140
- # scores = {
141
- # 'framework_roots': 0,
142
- # 'hydration': 0,
143
- # 'routing': 0
144
- # }
145
 
146
- # # 1. Framework-specific root elements
147
- # framework_roots = {
148
- # 'react': ['react-root', 'react-app', 'root', '__next'],
149
- # 'angular': ['ng-version', 'ng-app'],
150
- # 'vue': ['v-app', '#app', 'nuxt-app'],
151
- # 'modern': ['app-root', 'application', 'spa-root']
152
- # }
153
 
154
- # for framework, identifiers in framework_roots.items():
155
- # for id_value in identifiers:
156
- # if (soup.find(attrs={'id': re.compile(id_value, re.I)}) or
157
- # soup.find(attrs={'class': re.compile(id_value, re.I)}) or
158
- # soup.find(attrs={'data-': re.compile(id_value, re.I)})):
159
- # scores['framework_roots'] += 1
160
-
161
- # # 2. Check for hydration indicators
162
- # hydration_patterns = [
163
- # r'hydrate',
164
- # r'createRoot',
165
- # r'reactive',
166
- # r'observable'
167
- # ]
168
 
169
- # scripts = soup.find_all('script')
170
- # for script in scripts:
171
- # if script.string:
172
- # for pattern in hydration_patterns:
173
- # if re.search(pattern, script.string):
174
- # scores['hydration'] += 1
175
-
176
- # # 3. Check for dynamic routing setup
177
- # router_patterns = [
178
- # 'router-view',
179
- # 'router-link',
180
- # 'route-link',
181
- # 'history.push',
182
- # 'navigation'
183
- # ]
184
 
185
- # for pattern in router_patterns:
186
- # if soup.find(class_=re.compile(pattern, re.I)) or \
187
- # soup.find(id=re.compile(pattern, re.I)):
188
- # scores['routing'] += 1
189
-
190
- # return scores
191
-
192
- # def _check_dynamic_loading_patterns(soup: BeautifulSoup) -> Dict[str, int]:
193
- # """Check for various dynamic content loading patterns."""
194
- # scores = {
195
- # 'infinite_scroll': 0,
196
- # 'load_more_buttons': 0,
197
- # 'pagination': 0,
198
- # 'lazy_loading': 0,
199
- # 'loading_indicators': 0
200
- # }
201
 
202
- # # 1. Check for infinite scroll indicators
203
- # scroll_indicators = [
204
- # 'infinite-scroll',
205
- # 'data-infinite',
206
- # 'data-virtualized',
207
- # 'virtual-scroll',
208
- # 'scroll-container',
209
- # 'scroll-viewport'
210
- # ]
211
 
212
- # for indicator in scroll_indicators:
213
- # elements = soup.find_all(
214
- # lambda tag: any(indicator.lower() in str(v).lower()
215
- # for v in tag.attrs.values())
216
- # )
217
- # if elements:
218
- # scores['infinite_scroll'] += len(elements)
219
-
220
- # # 2. Check for load more buttons
221
- # button_patterns = [
222
- # r'load[_-]?more',
223
- # r'show[_-]?more',
224
- # r'view[_-]?more',
225
- # r'see[_-]?more',
226
- # r'more[_-]?posts',
227
- # r'more[_-]?results'
228
- # ]
229
 
230
- # for pattern in button_patterns:
231
- # elements = soup.find_all(
232
- # ['button', 'a', 'div', 'span'],
233
- # text=re.compile(pattern, re.I)
234
- # )
235
- # if elements:
236
- # scores['load_more_buttons'] += len(elements)
237
-
238
- # # 3. Check for pagination
239
- # pagination_patterns = [
240
- # 'pagination',
241
- # 'page-numbers',
242
- # 'page-nav',
243
- # 'page-links'
244
- # ]
245
 
246
- # for pattern in pagination_patterns:
247
- # elements = soup.find_all(class_=re.compile(pattern, re.I))
248
- # if elements:
249
- # scores['pagination'] += len(elements)
250
-
251
- # # 4. Check for lazy loading
252
- # lazy_patterns = ['lazy', 'data-src', 'data-lazy']
253
- # for pattern in lazy_patterns:
254
- # elements = soup.find_all(
255
- # lambda tag: any(pattern.lower() in str(v).lower()
256
- # for v in tag.attrs.values())
257
- # )
258
- # if elements:
259
- # scores['lazy_loading'] += len(elements)
260
-
261
- # # 5. Check for loading indicators
262
- # loading_patterns = [
263
- # 'loading',
264
- # 'spinner',
265
- # 'skeleton',
266
- # 'placeholder',
267
- # 'shimmer'
268
- # ]
269
 
270
- # for pattern in loading_patterns:
271
- # elements = soup.find_all(class_=re.compile(pattern, re.I))
272
- # if elements:
273
- # scores['loading_indicators'] += len(elements)
274
-
275
- # return scores
276
-
277
- # def _evaluate_dynamic_indicators(
278
- # structural: Dict[str, int],
279
- # framework: Dict[str, int],
280
- # loading: Dict[str, int]
281
- # ) -> Tuple[bool, Optional[str]]:
282
- # """Evaluate dynamic indicators and return JavaScript instructions."""
283
- # methods = []
284
- # js_snippets = []
285
-
286
- # # Infinite Scroll
287
- # if loading['infinite_scroll'] > 0:
288
- # methods.append("scroll")
289
- # js_snippets.append(
290
- # """
291
- # window.scrollTo(0, document.body.scrollHeight);
292
- # await new Promise(resolve => setTimeout(resolve, 1000));
293
- # """.strip().replace('\n', '')
294
- # )
295
-
296
- # # Load More Buttons
297
- # if loading['load_more_buttons'] > 0:
298
- # methods.append("button")
299
- # js_snippets.append(
300
- # """
301
- # const button = Array.from(document.querySelectorAll('button, a, div, span')).find(
302
- # el => /load[_-]?more|show[_-]?more/i.test(el.textContent)
303
- # );
304
- # if (button) {
305
- # button.click();
306
- # await new Promise(resolve => setTimeout(resolve, 1000));
307
- # } else {
308
- # console.warn("No 'Load More' button found.");
309
- # }
310
- # """.strip().replace('\n', '')
311
- # )
312
-
313
- # # Paginated Interfaces
314
- # if loading.get('pagination', 0) > 0:
315
- # methods.append("pagination")
316
- # js_snippets.append(
317
- # """
318
- # const nextPage = document.querySelector('a[rel="next"], .pagination-next, .page-next');
319
- # if (nextPage) {
320
- # nextPage.click();
321
- # await new Promise(resolve => setTimeout(resolve, 1000));
322
- # } else {
323
- # console.warn("No pagination link found.");
324
- # }
325
- # """.strip().replace('\n', '')
326
- # )
327
-
328
- # # Lazy Loading
329
- # if loading.get('lazy_loading', 0) > 0:
330
- # methods.append("lazy")
331
- # js_snippets.append(
332
- # """
333
- # if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
334
- # console.log('Framework state detected. Consider monitoring network requests for further actions.');
335
- # }
336
- # """.strip().replace('\n', '')
337
- # )
338
-
339
- # # Framework and State Management Indicators
340
- # if framework['framework_roots'] > 0 or structural['state_management'] > 0:
341
- # methods.append("stateful")
342
- # js_snippets.append(
343
- # """
344
- # if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
345
- # console.log('Detected stateful framework data loading.');
346
- # }
347
- # """.strip().replace('\n', '')
348
- # )
349
-
350
- # # API-Driven Content
351
- # if structural['api_endpoints'] > 0:
352
- # methods.append("api")
353
- # js_snippets.append(
354
- # """
355
- # console.log('API requests detected. Use browser devtools to inspect network activity for specific endpoints.');
356
- # """.strip().replace('\n', '')
357
- # )
358
-
359
- # # Aggregate and finalize
360
- # if methods:
361
- # js_code = "\n".join(js_snippets)
362
- # return True, js_code
363
 
364
- # return False, None
365
 
366
- # # Main execution
367
- # soup = BeautifulSoup(html_content, 'html.parser')
368
 
369
- # # Run all checks
370
- # structural_scores = _check_structural_indicators(soup)
371
- # framework_scores = _check_modern_framework_indicators(soup)
372
- # loading_scores = _check_dynamic_loading_patterns(soup)
373
 
374
- # # Evaluate results
375
- # return _evaluate_dynamic_indicators(structural_scores, framework_scores, loading_scores)
376
-
377
- # async def crawl(
378
- # self,
379
- # url,
380
- # depth=2,
381
- # max_pages=5,
382
- # session_id=None,
383
- # human_simulation=True,
384
- # rotate_user_agent=True,
385
- # rotate_proxy=True,
386
- # return_html=False
387
- # ):
388
- # if not session_id:
389
- # session_id = self.create_session()
390
-
391
- # session_id = self.rotate_session(session_id)
392
-
393
- # # List of rotating user agents
394
- # user_agents = [
395
- # 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
396
- # 'Chrome/115.0.0.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
397
- # 'Chrome/115.0.0.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
398
- # 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
399
- # 'Chrome/115.0.0.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
400
- # ]
401
-
402
- # # List of rotating proxies
403
- # proxies = [
404
- # "http://50.62.183.123:80",
405
- # "http://104.129.60.84:6516",
406
- # "http://156.228.118.163:3128",
407
- # "http://142.111.104.97:6107",
408
- # "http://156.228.99.99:3128"
409
- # ]
410
-
411
- # try:
412
- # async with self.crawler as crawler:
413
- # # Rotate user agent and optimize headers for each attempt
414
- # headers = {
415
- # "User-Agent": random.choice(user_agents) if rotate_user_agent else user_agents[0],
416
- # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
417
- # "Accept-Language": "en-US,en;q=0.5",
418
- # "Accept-Encoding": "gzip, deflate",
419
- # "Connection": "keep-alive",
420
- # "Upgrade-Insecure-Requests": "1",
421
- # "Sec-Fetch-Dest": "document",
422
- # "Sec-Fetch-Mode": "navigate",
423
- # "Sec-Fetch-Site": "none",
424
- # "Sec-Fetch-User": "?1",
425
- # "Cache-Control": "max-age=0"
426
- # }
427
 
428
- # # Update crawler headers for rotation
429
- # crawler.crawler_strategy.headers = headers
430
-
431
- # if rotate_proxy:
432
- # # Update crawler proxy for rotation
433
- # crawler.crawler_strategy.proxy = random.choice(proxies)
434
-
435
- # result_1 = await crawler.arun(
436
- # session_id=session_id,
437
- # url=url,
438
- # magic=True if human_simulation else False,
439
- # simulate_user=True if human_simulation else False,
440
- # override_navigator=True if human_simulation else False,
441
- # depth=depth,
442
- # max_pages=max_pages,
443
- # bypass_cache=True,
444
- # remove_overlay_elements=True,
445
- # delay_before_retrieve_html=1.0,
446
- # verbose=self.verbose
447
- # )
448
-
449
- # # Update session metrics
450
- # self.session_pool[session_id]['requests_count'] += 1
451
- # self.session_pool[session_id]['last_used'] = datetime.now()
452
-
453
- # if result_1.success:
454
- # if hasattr(result_1, 'html'):
455
- # success, js_code = self.is_dynamic_page(result_1.html)
456
-
457
- # if success:
458
- # async with crawler as crawler:
459
- # # Update crawler headers for rotation
460
- # crawler.crawler_strategy.headers = headers
461
-
462
- # if rotate_proxy:
463
- # # Update crawler proxy for rotation
464
- # crawler.crawler_strategy.proxy = random.choice(proxies)
465
-
466
- # print(f"Executing JS code: {js_code}")
467
- # result_2 = await crawler.arun(
468
- # session_id=session_id,
469
- # url=url,
470
- # magic=True if human_simulation else False,
471
- # simulate_user=True if human_simulation else False,
472
- # override_navigator=True if human_simulation else False,
473
- # depth=depth,
474
- # max_pages=max_pages,
475
- # js_code=js_code,
476
- # bypass_cache=True,
477
- # remove_overlay_elements=True,
478
- # delay_before_retrieve_html=1.0,
479
- # verbose=self.verbose
480
- # )
481
-
482
- # if result_2.success:
483
- # result = result_2
484
- # else:
485
- # result = result_1
486
-
487
- # # Update session metrics
488
- # self.session_pool[session_id]['requests_count'] += 1
489
- # self.session_pool[session_id]['last_used'] = datetime.now()
490
-
491
- # else:
492
- # result = result_1
493
 
494
- # if return_html and hasattr(result, 'html'):
495
- # return result.html
496
- # elif hasattr(result, 'fit_markdown'):
497
- # return result.fit_markdown
498
- # elif hasattr(result, 'markdown'):
499
- # return self.extract_content(result.markdown)
500
 
501
- # except Exception as e:
502
- # print(f"Error crawling {url}: {str(e)}")
503
 
504
- # return None
505
 
506
- # async def crawl_with_retry(
507
- # self,
508
- # url,
509
- # depth=2,
510
- # max_pages=5,
511
- # max_retries=3,
512
- # backoff_factor=1,
513
- # session_id=None,
514
- # human_simulation=True,
515
- # rotate_user_agent=True,
516
- # rotate_proxy=True,
517
- # return_html=False,
518
- # timeout=10.0
519
- # ):
520
- # """Crawl with retry logic and anti-blocking measures"""
521
-
522
- # async def attempt_crawl(attempt):
523
- # try:
524
- # async with async_timeout.timeout(timeout):
525
- # context = await self.get_browser_context(session_id)
526
- # return await self.crawl(
527
- # context,
528
- # url,
529
- # depth,
530
- # max_pages,
531
- # session_id,
532
- # human_simulation,
533
- # rotate_user_agent,
534
- # rotate_proxy,
535
- # return_html
536
- # )
537
- # except asyncio.TimeoutError:
538
- # print(f"Timeout on attempt {attempt} for {url}")
539
- # raise
540
- # except Exception as e:
541
- # print(f"Error on attempt {attempt} for {url}: {e}")
542
- # raise
543
-
544
- # if not self.is_valid_url(url) and not self.is_html_url(url):
545
- # print(f"Invalid URL: {url}")
546
- # return f"No web results found for query: {url}"
547
-
548
- # for attempt in range(max_retries):
549
- # try:
550
- # if attempt > 0:
551
- # # Add delay between retries with exponential backoff
552
- # delay = backoff_factor * (2 ** (attempt - 1))
553
- # await asyncio.sleep(delay)
554
 
555
- # return await attempt_crawl(attempt + 1)
556
- # except Exception as e:
557
- # if attempt == max_retries - 1:
558
- # print(f"Max retries ({max_retries}) reached for {url}")
559
- # return f"Failed to crawl after {max_retries} attempts: {url}"
560
- # continue
561
-
562
- # return f"No content found after {max_retries} attempts for: {url}"
563
-
564
- # def extract_content(self, html_content):
565
- # soup = BeautifulSoup(html_content, 'html.parser')
566
- # for script in soup(["script", "style"]):
567
- # script.decompose()
568
- # text = soup.get_text()
569
- # lines = (line.strip() for line in text.splitlines())
570
- # chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
571
- # text = '\n'.join(chunk for chunk in chunks if chunk)
572
- # return text
573
 
574
- # def cleanup_session(self, session_id):
575
- # """Clean up a session"""
576
- # print(f"Cleaning up session {session_id}")
577
- # if session_id in self.session_pool:
578
- # self.crawler.crawler_strategy.kill_session(session_id)
579
- # del self.session_pool[session_id]
580
-
581
- # def cleanup_expired_sessions(self):
582
- # """Regular cleanup of expired sessions using proper time calculation"""
583
- # try:
584
- # current_time = datetime.now()
585
- # expired_sessions = []
586
 
587
- # for sid, data in self.session_pool.items():
588
- # # Calculate time difference in seconds
589
- # time_diff = (current_time - data['last_used']).total_seconds()
590
 
591
- # # Check if more than 1 hour (3600 seconds)
592
- # if time_diff > 3600:
593
- # expired_sessions.append(sid)
594
 
595
- # # Cleanup expired sessions
596
- # for session_id in expired_sessions:
597
- # self.cleanup_session(session_id)
598
 
599
- # except Exception as e:
600
- # if self.verbose:
601
- # print(f"Error during session cleanup: {str(e)}")
602
 
603
- # @staticmethod
604
- # def is_valid_url(url):
605
- # try:
606
- # result = urlparse(url)
607
- # return all([result.scheme, result.netloc])
608
- # except ValueError:
609
- # return False
610
 
611
- # @staticmethod
612
- # def is_html_url(url):
613
- # return url.endswith(".html") or url.endswith(".htm")
614
 
615
  class CustomCrawler:
616
  def __init__(
 
1
+ from crawl4ai import AsyncWebCrawler
2
+ from urllib.parse import urlparse
3
  import aiohttp
4
  import asyncio
5
+ from asyncio.exceptions import TimeoutError as async_timeout
6
  from fast_async import make_async
7
  from bs4 import BeautifulSoup, NavigableString
8
+ import secrets
9
+ from datetime import datetime
10
+ import random
11
  import os
12
  import re
13
  import uuid
14
+ from typing import List, Dict, Tuple, Optional
15
  from io import BytesIO
16
  import PyPDF2
17
  from fake_useragent import FakeUserAgent
 
20
  import torch
21
  import time
22
 
23
+ class Crawler:
24
+ def __init__(self, user_dir=None, rate_limit=1, headless=True, verbose=False):
25
+ self.session_pool = {} # Track active sessions
26
+ self.verbose = verbose
27
+ self.rate_limit = rate_limit
28
+ self.user_dir = user_dir
29
+ self.headless = headless
30
+ self.crawler = AsyncWebCrawler(
31
+ context_options={"userDataDir": self.user_dir},
32
+ headless=self.headless,
33
+ verbose=self.verbose
34
+ )
35
+
36
+ # Browser context management
37
+ self._browser_contexts = {}
38
+ self._context_locks = {}
39
+
40
+ async def get_browser_context(self, session_id):
41
+ """Get or create a browser context with proper locking"""
42
+ if session_id not in self._context_locks:
43
+ self._context_locks[session_id] = asyncio.Lock()
44
 
45
+ async with self._context_locks[session_id]:
46
+ if session_id not in self._browser_contexts:
47
+ context = await self.crawler.new_context()
48
+ self._browser_contexts[session_id] = context
49
+ return self._browser_contexts[session_id]
50
 
51
+ async def cleanup_browser_context(self, session_id):
52
+ """Safely cleanup browser context"""
53
+ if session_id in self._context_locks:
54
+ async with self._context_locks[session_id]:
55
+ if session_id in self._browser_contexts:
56
+ try:
57
+ await asyncio.shield(
58
+ self._browser_contexts[session_id].close()
59
+ )
60
+ except Exception as e:
61
+ print(f"Error cleaning up browser context: {e}")
62
+ finally:
63
+ del self._browser_contexts[session_id]
64
+
65
+ def create_session(self):
66
+ """Create a new session with secure ID"""
67
+ session_id = secrets.token_urlsafe(32) # Secure session ID
68
+ self.session_pool[session_id] = {
69
+ 'created_at': datetime.now(),
70
+ 'last_used': datetime.now(),
71
+ 'requests_count': 0
72
+ }
73
+ return session_id
74
+
75
+ def rotate_session(self, session_id):
76
+ """Implement session rotation logic"""
77
+ if self.session_pool[session_id]['requests_count'] > 100:
78
+ self.cleanup_session(session_id)
79
+ return self.create_session()
80
+ return session_id
81
+
82
+ def is_dynamic_page(self, html_content: str) -> Tuple[bool, Optional[str]]:
83
+ """Analyzes HTML content to determine if a webpage is dynamically loaded"""
84
+ def _check_structural_indicators(soup: BeautifulSoup) -> Dict[str, int]:
85
+ """Check structural indicators of dynamic content loading."""
86
+ scores = {
87
+ 'empty_containers': 0,
88
+ 'repeated_structures': 0,
89
+ 'api_endpoints': 0,
90
+ 'state_management': 0
91
+ }
92
 
93
+ # 1. Check for empty content containers
94
+ main_containers = soup.find_all(['main', 'div', 'section'],
95
+ class_=lambda x: x and any(term in str(x).lower()
96
+ for term in ['content', 'main', 'feed', 'list', 'container']))
97
 
98
+ for container in main_containers:
99
+ # Check if container is empty or has minimal content
100
+ if len(container.find_all()) < 3:
101
+ scores['empty_containers'] += 1
102
 
103
+ # Check for repeated similar structures (common in dynamic lists)
104
+ children = container.find_all(recursive=False)
105
+ if children:
106
+ first_child_class = children[0].get('class', [])
107
+ similar_siblings = [c for c in children[1:]
108
+ if c.get('class', []) == first_child_class]
109
+ if len(similar_siblings) > 0:
110
+ scores['repeated_structures'] += 1
111
+
112
+ # 2. Check for API endpoints in scripts
113
+ scripts = soup.find_all('script', {'src': True})
114
+ api_patterns = ['/api/', '/graphql', '/rest/', '/v1/', '/v2/']
115
+ for script in scripts:
116
+ if any(pattern in script['src'] for pattern in api_patterns):
117
+ scores['api_endpoints'] += 1
118
+
119
+ # 3. Look for state management setup
120
+ state_patterns = [
121
+ r'window\.__INITIAL_STATE__',
122
+ r'window\.__PRELOADED_STATE__',
123
+ r'__REDUX_STATE__',
124
+ r'__NUXT__',
125
+ r'__NEXT_DATA__',
126
+ r'window\.__data'
127
+ ]
128
 
129
+ inline_scripts = soup.find_all('script')
130
+ for script in inline_scripts:
131
+ if script.string:
132
+ for pattern in state_patterns:
133
+ if re.search(pattern, script.string):
134
+ scores['state_management'] += 1
135
+
136
+ return scores
137
+
138
+ def _check_modern_framework_indicators(soup: BeautifulSoup) -> Dict[str, int]:
139
+ """Check for indicators of modern web frameworks and dynamic loading patterns."""
140
+ scores = {
141
+ 'framework_roots': 0,
142
+ 'hydration': 0,
143
+ 'routing': 0
144
+ }
145
 
146
+ # 1. Framework-specific root elements
147
+ framework_roots = {
148
+ 'react': ['react-root', 'react-app', 'root', '__next'],
149
+ 'angular': ['ng-version', 'ng-app'],
150
+ 'vue': ['v-app', '#app', 'nuxt-app'],
151
+ 'modern': ['app-root', 'application', 'spa-root']
152
+ }
153
 
154
+ for framework, identifiers in framework_roots.items():
155
+ for id_value in identifiers:
156
+ if (soup.find(attrs={'id': re.compile(id_value, re.I)}) or
157
+ soup.find(attrs={'class': re.compile(id_value, re.I)}) or
158
+ soup.find(attrs={'data-': re.compile(id_value, re.I)})):
159
+ scores['framework_roots'] += 1
160
+
161
+ # 2. Check for hydration indicators
162
+ hydration_patterns = [
163
+ r'hydrate',
164
+ r'createRoot',
165
+ r'reactive',
166
+ r'observable'
167
+ ]
168
 
169
+ scripts = soup.find_all('script')
170
+ for script in scripts:
171
+ if script.string:
172
+ for pattern in hydration_patterns:
173
+ if re.search(pattern, script.string):
174
+ scores['hydration'] += 1
175
+
176
+ # 3. Check for dynamic routing setup
177
+ router_patterns = [
178
+ 'router-view',
179
+ 'router-link',
180
+ 'route-link',
181
+ 'history.push',
182
+ 'navigation'
183
+ ]
184
 
185
+ for pattern in router_patterns:
186
+ if soup.find(class_=re.compile(pattern, re.I)) or \
187
+ soup.find(id=re.compile(pattern, re.I)):
188
+ scores['routing'] += 1
189
+
190
+ return scores
191
+
192
+ def _check_dynamic_loading_patterns(soup: BeautifulSoup) -> Dict[str, int]:
193
+ """Check for various dynamic content loading patterns."""
194
+ scores = {
195
+ 'infinite_scroll': 0,
196
+ 'load_more_buttons': 0,
197
+ 'pagination': 0,
198
+ 'lazy_loading': 0,
199
+ 'loading_indicators': 0
200
+ }
201
 
202
+ # 1. Check for infinite scroll indicators
203
+ scroll_indicators = [
204
+ 'infinite-scroll',
205
+ 'data-infinite',
206
+ 'data-virtualized',
207
+ 'virtual-scroll',
208
+ 'scroll-container',
209
+ 'scroll-viewport'
210
+ ]
211
 
212
+ for indicator in scroll_indicators:
213
+ elements = soup.find_all(
214
+ lambda tag: any(indicator.lower() in str(v).lower()
215
+ for v in tag.attrs.values())
216
+ )
217
+ if elements:
218
+ scores['infinite_scroll'] += len(elements)
219
+
220
+ # 2. Check for load more buttons
221
+ button_patterns = [
222
+ r'load[_-]?more',
223
+ r'show[_-]?more',
224
+ r'view[_-]?more',
225
+ r'see[_-]?more',
226
+ r'more[_-]?posts',
227
+ r'more[_-]?results'
228
+ ]
229
 
230
+ for pattern in button_patterns:
231
+ elements = soup.find_all(
232
+ ['button', 'a', 'div', 'span'],
233
+ text=re.compile(pattern, re.I)
234
+ )
235
+ if elements:
236
+ scores['load_more_buttons'] += len(elements)
237
+
238
+ # 3. Check for pagination
239
+ pagination_patterns = [
240
+ 'pagination',
241
+ 'page-numbers',
242
+ 'page-nav',
243
+ 'page-links'
244
+ ]
245
 
246
+ for pattern in pagination_patterns:
247
+ elements = soup.find_all(class_=re.compile(pattern, re.I))
248
+ if elements:
249
+ scores['pagination'] += len(elements)
250
+
251
+ # 4. Check for lazy loading
252
+ lazy_patterns = ['lazy', 'data-src', 'data-lazy']
253
+ for pattern in lazy_patterns:
254
+ elements = soup.find_all(
255
+ lambda tag: any(pattern.lower() in str(v).lower()
256
+ for v in tag.attrs.values())
257
+ )
258
+ if elements:
259
+ scores['lazy_loading'] += len(elements)
260
+
261
+ # 5. Check for loading indicators
262
+ loading_patterns = [
263
+ 'loading',
264
+ 'spinner',
265
+ 'skeleton',
266
+ 'placeholder',
267
+ 'shimmer'
268
+ ]
269
 
270
+ for pattern in loading_patterns:
271
+ elements = soup.find_all(class_=re.compile(pattern, re.I))
272
+ if elements:
273
+ scores['loading_indicators'] += len(elements)
274
+
275
+ return scores
276
+
277
+ def _evaluate_dynamic_indicators(
278
+ structural: Dict[str, int],
279
+ framework: Dict[str, int],
280
+ loading: Dict[str, int]
281
+ ) -> Tuple[bool, Optional[str]]:
282
+ """Evaluate dynamic indicators and return JavaScript instructions."""
283
+ methods = []
284
+ js_snippets = []
285
+
286
+ # Infinite Scroll
287
+ if loading['infinite_scroll'] > 0:
288
+ methods.append("scroll")
289
+ js_snippets.append(
290
+ """
291
+ window.scrollTo(0, document.body.scrollHeight);
292
+ await new Promise(resolve => setTimeout(resolve, 1000));
293
+ """.strip().replace('\n', '')
294
+ )
295
+
296
+ # Load More Buttons
297
+ if loading['load_more_buttons'] > 0:
298
+ methods.append("button")
299
+ js_snippets.append(
300
+ """
301
+ const button = Array.from(document.querySelectorAll('button, a, div, span')).find(
302
+ el => /load[_-]?more|show[_-]?more/i.test(el.textContent)
303
+ );
304
+ if (button) {
305
+ button.click();
306
+ await new Promise(resolve => setTimeout(resolve, 1000));
307
+ } else {
308
+ console.warn("No 'Load More' button found.");
309
+ }
310
+ """.strip().replace('\n', '')
311
+ )
312
+
313
+ # Paginated Interfaces
314
+ if loading.get('pagination', 0) > 0:
315
+ methods.append("pagination")
316
+ js_snippets.append(
317
+ """
318
+ const nextPage = document.querySelector('a[rel="next"], .pagination-next, .page-next');
319
+ if (nextPage) {
320
+ nextPage.click();
321
+ await new Promise(resolve => setTimeout(resolve, 1000));
322
+ } else {
323
+ console.warn("No pagination link found.");
324
+ }
325
+ """.strip().replace('\n', '')
326
+ )
327
+
328
+ # Lazy Loading
329
+ if loading.get('lazy_loading', 0) > 0:
330
+ methods.append("lazy")
331
+ js_snippets.append(
332
+ """
333
+ if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
334
+ console.log('Framework state detected. Consider monitoring network requests for further actions.');
335
+ }
336
+ """.strip().replace('\n', '')
337
+ )
338
+
339
+ # Framework and State Management Indicators
340
+ if framework['framework_roots'] > 0 or structural['state_management'] > 0:
341
+ methods.append("stateful")
342
+ js_snippets.append(
343
+ """
344
+ if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
345
+ console.log('Detected stateful framework data loading.');
346
+ }
347
+ """.strip().replace('\n', '')
348
+ )
349
+
350
+ # API-Driven Content
351
+ if structural['api_endpoints'] > 0:
352
+ methods.append("api")
353
+ js_snippets.append(
354
+ """
355
+ console.log('API requests detected. Use browser devtools to inspect network activity for specific endpoints.');
356
+ """.strip().replace('\n', '')
357
+ )
358
+
359
+ # Aggregate and finalize
360
+ if methods:
361
+ js_code = "\n".join(js_snippets)
362
+ return True, js_code
363
 
364
+ return False, None
365
 
366
+ # Main execution
367
+ soup = BeautifulSoup(html_content, 'html.parser')
368
 
369
+ # Run all checks
370
+ structural_scores = _check_structural_indicators(soup)
371
+ framework_scores = _check_modern_framework_indicators(soup)
372
+ loading_scores = _check_dynamic_loading_patterns(soup)
373
 
374
+ # Evaluate results
375
+ return _evaluate_dynamic_indicators(structural_scores, framework_scores, loading_scores)
376
+
377
+ async def crawl(
378
+ self,
379
+ url,
380
+ depth=2,
381
+ max_pages=5,
382
+ session_id=None,
383
+ human_simulation=True,
384
+ rotate_user_agent=True,
385
+ rotate_proxy=True,
386
+ return_html=False
387
+ ):
388
+ if not session_id:
389
+ session_id = self.create_session()
390
+
391
+ session_id = self.rotate_session(session_id)
392
+
393
+ # List of rotating user agents
394
+ user_agents = [
395
+ 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
396
+ 'Chrome/115.0.0.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
397
+ 'Chrome/115.0.0.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
398
+ 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
399
+ 'Chrome/115.0.0.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
400
+ ]
401
+
402
+ # List of rotating proxies
403
+ proxies = [
404
+ "http://50.62.183.123:80",
405
+ "http://104.129.60.84:6516",
406
+ "http://156.228.118.163:3128",
407
+ "http://142.111.104.97:6107",
408
+ "http://156.228.99.99:3128"
409
+ ]
410
+
411
+ try:
412
+ async with self.crawler as crawler:
413
+ # Rotate user agent and optimize headers for each attempt
414
+ headers = {
415
+ "User-Agent": random.choice(user_agents) if rotate_user_agent else user_agents[0],
416
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
417
+ "Accept-Language": "en-US,en;q=0.5",
418
+ "Accept-Encoding": "gzip, deflate",
419
+ "Connection": "keep-alive",
420
+ "Upgrade-Insecure-Requests": "1",
421
+ "Sec-Fetch-Dest": "document",
422
+ "Sec-Fetch-Mode": "navigate",
423
+ "Sec-Fetch-Site": "none",
424
+ "Sec-Fetch-User": "?1",
425
+ "Cache-Control": "max-age=0"
426
+ }
427
 
428
+ # Update crawler headers for rotation
429
+ crawler.crawler_strategy.headers = headers
430
+
431
+ if rotate_proxy:
432
+ # Update crawler proxy for rotation
433
+ crawler.crawler_strategy.proxy = random.choice(proxies)
434
+
435
+ result_1 = await crawler.arun(
436
+ session_id=session_id,
437
+ url=url,
438
+ magic=True if human_simulation else False,
439
+ simulate_user=True if human_simulation else False,
440
+ override_navigator=True if human_simulation else False,
441
+ depth=depth,
442
+ max_pages=max_pages,
443
+ bypass_cache=True,
444
+ remove_overlay_elements=True,
445
+ delay_before_retrieve_html=1.0,
446
+ verbose=self.verbose
447
+ )
448
+
449
+ # Update session metrics
450
+ self.session_pool[session_id]['requests_count'] += 1
451
+ self.session_pool[session_id]['last_used'] = datetime.now()
452
+
453
+ if result_1.success:
454
+ if hasattr(result_1, 'html'):
455
+ success, js_code = self.is_dynamic_page(result_1.html)
456
+
457
+ if success:
458
+ async with crawler as crawler:
459
+ # Update crawler headers for rotation
460
+ crawler.crawler_strategy.headers = headers
461
+
462
+ if rotate_proxy:
463
+ # Update crawler proxy for rotation
464
+ crawler.crawler_strategy.proxy = random.choice(proxies)
465
+
466
+ print(f"Executing JS code: {js_code}")
467
+ result_2 = await crawler.arun(
468
+ session_id=session_id,
469
+ url=url,
470
+ magic=True if human_simulation else False,
471
+ simulate_user=True if human_simulation else False,
472
+ override_navigator=True if human_simulation else False,
473
+ depth=depth,
474
+ max_pages=max_pages,
475
+ js_code=js_code,
476
+ bypass_cache=True,
477
+ remove_overlay_elements=True,
478
+ delay_before_retrieve_html=1.0,
479
+ verbose=self.verbose
480
+ )
481
+
482
+ if result_2.success:
483
+ result = result_2
484
+ else:
485
+ result = result_1
486
+
487
+ # Update session metrics
488
+ self.session_pool[session_id]['requests_count'] += 1
489
+ self.session_pool[session_id]['last_used'] = datetime.now()
490
+
491
+ else:
492
+ result = result_1
493
 
494
+ if return_html and hasattr(result, 'html'):
495
+ return result.html
496
+ elif hasattr(result, 'fit_markdown'):
497
+ return result.fit_markdown
498
+ elif hasattr(result, 'markdown'):
499
+ return self.extract_content(result.markdown)
500
 
501
+ except Exception as e:
502
+ print(f"Error crawling {url}: {str(e)}")
503
 
504
+ return None
505
 
506
+ async def crawl_with_retry(
507
+ self,
508
+ url,
509
+ depth=2,
510
+ max_pages=5,
511
+ max_retries=3,
512
+ backoff_factor=1,
513
+ session_id=None,
514
+ human_simulation=True,
515
+ rotate_user_agent=True,
516
+ rotate_proxy=True,
517
+ return_html=False,
518
+ timeout=10.0
519
+ ):
520
+ """Crawl with retry logic and anti-blocking measures"""
521
+
522
+ async def attempt_crawl(attempt):
523
+ try:
524
+ async with async_timeout.timeout(timeout):
525
+ context = await self.get_browser_context(session_id)
526
+ return await self.crawl(
527
+ context,
528
+ url,
529
+ depth,
530
+ max_pages,
531
+ session_id,
532
+ human_simulation,
533
+ rotate_user_agent,
534
+ rotate_proxy,
535
+ return_html
536
+ )
537
+ except asyncio.TimeoutError:
538
+ print(f"Timeout on attempt {attempt} for {url}")
539
+ raise
540
+ except Exception as e:
541
+ print(f"Error on attempt {attempt} for {url}: {e}")
542
+ raise
543
+
544
+ if not self.is_valid_url(url) and not self.is_html_url(url):
545
+ print(f"Invalid URL: {url}")
546
+ return f"No web results found for query: {url}"
547
+
548
+ for attempt in range(max_retries):
549
+ try:
550
+ if attempt > 0:
551
+ # Add delay between retries with exponential backoff
552
+ delay = backoff_factor * (2 ** (attempt - 1))
553
+ await asyncio.sleep(delay)
554
 
555
+ return await attempt_crawl(attempt + 1)
556
+ except Exception as e:
557
+ if attempt == max_retries - 1:
558
+ print(f"Max retries ({max_retries}) reached for {url}")
559
+ return f"Failed to crawl after {max_retries} attempts: {url}"
560
+ continue
561
+
562
+ return f"No content found after {max_retries} attempts for: {url}"
563
+
564
+ def extract_content(self, html_content):
565
+ soup = BeautifulSoup(html_content, 'html.parser')
566
+ for script in soup(["script", "style"]):
567
+ script.decompose()
568
+ text = soup.get_text()
569
+ lines = (line.strip() for line in text.splitlines())
570
+ chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
571
+ text = '\n'.join(chunk for chunk in chunks if chunk)
572
+ return text
573
 
574
+ def cleanup_session(self, session_id):
575
+ """Clean up a session"""
576
+ print(f"Cleaning up session {session_id}")
577
+ if session_id in self.session_pool:
578
+ self.crawler.crawler_strategy.kill_session(session_id)
579
+ del self.session_pool[session_id]
580
+
581
+ def cleanup_expired_sessions(self):
582
+ """Regular cleanup of expired sessions using proper time calculation"""
583
+ try:
584
+ current_time = datetime.now()
585
+ expired_sessions = []
586
 
587
+ for sid, data in self.session_pool.items():
588
+ # Calculate time difference in seconds
589
+ time_diff = (current_time - data['last_used']).total_seconds()
590
 
591
+ # Check if more than 1 hour (3600 seconds)
592
+ if time_diff > 3600:
593
+ expired_sessions.append(sid)
594
 
595
+ # Cleanup expired sessions
596
+ for session_id in expired_sessions:
597
+ self.cleanup_session(session_id)
598
 
599
+ except Exception as e:
600
+ if self.verbose:
601
+ print(f"Error during session cleanup: {str(e)}")
602
 
603
+ @staticmethod
604
+ def is_valid_url(url):
605
+ try:
606
+ result = urlparse(url)
607
+ return all([result.scheme, result.netloc])
608
+ except ValueError:
609
+ return False
610
 
611
+ @staticmethod
612
+ def is_html_url(url):
613
+ return url.endswith(".html") or url.endswith(".htm")
614
 
615
  class CustomCrawler:
616
  def __init__(
src/rag/neo4j_graphrag.py CHANGED
@@ -12,7 +12,7 @@ from src.query_processing.query_processor import QueryProcessor
  from src.reasoning.reasoner import Reasoner
  from src.utils.api_key_manager import APIKeyManager
  from src.search.search_engine import SearchEngine
- from src.crawl.crawler import Crawler, CustomCrawler
+ from src.crawl.crawler import CustomCrawler #, Crawler
  from sentence_transformers import SentenceTransformer
  from bert_score.scorer import BERTScorer
  import numpy as np