Hemang Thakur committed
Commit 83e870c · 1 Parent(s): b0bc4d2

commented out crawl4ai in neo4j file

Files changed (2)
  1. src/crawl/crawler.py +566 -566
  2. src/rag/neo4j_graphrag.py +1 -1
src/crawl/crawler.py CHANGED
@@ -1,17 +1,17 @@
1
- # from crawl4ai import AsyncWebCrawler
2
- # from urllib.parse import urlparse
3
  import aiohttp
4
  import asyncio
5
- # from asyncio.exceptions import TimeoutError as async_timeout
6
  from fast_async import make_async
7
  from bs4 import BeautifulSoup, NavigableString
8
- # import secrets
9
- # from datetime import datetime
10
- # import random
11
  import os
12
  import re
13
  import uuid
14
- from typing import List, Dict, Optional #, Tuple
15
  from io import BytesIO
16
  import PyPDF2
17
  from fake_useragent import FakeUserAgent
@@ -20,597 +20,597 @@ from transformers import AutoTokenizer, AutoConfig
20
  import torch
21
  import time
22
 
23
- # class Crawler:
24
- # def __init__(self, user_dir=None, rate_limit=1, headless=True, verbose=False):
25
- # self.session_pool = {} # Track active sessions
26
- # self.verbose = verbose
27
- # self.rate_limit = rate_limit
28
- # self.user_dir = user_dir
29
- # self.headless = headless
30
- # self.crawler = AsyncWebCrawler(
31
- # context_options={"userDataDir": self.user_dir},
32
- # headless=self.headless,
33
- # verbose=self.verbose
34
- # )
35
-
36
- # # Browser context management
37
- # self._browser_contexts = {}
38
- # self._context_locks = {}
39
-
40
- # async def get_browser_context(self, session_id):
41
- # """Get or create a browser context with proper locking"""
42
- # if session_id not in self._context_locks:
43
- # self._context_locks[session_id] = asyncio.Lock()
44
 
45
- # async with self._context_locks[session_id]:
46
- # if session_id not in self._browser_contexts:
47
- # context = await self.crawler.new_context()
48
- # self._browser_contexts[session_id] = context
49
- # return self._browser_contexts[session_id]
50
 
51
- # async def cleanup_browser_context(self, session_id):
52
- # """Safely cleanup browser context"""
53
- # if session_id in self._context_locks:
54
- # async with self._context_locks[session_id]:
55
- # if session_id in self._browser_contexts:
56
- # try:
57
- # await asyncio.shield(
58
- # self._browser_contexts[session_id].close()
59
- # )
60
- # except Exception as e:
61
- # print(f"Error cleaning up browser context: {e}")
62
- # finally:
63
- # del self._browser_contexts[session_id]
64
-
65
- # def create_session(self):
66
- # """Create a new session with secure ID"""
67
- # session_id = secrets.token_urlsafe(32) # Secure session ID
68
- # self.session_pool[session_id] = {
69
- # 'created_at': datetime.now(),
70
- # 'last_used': datetime.now(),
71
- # 'requests_count': 0
72
- # }
73
- # return session_id
74
-
75
- # def rotate_session(self, session_id):
76
- # """Implement session rotation logic"""
77
- # if self.session_pool[session_id]['requests_count'] > 100:
78
- # self.cleanup_session(session_id)
79
- # return self.create_session()
80
- # return session_id
81
-
82
- # def is_dynamic_page(self, html_content: str) -> Tuple[bool, Optional[str]]:
83
- # """Analyzes HTML content to determine if a webpage is dynamically loaded"""
84
- # def _check_structural_indicators(soup: BeautifulSoup) -> Dict[str, int]:
85
- # """Check structural indicators of dynamic content loading."""
86
- # scores = {
87
- # 'empty_containers': 0,
88
- # 'repeated_structures': 0,
89
- # 'api_endpoints': 0,
90
- # 'state_management': 0
91
- # }
92
 
93
- # # 1. Check for empty content containers
94
- # main_containers = soup.find_all(['main', 'div', 'section'],
95
- # class_=lambda x: x and any(term in str(x).lower()
96
- # for term in ['content', 'main', 'feed', 'list', 'container']))
97
 
98
- # for container in main_containers:
99
- # # Check if container is empty or has minimal content
100
- # if len(container.find_all()) < 3:
101
- # scores['empty_containers'] += 1
102
 
103
- # # Check for repeated similar structures (common in dynamic lists)
104
- # children = container.find_all(recursive=False)
105
- # if children:
106
- # first_child_class = children[0].get('class', [])
107
- # similar_siblings = [c for c in children[1:]
108
- # if c.get('class', []) == first_child_class]
109
- # if len(similar_siblings) > 0:
110
- # scores['repeated_structures'] += 1
111
-
112
- # # 2. Check for API endpoints in scripts
113
- # scripts = soup.find_all('script', {'src': True})
114
- # api_patterns = ['/api/', '/graphql', '/rest/', '/v1/', '/v2/']
115
- # for script in scripts:
116
- # if any(pattern in script['src'] for pattern in api_patterns):
117
- # scores['api_endpoints'] += 1
118
-
119
- # # 3. Look for state management setup
120
- # state_patterns = [
121
- # r'window\.__INITIAL_STATE__',
122
- # r'window\.__PRELOADED_STATE__',
123
- # r'__REDUX_STATE__',
124
- # r'__NUXT__',
125
- # r'__NEXT_DATA__',
126
- # r'window\.__data'
127
- # ]
128
 
129
- # inline_scripts = soup.find_all('script')
130
- # for script in inline_scripts:
131
- # if script.string:
132
- # for pattern in state_patterns:
133
- # if re.search(pattern, script.string):
134
- # scores['state_management'] += 1
135
-
136
- # return scores
137
-
138
- # def _check_modern_framework_indicators(soup: BeautifulSoup) -> Dict[str, int]:
139
- # """Check for indicators of modern web frameworks and dynamic loading patterns."""
140
- # scores = {
141
- # 'framework_roots': 0,
142
- # 'hydration': 0,
143
- # 'routing': 0
144
- # }
145
 
146
- # # 1. Framework-specific root elements
147
- # framework_roots = {
148
- # 'react': ['react-root', 'react-app', 'root', '__next'],
149
- # 'angular': ['ng-version', 'ng-app'],
150
- # 'vue': ['v-app', '#app', 'nuxt-app'],
151
- # 'modern': ['app-root', 'application', 'spa-root']
152
- # }
153
 
154
- # for framework, identifiers in framework_roots.items():
155
- # for id_value in identifiers:
156
- # if (soup.find(attrs={'id': re.compile(id_value, re.I)}) or
157
- # soup.find(attrs={'class': re.compile(id_value, re.I)}) or
158
- # soup.find(attrs={'data-': re.compile(id_value, re.I)})):
159
- # scores['framework_roots'] += 1
160
-
161
- # # 2. Check for hydration indicators
162
- # hydration_patterns = [
163
- # r'hydrate',
164
- # r'createRoot',
165
- # r'reactive',
166
- # r'observable'
167
- # ]
168
 
169
- # scripts = soup.find_all('script')
170
- # for script in scripts:
171
- # if script.string:
172
- # for pattern in hydration_patterns:
173
- # if re.search(pattern, script.string):
174
- # scores['hydration'] += 1
175
-
176
- # # 3. Check for dynamic routing setup
177
- # router_patterns = [
178
- # 'router-view',
179
- # 'router-link',
180
- # 'route-link',
181
- # 'history.push',
182
- # 'navigation'
183
- # ]
184
 
185
- # for pattern in router_patterns:
186
- # if soup.find(class_=re.compile(pattern, re.I)) or \
187
- # soup.find(id=re.compile(pattern, re.I)):
188
- # scores['routing'] += 1
189
-
190
- # return scores
191
-
192
- # def _check_dynamic_loading_patterns(soup: BeautifulSoup) -> Dict[str, int]:
193
- # """Check for various dynamic content loading patterns."""
194
- # scores = {
195
- # 'infinite_scroll': 0,
196
- # 'load_more_buttons': 0,
197
- # 'pagination': 0,
198
- # 'lazy_loading': 0,
199
- # 'loading_indicators': 0
200
- # }
201
 
202
- # # 1. Check for infinite scroll indicators
203
- # scroll_indicators = [
204
- # 'infinite-scroll',
205
- # 'data-infinite',
206
- # 'data-virtualized',
207
- # 'virtual-scroll',
208
- # 'scroll-container',
209
- # 'scroll-viewport'
210
- # ]
211
 
212
- # for indicator in scroll_indicators:
213
- # elements = soup.find_all(
214
- # lambda tag: any(indicator.lower() in str(v).lower()
215
- # for v in tag.attrs.values())
216
- # )
217
- # if elements:
218
- # scores['infinite_scroll'] += len(elements)
219
-
220
- # # 2. Check for load more buttons
221
- # button_patterns = [
222
- # r'load[_-]?more',
223
- # r'show[_-]?more',
224
- # r'view[_-]?more',
225
- # r'see[_-]?more',
226
- # r'more[_-]?posts',
227
- # r'more[_-]?results'
228
- # ]
229
 
230
- # for pattern in button_patterns:
231
- # elements = soup.find_all(
232
- # ['button', 'a', 'div', 'span'],
233
- # text=re.compile(pattern, re.I)
234
- # )
235
- # if elements:
236
- # scores['load_more_buttons'] += len(elements)
237
-
238
- # # 3. Check for pagination
239
- # pagination_patterns = [
240
- # 'pagination',
241
- # 'page-numbers',
242
- # 'page-nav',
243
- # 'page-links'
244
- # ]
245
 
246
- # for pattern in pagination_patterns:
247
- # elements = soup.find_all(class_=re.compile(pattern, re.I))
248
- # if elements:
249
- # scores['pagination'] += len(elements)
250
-
251
- # # 4. Check for lazy loading
252
- # lazy_patterns = ['lazy', 'data-src', 'data-lazy']
253
- # for pattern in lazy_patterns:
254
- # elements = soup.find_all(
255
- # lambda tag: any(pattern.lower() in str(v).lower()
256
- # for v in tag.attrs.values())
257
- # )
258
- # if elements:
259
- # scores['lazy_loading'] += len(elements)
260
-
261
- # # 5. Check for loading indicators
262
- # loading_patterns = [
263
- # 'loading',
264
- # 'spinner',
265
- # 'skeleton',
266
- # 'placeholder',
267
- # 'shimmer'
268
- # ]
269
 
270
- # for pattern in loading_patterns:
271
- # elements = soup.find_all(class_=re.compile(pattern, re.I))
272
- # if elements:
273
- # scores['loading_indicators'] += len(elements)
274
-
275
- # return scores
276
-
277
- # def _evaluate_dynamic_indicators(
278
- # structural: Dict[str, int],
279
- # framework: Dict[str, int],
280
- # loading: Dict[str, int]
281
- # ) -> Tuple[bool, Optional[str]]:
282
- # """Evaluate dynamic indicators and return JavaScript instructions."""
283
- # methods = []
284
- # js_snippets = []
285
-
286
- # # Infinite Scroll
287
- # if loading['infinite_scroll'] > 0:
288
- # methods.append("scroll")
289
- # js_snippets.append(
290
- # """
291
- # window.scrollTo(0, document.body.scrollHeight);
292
- # await new Promise(resolve => setTimeout(resolve, 1000));
293
- # """.strip().replace('\n', '')
294
- # )
295
-
296
- # # Load More Buttons
297
- # if loading['load_more_buttons'] > 0:
298
- # methods.append("button")
299
- # js_snippets.append(
300
- # """
301
- # const button = Array.from(document.querySelectorAll('button, a, div, span')).find(
302
- # el => /load[_-]?more|show[_-]?more/i.test(el.textContent)
303
- # );
304
- # if (button) {
305
- # button.click();
306
- # await new Promise(resolve => setTimeout(resolve, 1000));
307
- # } else {
308
- # console.warn("No 'Load More' button found.");
309
- # }
310
- # """.strip().replace('\n', '')
311
- # )
312
-
313
- # # Paginated Interfaces
314
- # if loading.get('pagination', 0) > 0:
315
- # methods.append("pagination")
316
- # js_snippets.append(
317
- # """
318
- # const nextPage = document.querySelector('a[rel="next"], .pagination-next, .page-next');
319
- # if (nextPage) {
320
- # nextPage.click();
321
- # await new Promise(resolve => setTimeout(resolve, 1000));
322
- # } else {
323
- # console.warn("No pagination link found.");
324
- # }
325
- # """.strip().replace('\n', '')
326
- # )
327
-
328
- # # Lazy Loading
329
- # if loading.get('lazy_loading', 0) > 0:
330
- # methods.append("lazy")
331
- # js_snippets.append(
332
- # """
333
- # if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
334
- # console.log('Framework state detected. Consider monitoring network requests for further actions.');
335
- # }
336
- # """.strip().replace('\n', '')
337
- # )
338
-
339
- # # Framework and State Management Indicators
340
- # if framework['framework_roots'] > 0 or structural['state_management'] > 0:
341
- # methods.append("stateful")
342
- # js_snippets.append(
343
- # """
344
- # if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
345
- # console.log('Detected stateful framework data loading.');
346
- # }
347
- # """.strip().replace('\n', '')
348
- # )
349
-
350
- # # API-Driven Content
351
- # if structural['api_endpoints'] > 0:
352
- # methods.append("api")
353
- # js_snippets.append(
354
- # """
355
- # console.log('API requests detected. Use browser devtools to inspect network activity for specific endpoints.');
356
- # """.strip().replace('\n', '')
357
- # )
358
-
359
- # # Aggregate and finalize
360
- # if methods:
361
- # js_code = "\n".join(js_snippets)
362
- # return True, js_code
363
 
364
- # return False, None
365
 
366
- # # Main execution
367
- # soup = BeautifulSoup(html_content, 'html.parser')
368
 
369
- # # Run all checks
370
- # structural_scores = _check_structural_indicators(soup)
371
- # framework_scores = _check_modern_framework_indicators(soup)
372
- # loading_scores = _check_dynamic_loading_patterns(soup)
373
 
374
- # # Evaluate results
375
- # return _evaluate_dynamic_indicators(structural_scores, framework_scores, loading_scores)
376
-
377
- # async def crawl(
378
- # self,
379
- # url,
380
- # depth=2,
381
- # max_pages=5,
382
- # session_id=None,
383
- # human_simulation=True,
384
- # rotate_user_agent=True,
385
- # rotate_proxy=True,
386
- # return_html=False
387
- # ):
388
- # if not session_id:
389
- # session_id = self.create_session()
390
-
391
- # session_id = self.rotate_session(session_id)
392
-
393
- # # List of rotating user agents
394
- # user_agents = [
395
- # 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
396
- # 'Chrome/115.0.0.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
397
- # 'Chrome/115.0.0.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
398
- # 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
399
- # 'Chrome/115.0.0.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
400
- # ]
401
-
402
- # # List of rotating proxies
403
- # proxies = [
404
- # "http://50.62.183.123:80",
405
- # "http://104.129.60.84:6516",
406
- # "http://156.228.118.163:3128",
407
- # "http://142.111.104.97:6107",
408
- # "http://156.228.99.99:3128"
409
- # ]
410
-
411
- # try:
412
- # async with self.crawler as crawler:
413
- # # Rotate user agent and optimize headers for each attempt
414
- # headers = {
415
- # "User-Agent": random.choice(user_agents) if rotate_user_agent else user_agents[0],
416
- # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
417
- # "Accept-Language": "en-US,en;q=0.5",
418
- # "Accept-Encoding": "gzip, deflate",
419
- # "Connection": "keep-alive",
420
- # "Upgrade-Insecure-Requests": "1",
421
- # "Sec-Fetch-Dest": "document",
422
- # "Sec-Fetch-Mode": "navigate",
423
- # "Sec-Fetch-Site": "none",
424
- # "Sec-Fetch-User": "?1",
425
- # "Cache-Control": "max-age=0"
426
- # }
427
 
428
- # # Update crawler headers for rotation
429
- # crawler.crawler_strategy.headers = headers
430
-
431
- # if rotate_proxy:
432
- # # Update crawler proxy for rotation
433
- # crawler.crawler_strategy.proxy = random.choice(proxies)
434
-
435
- # result_1 = await crawler.arun(
436
- # session_id=session_id,
437
- # url=url,
438
- # magic=True if human_simulation else False,
439
- # simulate_user=True if human_simulation else False,
440
- # override_navigator=True if human_simulation else False,
441
- # depth=depth,
442
- # max_pages=max_pages,
443
- # bypass_cache=True,
444
- # remove_overlay_elements=True,
445
- # delay_before_retrieve_html=1.0,
446
- # verbose=self.verbose
447
- # )
448
-
449
- # # Update session metrics
450
- # self.session_pool[session_id]['requests_count'] += 1
451
- # self.session_pool[session_id]['last_used'] = datetime.now()
452
-
453
- # if result_1.success:
454
- # if hasattr(result_1, 'html'):
455
- # success, js_code = self.is_dynamic_page(result_1.html)
456
-
457
- # if success:
458
- # async with crawler as crawler:
459
- # # Update crawler headers for rotation
460
- # crawler.crawler_strategy.headers = headers
461
-
462
- # if rotate_proxy:
463
- # # Update crawler proxy for rotation
464
- # crawler.crawler_strategy.proxy = random.choice(proxies)
465
-
466
- # print(f"Executing JS code: {js_code}")
467
- # result_2 = await crawler.arun(
468
- # session_id=session_id,
469
- # url=url,
470
- # magic=True if human_simulation else False,
471
- # simulate_user=True if human_simulation else False,
472
- # override_navigator=True if human_simulation else False,
473
- # depth=depth,
474
- # max_pages=max_pages,
475
- # js_code=js_code,
476
- # bypass_cache=True,
477
- # remove_overlay_elements=True,
478
- # delay_before_retrieve_html=1.0,
479
- # verbose=self.verbose
480
- # )
481
-
482
- # if result_2.success:
483
- # result = result_2
484
- # else:
485
- # result = result_1
486
-
487
- # # Update session metrics
488
- # self.session_pool[session_id]['requests_count'] += 1
489
- # self.session_pool[session_id]['last_used'] = datetime.now()
490
-
491
- # else:
492
- # result = result_1
493
 
494
- # if return_html and hasattr(result, 'html'):
495
- # return result.html
496
- # elif hasattr(result, 'fit_markdown'):
497
- # return result.fit_markdown
498
- # elif hasattr(result, 'markdown'):
499
- # return self.extract_content(result.markdown)
500
 
501
- # except Exception as e:
502
- # print(f"Error crawling {url}: {str(e)}")
503
 
504
- # return None
505
 
506
- # async def crawl_with_retry(
507
- # self,
508
- # url,
509
- # depth=2,
510
- # max_pages=5,
511
- # max_retries=3,
512
- # backoff_factor=1,
513
- # session_id=None,
514
- # human_simulation=True,
515
- # rotate_user_agent=True,
516
- # rotate_proxy=True,
517
- # return_html=False,
518
- # timeout=10.0
519
- # ):
520
- # """Crawl with retry logic and anti-blocking measures"""
521
-
522
- # async def attempt_crawl(attempt):
523
- # try:
524
- # async with async_timeout.timeout(timeout):
525
- # context = await self.get_browser_context(session_id)
526
- # return await self.crawl(
527
- # context,
528
- # url,
529
- # depth,
530
- # max_pages,
531
- # session_id,
532
- # human_simulation,
533
- # rotate_user_agent,
534
- # rotate_proxy,
535
- # return_html
536
- # )
537
- # except asyncio.TimeoutError:
538
- # print(f"Timeout on attempt {attempt} for {url}")
539
- # raise
540
- # except Exception as e:
541
- # print(f"Error on attempt {attempt} for {url}: {e}")
542
- # raise
543
-
544
- # if not self.is_valid_url(url) and not self.is_html_url(url):
545
- # print(f"Invalid URL: {url}")
546
- # return f"No web results found for query: {url}"
547
-
548
- # for attempt in range(max_retries):
549
- # try:
550
- # if attempt > 0:
551
- # # Add delay between retries with exponential backoff
552
- # delay = backoff_factor * (2 ** (attempt - 1))
553
- # await asyncio.sleep(delay)
554
 
555
- # return await attempt_crawl(attempt + 1)
556
- # except Exception as e:
557
- # if attempt == max_retries - 1:
558
- # print(f"Max retries ({max_retries}) reached for {url}")
559
- # return f"Failed to crawl after {max_retries} attempts: {url}"
560
- # continue
561
-
562
- # return f"No content found after {max_retries} attempts for: {url}"
563
-
564
- # def extract_content(self, html_content):
565
- # soup = BeautifulSoup(html_content, 'html.parser')
566
- # for script in soup(["script", "style"]):
567
- # script.decompose()
568
- # text = soup.get_text()
569
- # lines = (line.strip() for line in text.splitlines())
570
- # chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
571
- # text = '\n'.join(chunk for chunk in chunks if chunk)
572
- # return text
573
 
574
- # def cleanup_session(self, session_id):
575
- # """Clean up a session"""
576
- # print(f"Cleaning up session {session_id}")
577
- # if session_id in self.session_pool:
578
- # self.crawler.crawler_strategy.kill_session(session_id)
579
- # del self.session_pool[session_id]
580
-
581
- # def cleanup_expired_sessions(self):
582
- # """Regular cleanup of expired sessions using proper time calculation"""
583
- # try:
584
- # current_time = datetime.now()
585
- # expired_sessions = []
586
 
587
- # for sid, data in self.session_pool.items():
588
- # # Calculate time difference in seconds
589
- # time_diff = (current_time - data['last_used']).total_seconds()
590
 
591
- # # Check if more than 1 hour (3600 seconds)
592
- # if time_diff > 3600:
593
- # expired_sessions.append(sid)
594
 
595
- # # Cleanup expired sessions
596
- # for session_id in expired_sessions:
597
- # self.cleanup_session(session_id)
598
 
599
- # except Exception as e:
600
- # if self.verbose:
601
- # print(f"Error during session cleanup: {str(e)}")
602
 
603
- # @staticmethod
604
- # def is_valid_url(url):
605
- # try:
606
- # result = urlparse(url)
607
- # return all([result.scheme, result.netloc])
608
- # except ValueError:
609
- # return False
610
 
611
- # @staticmethod
612
- # def is_html_url(url):
613
- # return url.endswith(".html") or url.endswith(".htm")
614
 
615
  class CustomCrawler:
616
  def __init__(
 
1
+ from crawl4ai import AsyncWebCrawler
2
+ from urllib.parse import urlparse
3
  import aiohttp
4
  import asyncio
5
+ from asyncio.exceptions import TimeoutError as async_timeout
6
  from fast_async import make_async
7
  from bs4 import BeautifulSoup, NavigableString
8
+ import secrets
9
+ from datetime import datetime
10
+ import random
11
  import os
12
  import re
13
  import uuid
14
+ from typing import List, Dict, Tuple, Optional
15
  from io import BytesIO
16
  import PyPDF2
17
  from fake_useragent import FakeUserAgent
 
20
  import torch
21
  import time
22
 
23
+ class Crawler:
24
+ def __init__(self, user_dir=None, rate_limit=1, headless=True, verbose=False):
25
+ self.session_pool = {} # Track active sessions
26
+ self.verbose = verbose
27
+ self.rate_limit = rate_limit
28
+ self.user_dir = user_dir
29
+ self.headless = headless
30
+ self.crawler = AsyncWebCrawler(
31
+ context_options={"userDataDir": self.user_dir},
32
+ headless=self.headless,
33
+ verbose=self.verbose
34
+ )
35
+
36
+ # Browser context management
37
+ self._browser_contexts = {}
38
+ self._context_locks = {}
39
+
40
+ async def get_browser_context(self, session_id):
41
+ """Get or create a browser context with proper locking"""
42
+ if session_id not in self._context_locks:
43
+ self._context_locks[session_id] = asyncio.Lock()
44
 
45
+ async with self._context_locks[session_id]:
46
+ if session_id not in self._browser_contexts:
47
+ context = await self.crawler.new_context()
48
+ self._browser_contexts[session_id] = context
49
+ return self._browser_contexts[session_id]
50
 
51
+ async def cleanup_browser_context(self, session_id):
52
+ """Safely cleanup browser context"""
53
+ if session_id in self._context_locks:
54
+ async with self._context_locks[session_id]:
55
+ if session_id in self._browser_contexts:
56
+ try:
57
+ await asyncio.shield(
58
+ self._browser_contexts[session_id].close()
59
+ )
60
+ except Exception as e:
61
+ print(f"Error cleaning up browser context: {e}")
62
+ finally:
63
+ del self._browser_contexts[session_id]
64
+
65
+ def create_session(self):
66
+ """Create a new session with secure ID"""
67
+ session_id = secrets.token_urlsafe(32) # Secure session ID
68
+ self.session_pool[session_id] = {
69
+ 'created_at': datetime.now(),
70
+ 'last_used': datetime.now(),
71
+ 'requests_count': 0
72
+ }
73
+ return session_id
74
+
75
+ def rotate_session(self, session_id):
76
+ """Implement session rotation logic"""
77
+ if self.session_pool[session_id]['requests_count'] > 100:
78
+ self.cleanup_session(session_id)
79
+ return self.create_session()
80
+ return session_id
81
+
82
+ def is_dynamic_page(self, html_content: str) -> Tuple[bool, Optional[str]]:
83
+ """Analyzes HTML content to determine if a webpage is dynamically loaded"""
84
+ def _check_structural_indicators(soup: BeautifulSoup) -> Dict[str, int]:
85
+ """Check structural indicators of dynamic content loading."""
86
+ scores = {
87
+ 'empty_containers': 0,
88
+ 'repeated_structures': 0,
89
+ 'api_endpoints': 0,
90
+ 'state_management': 0
91
+ }
92
 
93
+ # 1. Check for empty content containers
94
+ main_containers = soup.find_all(['main', 'div', 'section'],
95
+ class_=lambda x: x and any(term in str(x).lower()
96
+ for term in ['content', 'main', 'feed', 'list', 'container']))
97
 
98
+ for container in main_containers:
99
+ # Check if container is empty or has minimal content
100
+ if len(container.find_all()) < 3:
101
+ scores['empty_containers'] += 1
102
 
103
+ # Check for repeated similar structures (common in dynamic lists)
104
+ children = container.find_all(recursive=False)
105
+ if children:
106
+ first_child_class = children[0].get('class', [])
107
+ similar_siblings = [c for c in children[1:]
108
+ if c.get('class', []) == first_child_class]
109
+ if len(similar_siblings) > 0:
110
+ scores['repeated_structures'] += 1
111
+
112
+ # 2. Check for API endpoints in scripts
113
+ scripts = soup.find_all('script', {'src': True})
114
+ api_patterns = ['/api/', '/graphql', '/rest/', '/v1/', '/v2/']
115
+ for script in scripts:
116
+ if any(pattern in script['src'] for pattern in api_patterns):
117
+ scores['api_endpoints'] += 1
118
+
119
+ # 3. Look for state management setup
120
+ state_patterns = [
121
+ r'window\.__INITIAL_STATE__',
122
+ r'window\.__PRELOADED_STATE__',
123
+ r'__REDUX_STATE__',
124
+ r'__NUXT__',
125
+ r'__NEXT_DATA__',
126
+ r'window\.__data'
127
+ ]
128
 
129
+ inline_scripts = soup.find_all('script')
130
+ for script in inline_scripts:
131
+ if script.string:
132
+ for pattern in state_patterns:
133
+ if re.search(pattern, script.string):
134
+ scores['state_management'] += 1
135
+
136
+ return scores
137
+
138
+ def _check_modern_framework_indicators(soup: BeautifulSoup) -> Dict[str, int]:
139
+ """Check for indicators of modern web frameworks and dynamic loading patterns."""
140
+ scores = {
141
+ 'framework_roots': 0,
142
+ 'hydration': 0,
143
+ 'routing': 0
144
+ }
145
 
146
+ # 1. Framework-specific root elements
147
+ framework_roots = {
148
+ 'react': ['react-root', 'react-app', 'root', '__next'],
149
+ 'angular': ['ng-version', 'ng-app'],
150
+ 'vue': ['v-app', '#app', 'nuxt-app'],
151
+ 'modern': ['app-root', 'application', 'spa-root']
152
+ }
153
 
154
+ for framework, identifiers in framework_roots.items():
155
+ for id_value in identifiers:
156
+ if (soup.find(attrs={'id': re.compile(id_value, re.I)}) or
157
+ soup.find(attrs={'class': re.compile(id_value, re.I)}) or
158
+ soup.find(attrs={'data-': re.compile(id_value, re.I)})):
159
+ scores['framework_roots'] += 1
160
+
161
+ # 2. Check for hydration indicators
162
+ hydration_patterns = [
163
+ r'hydrate',
164
+ r'createRoot',
165
+ r'reactive',
166
+ r'observable'
167
+ ]
168
 
169
+ scripts = soup.find_all('script')
170
+ for script in scripts:
171
+ if script.string:
172
+ for pattern in hydration_patterns:
173
+ if re.search(pattern, script.string):
174
+ scores['hydration'] += 1
175
+
176
+ # 3. Check for dynamic routing setup
177
+ router_patterns = [
178
+ 'router-view',
179
+ 'router-link',
180
+ 'route-link',
181
+ 'history.push',
182
+ 'navigation'
183
+ ]
184
 
185
+ for pattern in router_patterns:
186
+ if soup.find(class_=re.compile(pattern, re.I)) or \
187
+ soup.find(id=re.compile(pattern, re.I)):
188
+ scores['routing'] += 1
189
+
190
+ return scores
191
+
192
+ def _check_dynamic_loading_patterns(soup: BeautifulSoup) -> Dict[str, int]:
193
+ """Check for various dynamic content loading patterns."""
194
+ scores = {
195
+ 'infinite_scroll': 0,
196
+ 'load_more_buttons': 0,
197
+ 'pagination': 0,
198
+ 'lazy_loading': 0,
199
+ 'loading_indicators': 0
200
+ }
201
 
202
+ # 1. Check for infinite scroll indicators
203
+ scroll_indicators = [
204
+ 'infinite-scroll',
205
+ 'data-infinite',
206
+ 'data-virtualized',
207
+ 'virtual-scroll',
208
+ 'scroll-container',
209
+ 'scroll-viewport'
210
+ ]
211
 
212
+ for indicator in scroll_indicators:
213
+ elements = soup.find_all(
214
+ lambda tag: any(indicator.lower() in str(v).lower()
215
+ for v in tag.attrs.values())
216
+ )
217
+ if elements:
218
+ scores['infinite_scroll'] += len(elements)
219
+
220
+ # 2. Check for load more buttons
221
+ button_patterns = [
222
+ r'load[_-]?more',
223
+ r'show[_-]?more',
224
+ r'view[_-]?more',
225
+ r'see[_-]?more',
226
+ r'more[_-]?posts',
227
+ r'more[_-]?results'
228
+ ]
229
 
230
+ for pattern in button_patterns:
231
+ elements = soup.find_all(
232
+ ['button', 'a', 'div', 'span'],
233
+ text=re.compile(pattern, re.I)
234
+ )
235
+ if elements:
236
+ scores['load_more_buttons'] += len(elements)
237
+
238
+ # 3. Check for pagination
239
+ pagination_patterns = [
240
+ 'pagination',
241
+ 'page-numbers',
242
+ 'page-nav',
243
+ 'page-links'
244
+ ]
245
 
246
+ for pattern in pagination_patterns:
247
+ elements = soup.find_all(class_=re.compile(pattern, re.I))
248
+ if elements:
249
+ scores['pagination'] += len(elements)
250
+
251
+ # 4. Check for lazy loading
252
+ lazy_patterns = ['lazy', 'data-src', 'data-lazy']
253
+ for pattern in lazy_patterns:
254
+ elements = soup.find_all(
255
+ lambda tag: any(pattern.lower() in str(v).lower()
256
+ for v in tag.attrs.values())
257
+ )
258
+ if elements:
259
+ scores['lazy_loading'] += len(elements)
260
+
261
+ # 5. Check for loading indicators
262
+ loading_patterns = [
263
+ 'loading',
264
+ 'spinner',
265
+ 'skeleton',
266
+ 'placeholder',
267
+ 'shimmer'
268
+ ]
269
 
270
+ for pattern in loading_patterns:
271
+ elements = soup.find_all(class_=re.compile(pattern, re.I))
272
+ if elements:
273
+ scores['loading_indicators'] += len(elements)
274
+
275
+ return scores
276
+
277
+ def _evaluate_dynamic_indicators(
278
+ structural: Dict[str, int],
279
+ framework: Dict[str, int],
280
+ loading: Dict[str, int]
281
+ ) -> Tuple[bool, Optional[str]]:
282
+ """Evaluate dynamic indicators and return JavaScript instructions."""
283
+ methods = []
284
+ js_snippets = []
285
+
286
+ # Infinite Scroll
287
+ if loading['infinite_scroll'] > 0:
288
+ methods.append("scroll")
289
+ js_snippets.append(
290
+ """
291
+ window.scrollTo(0, document.body.scrollHeight);
292
+ await new Promise(resolve => setTimeout(resolve, 1000));
293
+ """.strip().replace('\n', '')
294
+ )
295
+
296
+ # Load More Buttons
297
+ if loading['load_more_buttons'] > 0:
298
+ methods.append("button")
299
+ js_snippets.append(
300
+ """
301
+ const button = Array.from(document.querySelectorAll('button, a, div, span')).find(
302
+ el => /load[_-]?more|show[_-]?more/i.test(el.textContent)
303
+ );
304
+ if (button) {
305
+ button.click();
306
+ await new Promise(resolve => setTimeout(resolve, 1000));
307
+ } else {
308
+ console.warn("No 'Load More' button found.");
309
+ }
310
+ """.strip().replace('\n', '')
311
+ )
312
+
313
+ # Paginated Interfaces
314
+ if loading.get('pagination', 0) > 0:
315
+ methods.append("pagination")
316
+ js_snippets.append(
317
+ """
318
+ const nextPage = document.querySelector('a[rel="next"], .pagination-next, .page-next');
319
+ if (nextPage) {
320
+ nextPage.click();
321
+ await new Promise(resolve => setTimeout(resolve, 1000));
322
+ } else {
323
+ console.warn("No pagination link found.");
324
+ }
325
+ """.strip().replace('\n', '')
326
+ )
327
+
328
+ # Lazy Loading
329
+ if loading.get('lazy_loading', 0) > 0:
330
+ methods.append("lazy")
331
+ js_snippets.append(
332
+ """
333
+ if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
334
+ console.log('Framework state detected. Consider monitoring network requests for further actions.');
335
+ }
336
+ """.strip().replace('\n', '')
337
+ )
338
+
339
+ # Framework and State Management Indicators
340
+ if framework['framework_roots'] > 0 or structural['state_management'] > 0:
341
+ methods.append("stateful")
342
+ js_snippets.append(
343
+ """
344
+ if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
345
+ console.log('Detected stateful framework data loading.');
346
+ }
347
+ """.strip().replace('\n', '')
348
+ )
349
+
350
+ # API-Driven Content
351
+ if structural['api_endpoints'] > 0:
352
+ methods.append("api")
353
+ js_snippets.append(
354
+ """
355
+ console.log('API requests detected. Use browser devtools to inspect network activity for specific endpoints.');
356
+ """.strip().replace('\n', '')
357
+ )
358
+
359
+ # Aggregate and finalize
360
+ if methods:
361
+ js_code = "\n".join(js_snippets)
362
+ return True, js_code
363
 
364
+ return False, None
365
 
366
+ # Main execution
367
+ soup = BeautifulSoup(html_content, 'html.parser')
368
 
369
+ # Run all checks
370
+ structural_scores = _check_structural_indicators(soup)
371
+ framework_scores = _check_modern_framework_indicators(soup)
372
+ loading_scores = _check_dynamic_loading_patterns(soup)
373
 
374
+ # Evaluate results
375
+ return _evaluate_dynamic_indicators(structural_scores, framework_scores, loading_scores)
376
+
377
+ async def crawl(
378
+ self,
379
+ url,
380
+ depth=2,
381
+ max_pages=5,
382
+ session_id=None,
383
+ human_simulation=True,
384
+ rotate_user_agent=True,
385
+ rotate_proxy=True,
386
+ return_html=False
387
+ ):
388
+ if not session_id:
389
+ session_id = self.create_session()
390
+
391
+ session_id = self.rotate_session(session_id)
392
+
393
+ # List of rotating user agents
394
+ user_agents = [
395
+ 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
396
+ 'Chrome/115.0.0.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
397
+ 'Chrome/115.0.0.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
398
+ 'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
399
+ 'Chrome/115.0.0.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
400
+ ]
401
+
402
+ # List of rotating proxies
403
+ proxies = [
404
+ "http://50.62.183.123:80",
405
+ "http://104.129.60.84:6516",
406
+ "http://156.228.118.163:3128",
407
+ "http://142.111.104.97:6107",
408
+ "http://156.228.99.99:3128"
409
+ ]
410
+
411
+ try:
412
+ async with self.crawler as crawler:
413
+ # Rotate user agent and optimize headers for each attempt
414
+ headers = {
415
+ "User-Agent": random.choice(user_agents) if rotate_user_agent else user_agents[0],
416
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
417
+ "Accept-Language": "en-US,en;q=0.5",
418
+ "Accept-Encoding": "gzip, deflate",
419
+ "Connection": "keep-alive",
420
+ "Upgrade-Insecure-Requests": "1",
421
+ "Sec-Fetch-Dest": "document",
422
+ "Sec-Fetch-Mode": "navigate",
423
+ "Sec-Fetch-Site": "none",
424
+ "Sec-Fetch-User": "?1",
425
+ "Cache-Control": "max-age=0"
426
+ }
427
 
428
+ # Update crawler headers for rotation
429
+ crawler.crawler_strategy.headers = headers
430
+
431
+ if rotate_proxy:
432
+ # Update crawler proxy for rotation
433
+ crawler.crawler_strategy.proxy = random.choice(proxies)
434
+
435
+ result_1 = await crawler.arun(
436
+ session_id=session_id,
437
+ url=url,
438
+ magic=True if human_simulation else False,
439
+ simulate_user=True if human_simulation else False,
440
+ override_navigator=True if human_simulation else False,
441
+ depth=depth,
442
+ max_pages=max_pages,
443
+ bypass_cache=True,
444
+ remove_overlay_elements=True,
445
+ delay_before_retrieve_html=1.0,
446
+ verbose=self.verbose
447
+ )
448
+
449
+ # Update session metrics
450
+ self.session_pool[session_id]['requests_count'] += 1
451
+ self.session_pool[session_id]['last_used'] = datetime.now()
452
+
453
+ if result_1.success:
454
+ if hasattr(result_1, 'html'):
455
+ success, js_code = self.is_dynamic_page(result_1.html)
456
+
457
+ if success:
458
+ async with crawler as crawler:
459
+ # Update crawler headers for rotation
460
+ crawler.crawler_strategy.headers = headers
461
+
462
+ if rotate_proxy:
463
+ # Update crawler proxy for rotation
464
+ crawler.crawler_strategy.proxy = random.choice(proxies)
465
+
466
+ print(f"Executing JS code: {js_code}")
467
+ result_2 = await crawler.arun(
468
+ session_id=session_id,
469
+ url=url,
470
+ magic=True if human_simulation else False,
471
+ simulate_user=True if human_simulation else False,
472
+ override_navigator=True if human_simulation else False,
473
+ depth=depth,
474
+ max_pages=max_pages,
475
+ js_code=js_code,
476
+ bypass_cache=True,
477
+ remove_overlay_elements=True,
478
+ delay_before_retrieve_html=1.0,
479
+ verbose=self.verbose
480
+ )
481
+
482
+ if result_2.success:
483
+ result = result_2
484
+ else:
485
+ result = result_1
486
+
487
+ # Update session metrics
488
+ self.session_pool[session_id]['requests_count'] += 1
489
+ self.session_pool[session_id]['last_used'] = datetime.now()
490
+
491
+ else:
492
+ result = result_1
493
 
494
+ if return_html and hasattr(result, 'html'):
495
+ return result.html
496
+ elif hasattr(result, 'fit_markdown'):
497
+ return result.fit_markdown
498
+ elif hasattr(result, 'markdown'):
499
+ return self.extract_content(result.markdown)
500
 
501
+ except Exception as e:
502
+ print(f"Error crawling {url}: {str(e)}")
503
 
504
+ return None
505
 
506
+ async def crawl_with_retry(
507
+ self,
508
+ url,
509
+ depth=2,
510
+ max_pages=5,
511
+ max_retries=3,
512
+ backoff_factor=1,
513
+ session_id=None,
514
+ human_simulation=True,
515
+ rotate_user_agent=True,
516
+ rotate_proxy=True,
517
+ return_html=False,
518
+ timeout=10.0
519
+ ):
520
+ """Crawl with retry logic and anti-blocking measures"""
521
+
522
+ async def attempt_crawl(attempt):
523
+ try:
524
+ async with async_timeout.timeout(timeout):
525
+ context = await self.get_browser_context(session_id)
526
+ return await self.crawl(
527
+ context,
528
+ url,
529
+ depth,
530
+ max_pages,
531
+ session_id,
532
+ human_simulation,
533
+ rotate_user_agent,
534
+ rotate_proxy,
535
+ return_html
536
+ )
537
+ except asyncio.TimeoutError:
538
+ print(f"Timeout on attempt {attempt} for {url}")
539
+ raise
540
+ except Exception as e:
541
+ print(f"Error on attempt {attempt} for {url}: {e}")
542
+ raise
543
+
544
+ if not self.is_valid_url(url) and not self.is_html_url(url):
545
+ print(f"Invalid URL: {url}")
546
+ return f"No web results found for query: {url}"
547
+
548
+ for attempt in range(max_retries):
549
+ try:
550
+ if attempt > 0:
551
+ # Add delay between retries with exponential backoff
552
+ delay = backoff_factor * (2 ** (attempt - 1))
553
+ await asyncio.sleep(delay)
554
 
555
+ return await attempt_crawl(attempt + 1)
556
+ except Exception as e:
557
+ if attempt == max_retries - 1:
558
+ print(f"Max retries ({max_retries}) reached for {url}")
559
+ return f"Failed to crawl after {max_retries} attempts: {url}"
560
+ continue
561
+
562
+ return f"No content found after {max_retries} attempts for: {url}"
563
+
564
+ def extract_content(self, html_content):
565
+ soup = BeautifulSoup(html_content, 'html.parser')
566
+ for script in soup(["script", "style"]):
567
+ script.decompose()
568
+ text = soup.get_text()
569
+ lines = (line.strip() for line in text.splitlines())
570
+ chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
571
+ text = '\n'.join(chunk for chunk in chunks if chunk)
572
+ return text
573
 
574
+ def cleanup_session(self, session_id):
575
+ """Clean up a session"""
576
+ print(f"Cleaning up session {session_id}")
577
+ if session_id in self.session_pool:
578
+ self.crawler.crawler_strategy.kill_session(session_id)
579
+ del self.session_pool[session_id]
580
+
581
+ def cleanup_expired_sessions(self):
582
+ """Regular cleanup of expired sessions using proper time calculation"""
583
+ try:
584
+ current_time = datetime.now()
585
+ expired_sessions = []
586
 
587
+ for sid, data in self.session_pool.items():
588
+ # Calculate time difference in seconds
589
+ time_diff = (current_time - data['last_used']).total_seconds()
590
 
591
+ # Check if more than 1 hour (3600 seconds)
592
+ if time_diff > 3600:
593
+ expired_sessions.append(sid)
594
 
595
+ # Cleanup expired sessions
596
+ for session_id in expired_sessions:
597
+ self.cleanup_session(session_id)
598
 
599
+ except Exception as e:
600
+ if self.verbose:
601
+ print(f"Error during session cleanup: {str(e)}")
602
 
603
+ @staticmethod
604
+ def is_valid_url(url):
605
+ try:
606
+ result = urlparse(url)
607
+ return all([result.scheme, result.netloc])
608
+ except ValueError:
609
+ return False
610
 
611
+ @staticmethod
612
+ def is_html_url(url):
613
+ return url.endswith(".html") or url.endswith(".htm")
614
 
615
  class CustomCrawler:
616
  def __init__(
src/rag/neo4j_graphrag.py CHANGED
@@ -12,7 +12,7 @@ from src.query_processing.query_processor import QueryProcessor
  from src.reasoning.reasoner import Reasoner
  from src.utils.api_key_manager import APIKeyManager
  from src.search.search_engine import SearchEngine
- from src.crawl.crawler import Crawler, CustomCrawler
+ from src.crawl.crawler import CustomCrawler #, Crawler
  from sentence_transformers import SentenceTransformer
  from bert_score.scorer import BERTScorer
  import numpy as np