Spaces (Paused) · Hemang Thakur
committed on · Commit e0ab505 · 1 parent: 83e870c

commented out crawl4ai

Browse files · src/crawl/crawler.py (+566 −566)

src/crawl/crawler.py CHANGED
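In effect, this commit comments out every crawl4ai-dependent piece of crawler.py — the AsyncWebCrawler import and the entire Crawler class — so the module no longer requires crawl4ai at import time; only the CustomCrawler class further down (unchanged here) remains active. As a hedged illustration of what that means for callers (the module path, and passing keyword arguments through, are assumptions, not part of this commit):

    # Hypothetical caller-side guard; not part of this commit.
    try:
        from src.crawl.crawler import Crawler  # gone after e0ab505 (commented out)
    except ImportError:
        Crawler = None

    from src.crawl.crawler import CustomCrawler  # still defined

    def make_crawler(**kwargs):
        # Fall back to CustomCrawler now that the crawl4ai-backed class is disabled.
        cls = Crawler if Crawler is not None else CustomCrawler
        return cls(**kwargs)

The full diff follows.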
@@ -1,17 +1,17 @@
-from crawl4ai import AsyncWebCrawler
-from urllib.parse import urlparse
+# from crawl4ai import AsyncWebCrawler
+# from urllib.parse import urlparse
 import aiohttp
 import asyncio
-from asyncio.exceptions import TimeoutError as async_timeout
+# from asyncio.exceptions import TimeoutError as async_timeout
 from fast_async import make_async
 from bs4 import BeautifulSoup, NavigableString
-import secrets
-from datetime import datetime
-import random
+# import secrets
+# from datetime import datetime
+# import random
 import os
 import re
 import uuid
-from typing import List, Dict, Tuple
+from typing import List, Dict, Optional #, Tuple
 from io import BytesIO
 import PyPDF2
 from fake_useragent import FakeUserAgent

@@ -20,597 +20,597 @@ from transformers import AutoTokenizer, AutoConfig
 import torch
 import time
 
-class Crawler:
-    ... (old lines 24-613: the active body of the Crawler class, deleted in this commit; every deleted line reappears below, commented out with "# ")
+# class Crawler:
+#     def __init__(self, user_dir=None, rate_limit=1, headless=True, verbose=False):
+#         self.session_pool = {}  # Track active sessions
+#         self.verbose = verbose
+#         self.rate_limit = rate_limit
+#         self.user_dir = user_dir
+#         self.headless = headless
+#         self.crawler = AsyncWebCrawler(
+#             context_options={"userDataDir": self.user_dir},
+#             headless=self.headless,
+#             verbose=self.verbose
+#         )
+
+#         # Browser context management
+#         self._browser_contexts = {}
+#         self._context_locks = {}
+
+#     async def get_browser_context(self, session_id):
+#         """Get or create a browser context with proper locking"""
+#         if session_id not in self._context_locks:
+#             self._context_locks[session_id] = asyncio.Lock()
 
+#         async with self._context_locks[session_id]:
+#             if session_id not in self._browser_contexts:
+#                 context = await self.crawler.new_context()
+#                 self._browser_contexts[session_id] = context
+#             return self._browser_contexts[session_id]
 
+#     async def cleanup_browser_context(self, session_id):
+#         """Safely cleanup browser context"""
+#         if session_id in self._context_locks:
+#             async with self._context_locks[session_id]:
+#                 if session_id in self._browser_contexts:
+#                     try:
+#                         await asyncio.shield(
+#                             self._browser_contexts[session_id].close()
+#                         )
+#                     except Exception as e:
+#                         print(f"Error cleaning up browser context: {e}")
+#                     finally:
+#                         del self._browser_contexts[session_id]
+
+#     def create_session(self):
+#         """Create a new session with secure ID"""
+#         session_id = secrets.token_urlsafe(32)  # Secure session ID
+#         self.session_pool[session_id] = {
+#             'created_at': datetime.now(),
+#             'last_used': datetime.now(),
+#             'requests_count': 0
+#         }
+#         return session_id
+
+#     def rotate_session(self, session_id):
+#         """Implement session rotation logic"""
+#         if self.session_pool[session_id]['requests_count'] > 100:
+#             self.cleanup_session(session_id)
+#             return self.create_session()
+#         return session_id
+
+#     def is_dynamic_page(self, html_content: str) -> Tuple[bool, Optional[str]]:
+#         """Analyzes HTML content to determine if a webpage is dynamically loaded"""
+#         def _check_structural_indicators(soup: BeautifulSoup) -> Dict[str, int]:
+#             """Check structural indicators of dynamic content loading."""
+#             scores = {
+#                 'empty_containers': 0,
+#                 'repeated_structures': 0,
+#                 'api_endpoints': 0,
+#                 'state_management': 0
+#             }
 
+#             # 1. Check for empty content containers
+#             main_containers = soup.find_all(['main', 'div', 'section'],
+#                 class_=lambda x: x and any(term in str(x).lower()
+#                 for term in ['content', 'main', 'feed', 'list', 'container']))
 
+#             for container in main_containers:
+#                 # Check if container is empty or has minimal content
+#                 if len(container.find_all()) < 3:
+#                     scores['empty_containers'] += 1
 
+#                 # Check for repeated similar structures (common in dynamic lists)
+#                 children = container.find_all(recursive=False)
+#                 if children:
+#                     first_child_class = children[0].get('class', [])
+#                     similar_siblings = [c for c in children[1:]
+#                                         if c.get('class', []) == first_child_class]
+#                     if len(similar_siblings) > 0:
+#                         scores['repeated_structures'] += 1
+
+#             # 2. Check for API endpoints in scripts
+#             scripts = soup.find_all('script', {'src': True})
+#             api_patterns = ['/api/', '/graphql', '/rest/', '/v1/', '/v2/']
+#             for script in scripts:
+#                 if any(pattern in script['src'] for pattern in api_patterns):
+#                     scores['api_endpoints'] += 1
+
+#             # 3. Look for state management setup
+#             state_patterns = [
+#                 r'window\.__INITIAL_STATE__',
+#                 r'window\.__PRELOADED_STATE__',
+#                 r'__REDUX_STATE__',
+#                 r'__NUXT__',
+#                 r'__NEXT_DATA__',
+#                 r'window\.__data'
+#             ]
 
+#             inline_scripts = soup.find_all('script')
+#             for script in inline_scripts:
+#                 if script.string:
+#                     for pattern in state_patterns:
+#                         if re.search(pattern, script.string):
+#                             scores['state_management'] += 1
+
+#             return scores
+
+#         def _check_modern_framework_indicators(soup: BeautifulSoup) -> Dict[str, int]:
+#             """Check for indicators of modern web frameworks and dynamic loading patterns."""
+#             scores = {
+#                 'framework_roots': 0,
+#                 'hydration': 0,
+#                 'routing': 0
+#             }
 
+#             # 1. Framework-specific root elements
+#             framework_roots = {
+#                 'react': ['react-root', 'react-app', 'root', '__next'],
+#                 'angular': ['ng-version', 'ng-app'],
+#                 'vue': ['v-app', '#app', 'nuxt-app'],
+#                 'modern': ['app-root', 'application', 'spa-root']
+#             }
 
+#             for framework, identifiers in framework_roots.items():
+#                 for id_value in identifiers:
+#                     if (soup.find(attrs={'id': re.compile(id_value, re.I)}) or
+#                         soup.find(attrs={'class': re.compile(id_value, re.I)}) or
+#                         soup.find(attrs={'data-': re.compile(id_value, re.I)})):
+#                         scores['framework_roots'] += 1
+
+#             # 2. Check for hydration indicators
+#             hydration_patterns = [
+#                 r'hydrate',
+#                 r'createRoot',
+#                 r'reactive',
+#                 r'observable'
+#             ]
 
+#             scripts = soup.find_all('script')
+#             for script in scripts:
+#                 if script.string:
+#                     for pattern in hydration_patterns:
+#                         if re.search(pattern, script.string):
+#                             scores['hydration'] += 1
+
+#             # 3. Check for dynamic routing setup
+#             router_patterns = [
+#                 'router-view',
+#                 'router-link',
+#                 'route-link',
+#                 'history.push',
+#                 'navigation'
+#             ]
 
+#             for pattern in router_patterns:
+#                 if soup.find(class_=re.compile(pattern, re.I)) or \
+#                    soup.find(id=re.compile(pattern, re.I)):
+#                     scores['routing'] += 1
+
+#             return scores
+
+#         def _check_dynamic_loading_patterns(soup: BeautifulSoup) -> Dict[str, int]:
+#             """Check for various dynamic content loading patterns."""
+#             scores = {
+#                 'infinite_scroll': 0,
+#                 'load_more_buttons': 0,
+#                 'pagination': 0,
+#                 'lazy_loading': 0,
+#                 'loading_indicators': 0
+#             }
 
+#             # 1. Check for infinite scroll indicators
+#             scroll_indicators = [
+#                 'infinite-scroll',
+#                 'data-infinite',
+#                 'data-virtualized',
+#                 'virtual-scroll',
+#                 'scroll-container',
+#                 'scroll-viewport'
+#             ]
 
+#             for indicator in scroll_indicators:
+#                 elements = soup.find_all(
+#                     lambda tag: any(indicator.lower() in str(v).lower()
+#                                     for v in tag.attrs.values())
+#                 )
+#                 if elements:
+#                     scores['infinite_scroll'] += len(elements)
+
+#             # 2. Check for load more buttons
+#             button_patterns = [
+#                 r'load[_-]?more',
+#                 r'show[_-]?more',
+#                 r'view[_-]?more',
+#                 r'see[_-]?more',
+#                 r'more[_-]?posts',
+#                 r'more[_-]?results'
+#             ]
 
+#             for pattern in button_patterns:
+#                 elements = soup.find_all(
+#                     ['button', 'a', 'div', 'span'],
+#                     text=re.compile(pattern, re.I)
+#                 )
+#                 if elements:
+#                     scores['load_more_buttons'] += len(elements)
+
+#             # 3. Check for pagination
+#             pagination_patterns = [
+#                 'pagination',
+#                 'page-numbers',
+#                 'page-nav',
+#                 'page-links'
+#             ]
 
+#             for pattern in pagination_patterns:
+#                 elements = soup.find_all(class_=re.compile(pattern, re.I))
+#                 if elements:
+#                     scores['pagination'] += len(elements)
+
+#             # 4. Check for lazy loading
+#             lazy_patterns = ['lazy', 'data-src', 'data-lazy']
+#             for pattern in lazy_patterns:
+#                 elements = soup.find_all(
+#                     lambda tag: any(pattern.lower() in str(v).lower()
+#                                     for v in tag.attrs.values())
+#                 )
+#                 if elements:
+#                     scores['lazy_loading'] += len(elements)
+
+#             # 5. Check for loading indicators
+#             loading_patterns = [
+#                 'loading',
+#                 'spinner',
+#                 'skeleton',
+#                 'placeholder',
+#                 'shimmer'
+#             ]
 
+#             for pattern in loading_patterns:
+#                 elements = soup.find_all(class_=re.compile(pattern, re.I))
+#                 if elements:
+#                     scores['loading_indicators'] += len(elements)
+
+#             return scores
+
+#         def _evaluate_dynamic_indicators(
+#             structural: Dict[str, int],
+#             framework: Dict[str, int],
+#             loading: Dict[str, int]
+#         ) -> Tuple[bool, Optional[str]]:
+#             """Evaluate dynamic indicators and return JavaScript instructions."""
+#             methods = []
+#             js_snippets = []
+
+#             # Infinite Scroll
+#             if loading['infinite_scroll'] > 0:
+#                 methods.append("scroll")
+#                 js_snippets.append(
+#                     """
+#                     window.scrollTo(0, document.body.scrollHeight);
+#                     await new Promise(resolve => setTimeout(resolve, 1000));
+#                     """.strip().replace('\n', '')
+#                 )
+
+#             # Load More Buttons
+#             if loading['load_more_buttons'] > 0:
+#                 methods.append("button")
+#                 js_snippets.append(
+#                     """
+#                     const button = Array.from(document.querySelectorAll('button, a, div, span')).find(
+#                         el => /load[_-]?more|show[_-]?more/i.test(el.textContent)
+#                     );
+#                     if (button) {
+#                         button.click();
+#                         await new Promise(resolve => setTimeout(resolve, 1000));
+#                     } else {
+#                         console.warn("No 'Load More' button found.");
+#                     }
+#                     """.strip().replace('\n', '')
+#                 )
+
+#             # Paginated Interfaces
+#             if loading.get('pagination', 0) > 0:
+#                 methods.append("pagination")
+#                 js_snippets.append(
+#                     """
+#                     const nextPage = document.querySelector('a[rel="next"], .pagination-next, .page-next');
+#                     if (nextPage) {
+#                         nextPage.click();
+#                         await new Promise(resolve => setTimeout(resolve, 1000));
+#                     } else {
+#                         console.warn("No pagination link found.");
+#                     }
+#                     """.strip().replace('\n', '')
+#                 )
+
+#             # Lazy Loading
+#             if loading.get('lazy_loading', 0) > 0:
+#                 methods.append("lazy")
+#                 js_snippets.append(
+#                     """
+#                     if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
+#                         console.log('Framework state detected. Consider monitoring network requests for further actions.');
+#                     }
+#                     """.strip().replace('\n', '')
+#                 )
+
+#             # Framework and State Management Indicators
+#             if framework['framework_roots'] > 0 or structural['state_management'] > 0:
+#                 methods.append("stateful")
+#                 js_snippets.append(
+#                     """
+#                     if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
+#                         console.log('Detected stateful framework data loading.');
+#                     }
+#                     """.strip().replace('\n', '')
+#                 )
+
+#             # API-Driven Content
+#             if structural['api_endpoints'] > 0:
+#                 methods.append("api")
+#                 js_snippets.append(
+#                     """
+#                     console.log('API requests detected. Use browser devtools to inspect network activity for specific endpoints.');
+#                     """.strip().replace('\n', '')
+#                 )
+
+#             # Aggregate and finalize
+#             if methods:
+#                 js_code = "\n".join(js_snippets)
+#                 return True, js_code
 
+#             return False, None
 
+#         # Main execution
+#         soup = BeautifulSoup(html_content, 'html.parser')
 
+#         # Run all checks
+#         structural_scores = _check_structural_indicators(soup)
+#         framework_scores = _check_modern_framework_indicators(soup)
+#         loading_scores = _check_dynamic_loading_patterns(soup)
 
+#         # Evaluate results
+#         return _evaluate_dynamic_indicators(structural_scores, framework_scores, loading_scores)
+
+#     async def crawl(
+#         self,
+#         url,
+#         depth=2,
+#         max_pages=5,
+#         session_id=None,
+#         human_simulation=True,
+#         rotate_user_agent=True,
+#         rotate_proxy=True,
+#         return_html=False
+#     ):
+#         if not session_id:
+#             session_id = self.create_session()
+
+#         session_id = self.rotate_session(session_id)
+
+#         # List of rotating user agents
+#         user_agents = [
+#             'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+#             'Chrome/115.0.0.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+#             'Chrome/115.0.0.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+#             'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+#             'Chrome/115.0.0.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
+#         ]
+
+#         # List of rotating proxies
+#         proxies = [
+#             "http://50.62.183.123:80",
+#             "http://104.129.60.84:6516",
+#             "http://156.228.118.163:3128",
+#             "http://142.111.104.97:6107",
+#             "http://156.228.99.99:3128"
+#         ]
+
+#         try:
+#             async with self.crawler as crawler:
+#                 # Rotate user agent and optimize headers for each attempt
+#                 headers = {
+#                     "User-Agent": random.choice(user_agents) if rotate_user_agent else user_agents[0],
+#                     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+#                     "Accept-Language": "en-US,en;q=0.5",
+#                     "Accept-Encoding": "gzip, deflate",
+#                     "Connection": "keep-alive",
+#                     "Upgrade-Insecure-Requests": "1",
+#                     "Sec-Fetch-Dest": "document",
+#                     "Sec-Fetch-Mode": "navigate",
+#                     "Sec-Fetch-Site": "none",
+#                     "Sec-Fetch-User": "?1",
+#                     "Cache-Control": "max-age=0"
+#                 }
 
+#                 # Update crawler headers for rotation
+#                 crawler.crawler_strategy.headers = headers
+
+#                 if rotate_proxy:
+#                     # Update crawler proxy for rotation
+#                     crawler.crawler_strategy.proxy = random.choice(proxies)
+
+#                 result_1 = await crawler.arun(
+#                     session_id=session_id,
+#                     url=url,
+#                     magic=True if human_simulation else False,
+#                     simulate_user=True if human_simulation else False,
+#                     override_navigator=True if human_simulation else False,
+#                     depth=depth,
+#                     max_pages=max_pages,
+#                     bypass_cache=True,
+#                     remove_overlay_elements=True,
+#                     delay_before_retrieve_html=1.0,
+#                     verbose=self.verbose
+#                 )
+
+#                 # Update session metrics
+#                 self.session_pool[session_id]['requests_count'] += 1
+#                 self.session_pool[session_id]['last_used'] = datetime.now()
+
+#                 if result_1.success:
+#                     if hasattr(result_1, 'html'):
+#                         success, js_code = self.is_dynamic_page(result_1.html)
+
+#                         if success:
+#                             async with crawler as crawler:
+#                                 # Update crawler headers for rotation
+#                                 crawler.crawler_strategy.headers = headers
+
+#                                 if rotate_proxy:
+#                                     # Update crawler proxy for rotation
+#                                     crawler.crawler_strategy.proxy = random.choice(proxies)
+
+#                                 print(f"Executing JS code: {js_code}")
+#                                 result_2 = await crawler.arun(
+#                                     session_id=session_id,
+#                                     url=url,
+#                                     magic=True if human_simulation else False,
+#                                     simulate_user=True if human_simulation else False,
+#                                     override_navigator=True if human_simulation else False,
+#                                     depth=depth,
+#                                     max_pages=max_pages,
+#                                     js_code=js_code,
+#                                     bypass_cache=True,
+#                                     remove_overlay_elements=True,
+#                                     delay_before_retrieve_html=1.0,
+#                                     verbose=self.verbose
+#                                 )
+
+#                                 if result_2.success:
+#                                     result = result_2
+#                                 else:
+#                                     result = result_1
+
+#                                 # Update session metrics
+#                                 self.session_pool[session_id]['requests_count'] += 1
+#                                 self.session_pool[session_id]['last_used'] = datetime.now()
+
+#                         else:
+#                             result = result_1
 
+#             if return_html and hasattr(result, 'html'):
+#                 return result.html
+#             elif hasattr(result, 'fit_markdown'):
+#                 return result.fit_markdown
+#             elif hasattr(result, 'markdown'):
+#                 return self.extract_content(result.markdown)
 
+#         except Exception as e:
+#             print(f"Error crawling {url}: {str(e)}")
 
+#             return None
 
+#     async def crawl_with_retry(
+#         self,
+#         url,
+#         depth=2,
+#         max_pages=5,
+#         max_retries=3,
+#         backoff_factor=1,
+#         session_id=None,
+#         human_simulation=True,
+#         rotate_user_agent=True,
+#         rotate_proxy=True,
+#         return_html=False,
+#         timeout=10.0
+#     ):
+#         """Crawl with retry logic and anti-blocking measures"""
+
+#         async def attempt_crawl(attempt):
+#             try:
+#                 async with async_timeout.timeout(timeout):
+#                     context = await self.get_browser_context(session_id)
+#                     return await self.crawl(
+#                         context,
+#                         url,
+#                         depth,
+#                         max_pages,
+#                         session_id,
+#                         human_simulation,
+#                         rotate_user_agent,
+#                         rotate_proxy,
+#                         return_html
+#                     )
+#             except asyncio.TimeoutError:
+#                 print(f"Timeout on attempt {attempt} for {url}")
+#                 raise
+#             except Exception as e:
+#                 print(f"Error on attempt {attempt} for {url}: {e}")
+#                 raise
+
+#         if not self.is_valid_url(url) and not self.is_html_url(url):
+#             print(f"Invalid URL: {url}")
+#             return f"No web results found for query: {url}"
+
+#         for attempt in range(max_retries):
+#             try:
+#                 if attempt > 0:
+#                     # Add delay between retries with exponential backoff
+#                     delay = backoff_factor * (2 ** (attempt - 1))
+#                     await asyncio.sleep(delay)
 
+#                 return await attempt_crawl(attempt + 1)
+#             except Exception as e:
+#                 if attempt == max_retries - 1:
+#                     print(f"Max retries ({max_retries}) reached for {url}")
+#                     return f"Failed to crawl after {max_retries} attempts: {url}"
+#                 continue
+
+#         return f"No content found after {max_retries} attempts for: {url}"
+
+#     def extract_content(self, html_content):
+#         soup = BeautifulSoup(html_content, 'html.parser')
+#         for script in soup(["script", "style"]):
+#             script.decompose()
+#         text = soup.get_text()
+#         lines = (line.strip() for line in text.splitlines())
+#         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+#         text = '\n'.join(chunk for chunk in chunks if chunk)
+#         return text
 
+#     def cleanup_session(self, session_id):
+#         """Clean up a session"""
+#         print(f"Cleaning up session {session_id}")
+#         if session_id in self.session_pool:
+#             self.crawler.crawler_strategy.kill_session(session_id)
+#             del self.session_pool[session_id]
+
+#     def cleanup_expired_sessions(self):
+#         """Regular cleanup of expired sessions using proper time calculation"""
+#         try:
+#             current_time = datetime.now()
+#             expired_sessions = []
 
+#             for sid, data in self.session_pool.items():
+#                 # Calculate time difference in seconds
+#                 time_diff = (current_time - data['last_used']).total_seconds()
 
+#                 # Check if more than 1 hour (3600 seconds)
+#                 if time_diff > 3600:
+#                     expired_sessions.append(sid)
 
+#             # Cleanup expired sessions
+#             for session_id in expired_sessions:
+#                 self.cleanup_session(session_id)
 
+#         except Exception as e:
+#             if self.verbose:
+#                 print(f"Error during session cleanup: {str(e)}")
 
+#     @staticmethod
+#     def is_valid_url(url):
+#         try:
+#             result = urlparse(url)
+#             return all([result.scheme, result.netloc])
+#         except ValueError:
+#             return False
 
+#     @staticmethod
+#     def is_html_url(url):
+#         return url.endswith(".html") or url.endswith(".htm")
 
 class CustomCrawler:
     def __init__(
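Two side notes on the disabled code. First, it carried a latent bug: asyncio.exceptions.TimeoutError was imported under the alias async_timeout, and crawl_with_retry then called async_timeout.timeout(timeout) — an exception class has no timeout() context manager, so re-enabling the code as-is would raise AttributeError there. A minimal standalone sketch of the intended pattern (a per-attempt timeout plus exponential backoff), using asyncio.wait_for and a hypothetical fetch coroutine in place of Crawler.crawl:

    import asyncio

    async def fetch(url: str) -> str:
        # Hypothetical stand-in for a real crawl call.
        await asyncio.sleep(0.1)
        return f"<html>{url}</html>"

    async def crawl_with_retry(url: str, max_retries: int = 3,
                               backoff_factor: float = 1.0, timeout: float = 10.0):
        for attempt in range(max_retries):
            if attempt > 0:
                # Exponential backoff: backoff_factor * 1, 2, 4, ... seconds.
                await asyncio.sleep(backoff_factor * (2 ** (attempt - 1)))
            try:
                # Bound each attempt with a real timeout.
                return await asyncio.wait_for(fetch(url), timeout=timeout)
            except Exception as e:
                if attempt == max_retries - 1:
                    return f"Failed to crawl after {max_retries} attempts: {url} ({e})"
        return None

    print(asyncio.run(crawl_with_retry("https://example.com")))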
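Second, the removed is_dynamic_page method is at heart a scoring pass: it counts structural, framework, and loading-pattern hits in the parsed HTML, and when any category scores above zero it returns follow-up JavaScript for a browser-based crawler to execute before re-reading the DOM. A condensed sketch of that idea (pattern lists abbreviated; a paraphrase, not the original implementation):

    import re
    from typing import Optional, Tuple
    from bs4 import BeautifulSoup

    def looks_dynamic(html: str) -> Tuple[bool, Optional[str]]:
        soup = BeautifulSoup(html, "html.parser")
        score = 0

        # SPA state globals injected into inline scripts (abbreviated list).
        state_patterns = [r"window\.__INITIAL_STATE__", r"__NUXT__", r"__NEXT_DATA__"]
        for script in soup.find_all("script"):
            text = script.string or ""
            if any(re.search(p, text) for p in state_patterns):
                score += 1

        # "Load more" affordances and lazy-loaded images.
        if soup.find(["button", "a"], string=re.compile(r"load[_-]?more", re.I)):
            score += 1
        if soup.find(attrs={"data-src": True}):
            score += 1

        if score:
            # JS to run before re-reading the DOM, e.g. to trigger infinite scroll.
            return True, "window.scrollTo(0, document.body.scrollHeight);"
        return False, None

    print(looks_dynamic("<html><script>window.__NEXT_DATA__ = {};</script></html>"))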