Spaces:

hemangthakur
/

seekr

Paused

File size: 41,026 Bytes

# from crawl4ai import AsyncWebCrawler
# from urllib.parse import urlparse
import aiohttp
import asyncio
# from asyncio.exceptions import TimeoutError as async_timeout
from fast_async import make_async
from bs4 import BeautifulSoup, NavigableString
# import secrets
# from datetime import datetime
# import random
import os
import re
import uuid
from typing import List, Dict, Optional #, Tuple
from io import BytesIO
import PyPDF2
from fake_useragent import FakeUserAgent
from htmlrag import clean_html, build_block_tree, EmbedHTMLPruner, BM25HTMLPruner
from transformers import AutoTokenizer, AutoConfig
import torch
import time

# class Crawler:
#     def __init__(self, user_dir=None, rate_limit=1, headless=True, verbose=False):
#         self.session_pool = {}  # Track active sessions
#         self.verbose = verbose
#         self.rate_limit = rate_limit
#         self.user_dir = user_dir
#         self.headless = headless
#         self.crawler = AsyncWebCrawler(
#             context_options={"userDataDir": self.user_dir}, 
#             headless=self.headless, 
#             verbose=self.verbose
#         )

#         # Browser context management
#         self._browser_contexts = {}
#         self._context_locks = {}

#     async def get_browser_context(self, session_id):
#         """Get or create a browser context with proper locking"""
#         if session_id not in self._context_locks:
#             self._context_locks[session_id] = asyncio.Lock()
        
#         async with self._context_locks[session_id]:
#             if session_id not in self._browser_contexts:
#                 context = await self.crawler.new_context()
#                 self._browser_contexts[session_id] = context
#             return self._browser_contexts[session_id]
        
#     async def cleanup_browser_context(self, session_id):
#         """Safely cleanup browser context"""
#         if session_id in self._context_locks:
#             async with self._context_locks[session_id]:
#                 if session_id in self._browser_contexts:
#                     try:
#                         await asyncio.shield(
#                             self._browser_contexts[session_id].close()
#                         )
#                     except Exception as e:
#                         print(f"Error cleaning up browser context: {e}")
#                     finally:
#                         del self._browser_contexts[session_id]

#     def create_session(self):
#         """Create a new session with secure ID"""
#         session_id = secrets.token_urlsafe(32)  # Secure session ID
#         self.session_pool[session_id] = {
#             'created_at': datetime.now(),
#             'last_used': datetime.now(),
#             'requests_count': 0
#         }
#         return session_id

#     def rotate_session(self, session_id):
#         """Implement session rotation logic"""
#         if self.session_pool[session_id]['requests_count'] > 100:
#             self.cleanup_session(session_id)
#             return self.create_session()
#         return session_id

#     def is_dynamic_page(self, html_content: str) -> Tuple[bool, Optional[str]]:
#         """Analyzes HTML content to determine if a webpage is dynamically loaded"""
#         def _check_structural_indicators(soup: BeautifulSoup) -> Dict[str, int]:
#             """Check structural indicators of dynamic content loading."""
#             scores = {
#                 'empty_containers': 0,
#                 'repeated_structures': 0,
#                 'api_endpoints': 0,
#                 'state_management': 0
#             }
            
#             # 1. Check for empty content containers
#             main_containers = soup.find_all(['main', 'div', 'section'], 
#                                         class_=lambda x: x and any(term in str(x).lower() 
#                                         for term in ['content', 'main', 'feed', 'list', 'container']))
            
#             for container in main_containers:
#                 # Check if container is empty or has minimal content
#                 if len(container.find_all()) < 3:
#                     scores['empty_containers'] += 1
                
#                 # Check for repeated similar structures (common in dynamic lists)
#                 children = container.find_all(recursive=False)
#                 if children:
#                     first_child_class = children[0].get('class', [])
#                     similar_siblings = [c for c in children[1:] 
#                                     if c.get('class', []) == first_child_class]
#                     if len(similar_siblings) > 0:
#                         scores['repeated_structures'] += 1

#             # 2. Check for API endpoints in scripts
#             scripts = soup.find_all('script', {'src': True})
#             api_patterns = ['/api/', '/graphql', '/rest/', '/v1/', '/v2/']
#             for script in scripts:
#                 if any(pattern in script['src'] for pattern in api_patterns):
#                     scores['api_endpoints'] += 1

#             # 3. Look for state management setup
#             state_patterns = [
#                 r'window\.__INITIAL_STATE__',
#                 r'window\.__PRELOADED_STATE__',
#                 r'__REDUX_STATE__',
#                 r'__NUXT__',
#                 r'__NEXT_DATA__',
#                 r'window\.__data'
#             ]
            
#             inline_scripts = soup.find_all('script')
#             for script in inline_scripts:
#                 if script.string:
#                     for pattern in state_patterns:
#                         if re.search(pattern, script.string):
#                             scores['state_management'] += 1

#             return scores

#         def _check_modern_framework_indicators(soup: BeautifulSoup) -> Dict[str, int]:
#             """Check for indicators of modern web frameworks and dynamic loading patterns."""
#             scores = {
#                 'framework_roots': 0,
#                 'hydration': 0,
#                 'routing': 0
#             }
            
#             # 1. Framework-specific root elements
#             framework_roots = {
#                 'react': ['react-root', 'react-app', 'root', '__next'],
#                 'angular': ['ng-version', 'ng-app'],
#                 'vue': ['v-app', '#app', 'nuxt-app'],
#                 'modern': ['app-root', 'application', 'spa-root']
#             }
            
#             for framework, identifiers in framework_roots.items():
#                 for id_value in identifiers:
#                     if (soup.find(attrs={'id': re.compile(id_value, re.I)}) or 
#                         soup.find(attrs={'class': re.compile(id_value, re.I)}) or
#                         soup.find(attrs={'data-': re.compile(id_value, re.I)})):
#                         scores['framework_roots'] += 1

#             # 2. Check for hydration indicators
#             hydration_patterns = [
#                 r'hydrate',
#                 r'createRoot',
#                 r'reactive',
#                 r'observable'
#             ]
            
#             scripts = soup.find_all('script')
#             for script in scripts:
#                 if script.string:
#                     for pattern in hydration_patterns:
#                         if re.search(pattern, script.string):
#                             scores['hydration'] += 1

#             # 3. Check for dynamic routing setup
#             router_patterns = [
#                 'router-view',
#                 'router-link',
#                 'route-link',
#                 'history.push',
#                 'navigation'
#             ]
            
#             for pattern in router_patterns:
#                 if soup.find(class_=re.compile(pattern, re.I)) or \
#                 soup.find(id=re.compile(pattern, re.I)):
#                     scores['routing'] += 1

#             return scores

#         def _check_dynamic_loading_patterns(soup: BeautifulSoup) -> Dict[str, int]:
#             """Check for various dynamic content loading patterns."""
#             scores = {
#                 'infinite_scroll': 0,
#                 'load_more_buttons': 0,
#                 'pagination': 0,
#                 'lazy_loading': 0,
#                 'loading_indicators': 0
#             }
            
#             # 1. Check for infinite scroll indicators
#             scroll_indicators = [
#                 'infinite-scroll',
#                 'data-infinite',
#                 'data-virtualized',
#                 'virtual-scroll',
#                 'scroll-container',
#                 'scroll-viewport'
#             ]
            
#             for indicator in scroll_indicators:
#                 elements = soup.find_all(
#                     lambda tag: any(indicator.lower() in str(v).lower() 
#                                 for v in tag.attrs.values())
#                 )
#                 if elements:
#                     scores['infinite_scroll'] += len(elements)

#             # 2. Check for load more buttons
#             button_patterns = [
#                 r'load[_-]?more',
#                 r'show[_-]?more',
#                 r'view[_-]?more',
#                 r'see[_-]?more',
#                 r'more[_-]?posts',
#                 r'more[_-]?results'
#             ]
            
#             for pattern in button_patterns:
#                 elements = soup.find_all(
#                     ['button', 'a', 'div', 'span'],
#                     text=re.compile(pattern, re.I)
#                 )
#                 if elements:
#                     scores['load_more_buttons'] += len(elements)

#             # 3. Check for pagination
#             pagination_patterns = [
#                 'pagination',
#                 'page-numbers',
#                 'page-nav',
#                 'page-links'
#             ]
            
#             for pattern in pagination_patterns:
#                 elements = soup.find_all(class_=re.compile(pattern, re.I))
#                 if elements:
#                     scores['pagination'] += len(elements)

#             # 4. Check for lazy loading
#             lazy_patterns = ['lazy', 'data-src', 'data-lazy']
#             for pattern in lazy_patterns:
#                 elements = soup.find_all(
#                     lambda tag: any(pattern.lower() in str(v).lower() 
#                                 for v in tag.attrs.values())
#                 )
#                 if elements:
#                     scores['lazy_loading'] += len(elements)

#             # 5. Check for loading indicators
#             loading_patterns = [
#                 'loading',
#                 'spinner',
#                 'skeleton',
#                 'placeholder',
#                 'shimmer'
#             ]
            
#             for pattern in loading_patterns:
#                 elements = soup.find_all(class_=re.compile(pattern, re.I))
#                 if elements:
#                     scores['loading_indicators'] += len(elements)

#             return scores

#         def _evaluate_dynamic_indicators(
#             structural: Dict[str, int],
#             framework: Dict[str, int],
#             loading: Dict[str, int]
#         ) -> Tuple[bool, Optional[str]]:
#             """Evaluate dynamic indicators and return JavaScript instructions."""
#             methods = []
#             js_snippets = []

#             # Infinite Scroll
#             if loading['infinite_scroll'] > 0:
#                 methods.append("scroll")
#                 js_snippets.append(
#                     """
# window.scrollTo(0, document.body.scrollHeight);
# await new Promise(resolve => setTimeout(resolve, 1000));
#                     """.strip().replace('\n', '')
#                 )

#             # Load More Buttons
#             if loading['load_more_buttons'] > 0:
#                 methods.append("button")
#                 js_snippets.append(
#                     """
# const button = Array.from(document.querySelectorAll('button, a, div, span')).find(
#     el => /load[_-]?more|show[_-]?more/i.test(el.textContent)
# );
# if (button) {
#     button.click();
#     await new Promise(resolve => setTimeout(resolve, 1000));
# } else {
#     console.warn("No 'Load More' button found.");
# }
#                     """.strip().replace('\n', '')
#                 )

#             # Paginated Interfaces
#             if loading.get('pagination', 0) > 0:
#                 methods.append("pagination")
#                 js_snippets.append(
#                     """
#   const nextPage = document.querySelector('a[rel="next"], .pagination-next, .page-next');
# if (nextPage) {
#     nextPage.click();
#     await new Promise(resolve => setTimeout(resolve, 1000));
# } else {
#     console.warn("No pagination link found.");
# }
#                     """.strip().replace('\n', '')
#                 )

#             # Lazy Loading
#             if loading.get('lazy_loading', 0) > 0:
#                 methods.append("lazy")
#                 js_snippets.append(
#                     """
# if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
#     console.log('Framework state detected. Consider monitoring network requests for further actions.');
# }
#                     """.strip().replace('\n', '')
#                 )

#             # Framework and State Management Indicators
#             if framework['framework_roots'] > 0 or structural['state_management'] > 0:
#                 methods.append("stateful")
#                 js_snippets.append(
#                     """
# if (window.__INITIAL_STATE__ || window.__REDUX_STATE__ || window.__NUXT__ || window.__NEXT_DATA__) {
#     console.log('Detected stateful framework data loading.');
#                     }
#                     """.strip().replace('\n', '')
#                 )

#             # API-Driven Content
#             if structural['api_endpoints'] > 0:
#                 methods.append("api")
#                 js_snippets.append(
#                     """
# console.log('API requests detected. Use browser devtools to inspect network activity for specific endpoints.');
#                     """.strip().replace('\n', '')
#                 )

#             # Aggregate and finalize
#             if methods:
#                 js_code = "\n".join(js_snippets)
#                 return True, js_code
            
#             return False, None

#         # Main execution
#         soup = BeautifulSoup(html_content, 'html.parser')
        
#         # Run all checks
#         structural_scores = _check_structural_indicators(soup)
#         framework_scores = _check_modern_framework_indicators(soup)
#         loading_scores = _check_dynamic_loading_patterns(soup)
        
#         # Evaluate results
#         return _evaluate_dynamic_indicators(structural_scores, framework_scores, loading_scores)

#     async def crawl(
#             self, 
#             url, 
#             depth=2, 
#             max_pages=5, 
#             session_id=None, 
#             human_simulation=True, 
#             rotate_user_agent=True, 
#             rotate_proxy=True, 
#             return_html=False
#         ):
#         if not session_id:
#             session_id = self.create_session()

#         session_id = self.rotate_session(session_id)

#         # List of rotating user agents
#         user_agents = [
#             'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
#             'Chrome/115.0.0.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', 
#             'Chrome/115.0.0.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
#             'Chrome/115.0.0.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
#             'Chrome/115.0.0.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
#         ]

#         # List of rotating proxies
#         proxies = [
#             "http://50.62.183.123:80",
#             "http://104.129.60.84:6516", 
#             "http://156.228.118.163:3128",
#             "http://142.111.104.97:6107",
#             "http://156.228.99.99:3128"
#         ]

#         try:
#             async with self.crawler as crawler:
#                 # Rotate user agent and optimize headers for each attempt
#                 headers = {
#                     "User-Agent": random.choice(user_agents) if rotate_user_agent else user_agents[0],
#                     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
#                     "Accept-Language": "en-US,en;q=0.5",
#                     "Accept-Encoding": "gzip, deflate",
#                     "Connection": "keep-alive",
#                     "Upgrade-Insecure-Requests": "1",
#                     "Sec-Fetch-Dest": "document",
#                     "Sec-Fetch-Mode": "navigate",
#                     "Sec-Fetch-Site": "none",
#                     "Sec-Fetch-User": "?1",
#                     "Cache-Control": "max-age=0"
#                 }
            
#                 # Update crawler headers for rotation
#                 crawler.crawler_strategy.headers = headers

#                 if rotate_proxy:
#                     # Update crawler proxy for rotation
#                     crawler.crawler_strategy.proxy = random.choice(proxies)

#                 result_1 = await crawler.arun(
#                     session_id=session_id,
#                     url=url,
#                     magic=True if human_simulation else False,
#                     simulate_user=True if human_simulation else False,
#                     override_navigator=True if human_simulation else False,
#                     depth=depth,
#                     max_pages=max_pages,
#                     bypass_cache=True,
#                     remove_overlay_elements=True,
#                     delay_before_retrieve_html=1.0,
#                     verbose=self.verbose
#                 )

#             # Update session metrics
#             self.session_pool[session_id]['requests_count'] += 1
#             self.session_pool[session_id]['last_used'] = datetime.now()

#             if result_1.success:
#                 if hasattr(result_1, 'html'):
#                     success, js_code = self.is_dynamic_page(result_1.html)

#                     if success:
#                         async with crawler as crawler:
#                             # Update crawler headers for rotation
#                             crawler.crawler_strategy.headers = headers

#                             if rotate_proxy:
#                                 # Update crawler proxy for rotation
#                                 crawler.crawler_strategy.proxy = random.choice(proxies)

#                             print(f"Executing JS code: {js_code}")
#                             result_2 = await crawler.arun(
#                                 session_id=session_id,
#                                 url=url,
#                                 magic=True if human_simulation else False,
#                                 simulate_user=True if human_simulation else False,
#                                 override_navigator=True if human_simulation else False,
#                                 depth=depth,
#                                 max_pages=max_pages,
#                                 js_code=js_code,
#                                 bypass_cache=True,
#                                 remove_overlay_elements=True,
#                                 delay_before_retrieve_html=1.0,
#                                 verbose=self.verbose
#                             )

#                         if result_2.success:
#                             result = result_2
#                         else:
#                             result = result_1

#                         # Update session metrics
#                         self.session_pool[session_id]['requests_count'] += 1
#                         self.session_pool[session_id]['last_used'] = datetime.now()

#                     else:
#                         result = result_1
            
#                     if return_html and hasattr(result, 'html'):
#                         return result.html
#                     elif hasattr(result, 'fit_markdown'):
#                         return result.fit_markdown
#                     elif hasattr(result, 'markdown'):
#                         return self.extract_content(result.markdown)
                
#         except Exception as e:
#             print(f"Error crawling {url}: {str(e)}")

#         return None
        
#     async def crawl_with_retry(
#             self, 
#             url, 
#             depth=2, 
#             max_pages=5, 
#             max_retries=3, 
#             backoff_factor=1, 
#             session_id=None,
#             human_simulation=True,
#             rotate_user_agent=True, 
#             rotate_proxy=True,
#             return_html=False,
#             timeout=10.0
#         ):
#         """Crawl with retry logic and anti-blocking measures"""

#         async def attempt_crawl(attempt):
#             try:
#                 async with async_timeout.timeout(timeout):
#                     context = await self.get_browser_context(session_id)
#                     return await self.crawl(
#                         context, 
#                         url, 
#                         depth, 
#                         max_pages, 
#                         session_id, 
#                         human_simulation, 
#                         rotate_user_agent,
#                         rotate_proxy,
#                         return_html
#                     )
#             except asyncio.TimeoutError:
#                 print(f"Timeout on attempt {attempt} for {url}")
#                 raise
#             except Exception as e:
#                 print(f"Error on attempt {attempt} for {url}: {e}")
#                 raise

#         if not self.is_valid_url(url) and not self.is_html_url(url):
#             print(f"Invalid URL: {url}")
#             return f"No web results found for query: {url}"

#         for attempt in range(max_retries):
#             try:
#                 if attempt > 0:
#                     # Add delay between retries with exponential backoff
#                     delay = backoff_factor * (2 ** (attempt - 1))
#                     await asyncio.sleep(delay)
                
#                 return await attempt_crawl(attempt + 1)
#             except Exception as e:
#                 if attempt == max_retries - 1:
#                     print(f"Max retries ({max_retries}) reached for {url}")
#                     return f"Failed to crawl after {max_retries} attempts: {url}"
#                 continue

#         return f"No content found after {max_retries} attempts for: {url}"

#     def extract_content(self, html_content):
#         soup = BeautifulSoup(html_content, 'html.parser')
#         for script in soup(["script", "style"]):
#             script.decompose()
#         text = soup.get_text()
#         lines = (line.strip() for line in text.splitlines())
#         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
#         text = '\n'.join(chunk for chunk in chunks if chunk)
#         return text
    
#     def cleanup_session(self, session_id):
#         """Clean up a session"""
#         print(f"Cleaning up session {session_id}")
#         if session_id in self.session_pool:
#             self.crawler.crawler_strategy.kill_session(session_id)
#             del self.session_pool[session_id]

#     def cleanup_expired_sessions(self):
#         """Regular cleanup of expired sessions using proper time calculation"""
#         try:
#             current_time = datetime.now()
#             expired_sessions = []
            
#             for sid, data in self.session_pool.items():
#                 # Calculate time difference in seconds
#                 time_diff = (current_time - data['last_used']).total_seconds()
                
#                 # Check if more than 1 hour (3600 seconds)
#                 if time_diff > 3600:
#                     expired_sessions.append(sid)
            
#             # Cleanup expired sessions
#             for session_id in expired_sessions:
#                 self.cleanup_session(session_id)
                
#         except Exception as e:
#             if self.verbose:
#                 print(f"Error during session cleanup: {str(e)}")
    
#     @staticmethod
#     def is_valid_url(url):
#         try:
#             result = urlparse(url)
#             return all([result.scheme, result.netloc])
#         except ValueError:
#             return False
        
#     @staticmethod
#     def is_html_url(url):
#         return url.endswith(".html") or url.endswith(".htm")
    
class CustomCrawler:
    def __init__(
            self, 
            embed_model: str = "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", 
            max_concurrent_requests: int = 10, 
            verbose: bool = True
        ):
        print(f"🦀 Initializing the crawler") if verbose else None
        time.sleep(1)
        self.embed_model = embed_model
        self.max_concurrent_requests = max_concurrent_requests
        self.verbose = verbose
        self.ua = FakeUserAgent()
        self.semaphore = asyncio.Semaphore(self.max_concurrent_requests)
        self.sessions = {}

        # Intilizing HTML Pruners and Tokenizer
        print(f"🔃 Loading HTML Pruners and Tokenizer with {self.embed_model}") if self.verbose else None
        self.bm25_html_pruner = BM25HTMLPruner()
        self.embed_html_pruner = EmbedHTMLPruner(
            embed_model=self.embed_model,
            local_inference=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.embed_model, 
            use_fast=True, 
            trust_remote_code=True,
            device="cuda" if torch.cuda.is_available() else "cpu"
        )

        # Get the model config and set the max context length for the model
        print(f"🛠️  Getting model configuration for {self.embed_model}") if self.verbose else None
        self.config = AutoConfig.from_pretrained(self.embed_model)
        self.tokenizer.max_seq_length = self.config.max_position_embeddings
        print(f"📏 Setting max context length to {self.tokenizer.max_seq_length}") if self.verbose else None

    async def create_session(self):
        session_id = str(uuid.uuid4())
        timeout = aiohttp.ClientTimeout(total=600) # Set a 10-minute timeout
        connector = aiohttp.TCPConnector(limit=self.max_concurrent_requests) # Connection pool
        self.sessions[session_id] = aiohttp.ClientSession(timeout=timeout, connector=connector)
        print(f"🔗 Created session: {session_id}") if self.verbose else None
        return session_id

    async def close_session(self, session_id):
        session = self.sessions.pop(session_id, None)
        if session:
            await session.close()
            print(f"🌂 Closed session: {session_id}") if self.verbose else None

    async def cleanup_expired_sessions(self, expiration_time: int = 600): # Default 10 minutes
        current_time = time.time()
        expired_sessions = []
        print("🔍 Checking for expired sessions") if self.verbose else None
        for session_id, (session, creation_time) in self.sessions.items():
            if current_time - creation_time > expiration_time:
                expired_sessions.append(session_id)

        for session_id in expired_sessions:
            await self.close_session(session_id)

        print(f"🗑️  Successfully cleaned up all expired sessions") if self.verbose else None

    @make_async
    def html_rag(
            self, 
            query: str, 
            html: str, 
            max_context_length: int = 32000, 
            buffer: int = 2000
        ) -> str:
        if not html:
            raise Exception("No HTML contents provided.")
        
        # Validate HTML structure
        try:
            BeautifulSoup(html, 'html.parser')
        except Exception as e:
            raise Exception(f"Invalid HTML content: {e}")

        prompt_for_retrieval = \
"""Given a query, your task is to retrieve the most relevant passages that answers and/or is relevant to the query.

Query:"""

        self.embed_html_pruner.query_instruction_for_retrieval = prompt_for_retrieval

        print(f"🧹 Pruning HTML for query: {query}") if self.verbose else None
        cleaned_html = clean_html(html)
        block_tree, cleaned_html = build_block_tree(cleaned_html, max_node_words=10)

        block_rankings = self.bm25_html_pruner.calculate_block_rankings(query, cleaned_html, block_tree)
        
        max_context_window = max_context_length - buffer
        pruned_html = self.embed_html_pruner.prune_HTML(
            cleaned_html,
            block_tree,
            block_rankings,
            self.tokenizer,
            max_context_window
        )
        print(f"👍 Successfully pruned HTML for query: {query}") if self.verbose else None
        return pruned_html

    async def fetch_page_contents(
        self,
        urls: List[str],
        query: Optional[str] = None,
        session_id: Optional[str] = None,
        max_attempts: int = 3,
        delay: float = 1.0,
        timeout: float = 10.0,
        return_type: str = "markdown",
        rotate_headers: bool = True,
    ) -> List[Optional[str]]:
        async def fetch_single_page(url, proxies, session=None, query=query):
            for attempt in range(max_attempts):
                print(f"🔍 Attempt {attempt + 1}/{max_attempts}: Fetching content from {url}") if self.verbose else None
                content = await self._fetch_page_contents(
                    url=url,
                    query=query,
                    timeout=timeout,
                    return_type=return_type,
                    rotate_headers=rotate_headers,
                    proxies=proxies,
                    session=session
                )
                
                if content:
                    print(f"✅ Successfully fetched content from {url}") if self.verbose else None
                    return content
                else:
                    if max_attempts > 1:
                        print(f"🚫 Failed to fetch content from {url}. Retrying in {delay} seconds...") if self.verbose else None
                        await asyncio.sleep(delay)

            print(f"🚫 Failed to fetch content from {url} after {max_attempts} attempts.") if self.verbose else None
            return None
        
        proxy_list = self.load_proxies()  # Load proxies from environment variables
        if proxy_list:
            proxies = proxy_list
        else:
            proxies = None

        if not urls:
            raise Exception("No URLs provided!")
            
        if return_type == "fit_markdown" and query is None:
            raise Exception("Query must be provided when return_type is 'fit_markdown'!")

        if session_id:  # Use existing session if provided
            if session_id not in self.sessions:
                raise ValueError(f"Invalid session ID: {session_id}")
            session = self.sessions[session_id]
            tasks = [fetch_single_page(url, proxies, session) for url in urls]  # Pass session to tasks
        else:  # No session handling if session_id is None
            tasks = [fetch_single_page(url, proxies) for url in urls]  # No session passed

        results = await asyncio.gather(*tasks)
        return [result for result in results if result is not None]
    
    async def _fetch_page_contents(
        self, 
        url: str, 
        query: Optional[str] = None,
        timeout: float = 5.0,
        return_type: str = "markdown",
        rotate_headers: bool = True,
        proxies: Optional[List[str]] = None,
        session: Optional[aiohttp.ClientSession] = None
    ) -> Optional[str]:
        async def get_content(response, return_type=return_type):
            print(f"📄 Getting content from {url}") if self.verbose else None
            if return_type == "html":
                return await response.text()

            response.raise_for_status()
            content_type = response.headers.get('Content-Type', '').lower()

            if 'application/pdf' in content_type:
                content = await response.read()
                text = self.extract_text_from_pdf(content)
                return text
            elif 'text/html' in content_type:
                html_content = await response.text()
                if return_type == "fit_markdown":
                    html_content = self.html_rag(query, html_content).wait()

                soup = BeautifulSoup(html_content, "html.parser")
                for script_or_style in soup(["script", "style"]):
                    script_or_style.decompose()
                text = self.html_to_markdown(soup)
                return text.strip()
            else:
                print(f"🚫 Unsupported content type {content_type} for URL {url}") if self.verbose else None
                return None

        headers = self.get_headers() if rotate_headers else {}
        proxy = self.get_proxy(proxies) if proxies else None

        # Total connection timeout
        timeout_config = aiohttp.ClientTimeout(total=timeout)

        try:
            # Use provided session if available
            if session:
                async with session.get(url, proxy=proxy, timeout=timeout_config, headers=headers) as response:
                    return await get_content(response)
            # Otherwise, create a new session for each request
            else:
                async with aiohttp.ClientSession() as new_session:
                    async with new_session.get(url, proxy=proxy, timeout=timeout_config, headers=headers) as response:
                        return await get_content(response)

        except aiohttp.ClientError as e:
            print(f"🚫 Request Exception for {url}: {e}") if self.verbose else None
            return None
        except asyncio.TimeoutError as e:
            print(f"🚫 Timeout error for {url}") if self.verbose else None
            return None
        except Exception as e:
            print(f"🚫 Unexpected error fetching {url}: {e}") if self.verbose else None
            return None
        
    def load_proxies(self) -> Optional[List[str]]:
        # Get all environment variables
        env_vars = dict(os.environ)

        # Load proxies from environment variables
        proxy_pattern = re.compile(r"PROXY_\d+")
        proxies = [env_vars[key] for key in env_vars if proxy_pattern.match(key)]

        if proxies:
            print(f"🔌 Loaded {len(proxies)} proxies from environment variables") if self.verbose else None
            return proxies
        else:
            return None
    
    def get_proxy(self, proxies: List[str]) -> str:
        if proxies:  # Check if the proxies list is not empty
            return next(iter(proxies))
        return None  # Or raise an exception, handle differently, etc.

    def get_headers(self) -> Dict[str, str]:
        return {'User-Agent': self.ua.random}

    def extract_text_from_pdf(self, pdf_content: bytes) -> str:
        try:
            print(f"📕 Extracting text from PDF") if self.verbose else None
            pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_content))
            text = ''
            for page in pdf_reader.pages:
                text += page.extract_text()
            print(f"💪 Successfully extracted text from PDF") if self.verbose else None
            return text
        except Exception as e:
            print(f"🚫 Error extracting text from PDF: {e}") if self.verbose else None
            return ""
        
    def html_to_markdown(self, soup):
        markdown_text = ""
        print(f"📟 Converting HTML to Markdown") if self.verbose else None
        def process_element(element, indent=0):
            nonlocal markdown_text

            if isinstance(element, NavigableString):
                text = str(element).strip()
                if text:
                    markdown_text += text + " "
                return

            tag = element.name

            if tag == "h1":
                markdown_text += "# " + element.text.strip() + "\n\n"
            elif tag == "h2":
                markdown_text += "## " + element.text.strip() + "\n\n"
            elif tag == "h3":
                markdown_text += "### " + element.text.strip() + "\n\n"
            elif tag == "h4":
                markdown_text += "#### " + element.text.strip() + "\n\n"
            elif tag == "h5":
                markdown_text += "##### " + element.text.strip() + "\n\n"
            elif tag == "h6":
                markdown_text += "###### " + element.text.strip() + "\n\n"
            elif tag == "p":
                markdown_text += element.text.strip() + "\n\n"
            elif tag == "br":
                markdown_text += "\n"
            elif tag == "ul":
                for li in element.find_all("li", recursive=False):
                    markdown_text += "  " * indent + "- "
                    process_element(li, indent + 1)
                    markdown_text += "\n"
                markdown_text += "\n"
            elif tag == "ol":
                for i, li in enumerate(element.find_all("li", recursive=False), 1):
                    markdown_text += "  " * indent + f"{i}. "
                    process_element(li, indent + 1)
                    markdown_text += "\n"
                markdown_text += "\n"
            elif tag == "table":
                rows = element.find_all("tr")
                for row in rows:
                    cells = row.find_all(["td", "th"])
                    row_text = [cell.text.strip() for cell in cells]
                    markdown_text += "| " + " | ".join(row_text) + " |\n"
                    if row == rows[0]:  # Header row separator
                        markdown_text += "| " + " | ".join(["---"] * len(cells)) + " |\n"
                markdown_text += "\n"
            elif tag == "blockquote":
                markdown_text += "> " + element.text.strip().replace("\n", "\n> ") + "\n\n"
            elif tag == "strong" or tag == "b":
                markdown_text += "**" + element.text.strip() + "**"
            elif tag == "em" or tag == "i":
                markdown_text += "*" + element.text.strip() + "*"
            elif tag == "code":
                markdown_text += "`" + element.text.strip() + "`"
            elif tag == "pre":
                markdown_text += "```\n" + element.text + "\n```\n\n"
            elif tag == "hr":
                markdown_text += "---\n\n"
            else:
                for child in element.children:
                    process_element(child, indent)

        process_element(soup)
        print(f"👌 Successfully converted HTML to Markdown") if self.verbose else None
        return markdown_text
    
if __name__ == "__main__":
    import time
    import winloop

    URLS = [
        "https://en.wikipedia.org/wiki/Treaty_Principles_Bill#:~:text=The%20Treaty%20Principles%20Bill%2C%20or,of%20the%20Treaty%20of%20Waitangi.",
        "https://www.parliament.nz/en/pb/sc/make-a-submission/document/54SCJUST_SCF_227E6D0B-E632-42EB-CFFE-08DCFEB826C6/principles-of-the-treaty-of-waitangi-bill",
        "https://en.wikipedia.org/wiki/Waitangi_Tribunal",
        "https://aljazeera.com/news/2024/11/19/why-are-new-zealands-maori-protesting-over-colonial-era-treaty-bill",
        "https://downiewenjack.ca/treaty-of-waitangi-treaty-principles-bill/"
    ]#  * 10   # Make 50 requests

    query = "What is the Treaty of Waitangi Bill?"
    loop = asyncio.get_event_loop()
    custom_crawler = CustomCrawler(max_concurrent_requests=1000)
    session_id = loop.run_until_complete(custom_crawler.create_session())
    start = time.perf_counter()
    winloop.install()
    result = loop.run_until_complete(custom_crawler.fetch_page_contents(
            URLS, 
            query,
            session_id=session_id, 
            timeout=20,
            max_attempts=1,
            return_type="fit_markdown",
        )
    )
    end = time.perf_counter()
    loop.run_until_complete(custom_crawler.close_session(session_id))
    loop.run_until_complete(custom_crawler.cleanup_expired_sessions())
    print("\n\n".join([f"Document {i+1}:\n\n{result[i]}" for i in range(len(result))]))  
    print(f"\n\nTime taken: {end - start} seconds")