import asyncio
from concurrent.futures import ThreadPoolExecutor

from bs4 import BeautifulSoup
from crewai.tools import BaseTool
from playwright.async_api import async_playwright
from playwright_stealth import Stealth

# Timeouts are in milliseconds, as expected by the Playwright calls below.
_NAVIGATION_TIMEOUT_MS = 120000
_SELECTOR_TIMEOUT_MS = 60000


class StealthScrapeTool(BaseTool):
    """CrewAI tool that loads a page in stealth-patched headless Chromium.

    The page is rendered with Playwright (wrapped by ``playwright_stealth``),
    then the element matching a CSS selector is extracted with BeautifulSoup
    and returned as prettified HTML. Failures are reported as strings starting
    with ``"Error"`` rather than raised.
    """

    name: str = "Stealth Web Scraper"
    description: str = "A tool for stealthily scraping content from a given URL using Playwright and a CSS selector."

    async def _arun(self, website_url: str, css_element: str = "body") -> str:
        """Fetch ``website_url`` and return the first element matching ``css_element``.

        Args:
            website_url: Address of the page to open.
            css_element: CSS selector of the element to extract. Defaults to
                ``"body"``.

        Returns:
            The prettified HTML of the first matching element, or an error
            message string if the element is not found or any step fails.
        """
        try:
            async with Stealth().use_async(async_playwright()) as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()
                    await page.goto(website_url, timeout=_NAVIGATION_TIMEOUT_MS)
                    # No explicit ``state`` is passed, so this waits for
                    # Playwright's default selector state (``visible``).
                    await page.wait_for_selector(
                        css_element, timeout=_SELECTOR_TIMEOUT_MS
                    )
                    html_content = await page.content()
                finally:
                    # Close the browser even when navigation or the wait
                    # fails, so a failed scrape does not leave it running.
                    await browser.close()

            # Parsing needs only the HTML snapshot, so it runs after the
            # browser has been released.
            soup = BeautifulSoup(html_content, 'html.parser')
            target_element = soup.select_one(css_element)
            if target_element:
                return target_element.prettify()
            return f"Error: Could not find element with selector '{css_element}' on the page."
        except Exception as e:
            # Tool boundary: report the failure as text instead of raising.
            return f"Error during stealth web scraping: {e}"

    def _run(self, website_url: str, css_element: str = "body") -> str:
        """Synchronous entry point; drives ``_arun`` to completion.

        Args:
            website_url: Address of the page to open.
            css_element: CSS selector of the element to extract. Defaults to
                ``"body"``, matching ``_arun``.

        Returns:
            Whatever ``_arun`` returns for the same arguments.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop is running in this thread: asyncio.run is safe.
            return asyncio.run(self._arun(website_url, css_element))

        # An event loop is already running in this thread, where asyncio.run
        # would raise RuntimeError. Run the coroutine on a fresh loop in a
        # worker thread and block until it finishes.
        with ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(
                asyncio.run, self._arun(website_url, css_element)
            )
            return future.result()