import asyncio

from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from bs4 import BeautifulSoup
from crewai.tools import BaseTool
class StealthScrapeTool(BaseTool):
    name: str = "Stealth Web Scraper"
    description: str = (
        "A tool for stealthily scraping content from a given URL "
        "using Playwright and a CSS selector."
    )

    async def _arun(self, website_url: str, css_element: str = "body") -> str:
        try:
            # Stealth() patches Playwright to mask common automation fingerprints.
            async with Stealth().use_async(async_playwright()) as p:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()
                await page.goto(website_url, timeout=120000)
                # Wait for the specific element to be present before reading the DOM.
                await page.wait_for_selector(css_element, timeout=60000)
                html_content = await page.content()
                await browser.close()

            soup = BeautifulSoup(html_content, "html.parser")
            target_element = soup.select_one(css_element)
            if target_element:
                return target_element.prettify()
            return f"Error: Could not find element with selector '{css_element}' on the page."
        except Exception as e:
            return f"Error during stealth web scraping: {e}"

    def _run(self, website_url: str, css_element: str = "body") -> str:
        # Playwright is async-first, so the synchronous entry point simply
        # drives the async version. CrewAI typically calls _arun for async tools.
        return asyncio.run(self._arun(website_url, css_element))
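
Once defined, the tool can be attached to an agent like any other CrewAI tool. The snippet below is a minimal sketch of that wiring, not part of the original tool code: the agent role, goal, task text, and the target URL and selector are placeholder values, and it assumes the standard crewai Agent/Task/Crew API with an LLM already configured in the environment.

from crewai import Agent, Task, Crew

# Hypothetical usage example; all names and prompts below are placeholders.
scrape_tool = StealthScrapeTool()

researcher = Agent(
    role="Web Researcher",
    goal="Extract the main content from a target page",
    backstory="Gathers page content for downstream analysis.",
    tools=[scrape_tool],
)

task = Task(
    description="Scrape https://example.com and return the content of the 'main' element.",
    expected_output="The HTML of the requested element.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task])
result = crew.kickoff()
print(result)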