Spaces:
Running
Running
File size: 1,717 Bytes
5b4a293 c1cde73 5b4a293 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from bs4 import BeautifulSoup
from crewai.tools import BaseTool
class StealthScrapeTool(BaseTool):
    """CrewAI tool that loads a page in stealth-patched headless Chromium.

    The page is rendered with Playwright (wrapped by ``playwright_stealth``),
    and the first element matching a CSS selector is returned as prettified
    HTML. Failures are reported as an error string rather than raised.
    """

    name: str = "Stealth Web Scraper"
    description: str = "A tool for stealthily scraping content from a given URL using Playwright and a CSS selector."

    async def _arun(self, website_url: str, css_element: str = "body") -> str:
        """Fetch ``website_url`` and return the HTML of the selected element.

        Args:
            website_url: Address of the page to open.
            css_element: Selector handed to both Playwright (to wait for the
                element) and BeautifulSoup (to extract it). Defaults to
                ``"body"``.

        Returns:
            The prettified HTML of the first match, or a string starting with
            ``"Error"`` when the element is missing or any step fails.
        """
        try:
            async with Stealth().use_async(async_playwright()) as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()
                    # Timeouts are in milliseconds: 120 s for navigation.
                    await page.goto(website_url, timeout=120000)
                    # Wait (up to 60 s) for the specific element to be present.
                    await page.wait_for_selector(css_element, timeout=60000)
                    html_content = await page.content()
                finally:
                    # Release the browser process even if navigation failed.
                    await browser.close()

            soup = BeautifulSoup(html_content, 'html.parser')
            target_element = soup.select_one(css_element)
            if target_element:
                return target_element.prettify()
            return f"Error: Could not find element with selector '{css_element}' on the page."
        except Exception as e:
            # Tool boundary: report the failure as text instead of raising.
            return f"Error during stealth web scraping: {e}"

    def _run(self, website_url: str, css_element: str = "body") -> str:
        """Synchronous entry point that delegates to :meth:`_arun`.

        ``asyncio.run`` cannot be used from a thread that already has a
        running event loop, so in that case the coroutine is driven on a
        fresh loop in a short-lived worker thread instead.

        Args:
            website_url: Address of the page to open.
            css_element: CSS selector of the element to return. Defaults to
                ``"body"``, matching :meth:`_arun`.

        Returns:
            Whatever :meth:`_arun` returns for the same arguments.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: the simple path is safe.
            return asyncio.run(self._arun(website_url, css_element))

        # Local import keeps this block self-contained.
        import concurrent.futures

        # A loop is already running here; run the coroutine on its own loop
        # in another thread and block until it finishes.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(
                asyncio.run, self._arun(website_url, css_element)
            )
            return future.result()
|