nat-ad / stealth_scrape_tool.py
ibombonato's picture
feat: add discount pct and change resumo for opiniao dos usuarios
c1cde73 verified
raw
history blame
1.72 kB
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from bs4 import BeautifulSoup
from crewai.tools import BaseTool
class StealthScrapeTool(BaseTool):
    """CrewAI tool that loads a page in headless Chromium and returns one element's HTML.

    The page is opened through Playwright wrapped with ``playwright_stealth``,
    the tool waits for the requested CSS selector, and the first matching
    element is returned as prettified HTML. Failures are reported as an
    ``"Error ..."`` string rather than raised, so the caller always receives
    text.
    """

    name: str = "Stealth Web Scraper"
    description: str = "A tool for stealthily scraping content from a given URL using Playwright and a CSS selector."

    async def _arun(self, website_url: str, css_element: str = "body") -> str:
        """Scrape ``website_url`` and return the element matching ``css_element``.

        Args:
            website_url: Address of the page to open.
            css_element: CSS selector of the element to extract; defaults to
                the whole ``body``.

        Returns:
            The prettified HTML of the first element matching the selector,
            or a string starting with ``"Error"`` when the element is not
            found or any exception occurs while scraping.
        """
        try:
            async with Stealth().use_async(async_playwright()) as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()
                    # Timeouts are in milliseconds: 120 s to navigate,
                    # 60 s for the selector to show up.
                    await page.goto(website_url, timeout=120000)
                    # Wait for the specific element to be present
                    await page.wait_for_selector(css_element, timeout=60000)
                    html_content = await page.content()
                finally:
                    # Always release the browser, including when navigation
                    # or the selector wait fails or times out.
                    await browser.close()

            soup = BeautifulSoup(html_content, 'html.parser')
            target_element = soup.select_one(css_element)
            if target_element:
                return target_element.prettify()
            return f"Error: Could not find element with selector '{css_element}' on the page."
        except Exception as e:
            # Deliberate catch-all: the failure is handed back as text
            # instead of propagating to the caller.
            return f"Error during stealth web scraping: {e}"

    def _run(self, website_url: str, css_element: str = "body") -> str:
        """Synchronous entry point; runs :meth:`_arun` to completion.

        Args:
            website_url: Address of the page to open.
            css_element: CSS selector of the element to extract; defaults to
                ``"body"``, matching :meth:`_arun`.

        Returns:
            Whatever :meth:`_arun` returns (HTML or an error string).
        """
        # This method is for synchronous execution, which is not ideal for Playwright.
        # CrewAI typically calls _arun for async tools.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop is running in this thread, so asyncio.run is safe.
            return asyncio.run(self._arun(website_url, css_element))

        # asyncio.run() raises RuntimeError when a loop is already running in
        # this thread, so run the coroutine on a worker thread that gets its
        # own event loop. The calling thread blocks until the result is ready.
        from concurrent.futures import ThreadPoolExecutor

        def _scrape_in_new_loop() -> str:
            return asyncio.run(self._arun(website_url, css_element))

        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(_scrape_in_new_loop).result()