File size: 1,717 Bytes
5b4a293
 
 
 
 
 
 
 
 
 
c1cde73
5b4a293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from bs4 import BeautifulSoup
from crewai.tools import BaseTool

class StealthScrapeTool(BaseTool):
    """CrewAI tool that loads a page in headless Chromium and returns one element's HTML.

    The page is opened through Playwright wrapped by ``playwright_stealth``;
    the rendered HTML is then parsed with BeautifulSoup and the first element
    matching the CSS selector is returned, pretty-printed.

    Failures are reported as ``"Error ..."`` strings instead of being raised.
    """

    name: str = "Stealth Web Scraper"
    description: str = "A tool for stealthily scraping content from a given URL using Playwright and a CSS selector."

    async def _arun(self, website_url: str, css_element: str = "body") -> str:
        """Scrape ``website_url`` and return the first element matching ``css_element``.

        Args:
            website_url: Address of the page to load.
            css_element: CSS selector of the element to extract. It is used
                both for Playwright's wait and for BeautifulSoup's lookup, so
                it should be a selector both understand.

        Returns:
            The pretty-printed HTML of the matched element, or a string
            starting with ``"Error"`` when the element is missing or any
            exception occurs while scraping.
        """
        try:
            async with Stealth().use_async(async_playwright()) as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()

                    # Timeouts are in milliseconds: 120 s for navigation.
                    await page.goto(website_url, timeout=120000)

                    # Block (up to 60 s) until the selector matches, so that
                    # content rendered after load is part of the snapshot.
                    await page.wait_for_selector(css_element, timeout=60000)

                    html_content = await page.content()
                finally:
                    # Release the browser process even when navigation or
                    # the selector wait fails.
                    await browser.close()

            # Parsing needs no live browser, so it happens after cleanup.
            soup = BeautifulSoup(html_content, 'html.parser')
            target_element = soup.select_one(css_element)
            if target_element is None:
                return f"Error: Could not find element with selector '{css_element}' on the page."
            return target_element.prettify()
        except Exception as e:
            # Tool boundary: hand the failure back as text rather than raising.
            return f"Error during stealth web scraping: {e}"

    def _run(self, website_url: str, css_element: str = "body") -> str:
        """Synchronous entry point; drives ``_arun`` to completion.

        Args:
            website_url: Address of the page to load.
            css_element: CSS selector of the element to extract. Defaults to
                ``"body"``, matching ``_arun``.

        Returns:
            Whatever ``_arun`` returns for the same arguments.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop is running in this thread: asyncio.run is safe.
            return asyncio.run(self._arun(website_url, css_element))

        # asyncio.run() refuses to start inside a running loop, so execute
        # the coroutine on a fresh loop in a worker thread and wait for it.
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(
                asyncio.run, self._arun(website_url, css_element)
            )
            return future.result()