from os import getenv
from typing import List, Optional

from phi.tools import Toolkit
from phi.utils.log import logger

try:
    from apify_client import ApifyClient
except ImportError:
    raise ImportError("`apify_client` not installed. Please install using `pip install apify-client`")
class ApifyTools(Toolkit):
    def __init__(
        self,
        api_key: Optional[str] = None,
        website_content_crawler: bool = True,
        web_scraper: bool = False,
    ):
        super().__init__(name="apify_tools")

        self.api_key = api_key or getenv("MY_APIFY_TOKEN")
        if not self.api_key:
            logger.error("No Apify API key provided")

        # Register only the tools that were enabled
        if website_content_crawler:
            self.register(self.website_content_crawler)
        if web_scraper:
            self.register(self.web_scraper)
    def website_content_crawler(self, urls: List[str], timeout: Optional[int] = 60) -> str:
        """
        Crawls a website using Apify's website-content-crawler actor.

        :param urls: The URLs to crawl.
        :param timeout: The timeout for the crawling.

        :return: The results of the crawling.
        """
        if self.api_key is None:
            return "No API key provided"
        if urls is None:
            return "No URLs provided"

        client = ApifyClient(self.api_key)

        logger.debug(f"Crawling URLs: {urls}")

        # The actor expects start URLs as a list of {"url": ...} objects
        formatted_urls = [{"url": url} for url in urls]
        run_input = {"startUrls": formatted_urls}

        run = client.actor("apify/website-content-crawler").call(run_input=run_input, timeout_secs=timeout)

        # Collect the crawled text from the run's default dataset
        results: str = ""
        for item in client.dataset(run["defaultDatasetId"]).iterate_items():
            results += "Results for URL: " + item.get("url") + "\n"
            results += item.get("text") + "\n"

        return results
    def web_scraper(self, urls: List[str], timeout: Optional[int] = 60) -> str:
        """
        Scrapes a website using Apify's web-scraper actor.

        :param urls: The URLs to scrape.
        :param timeout: The timeout for the scraping.

        :return: The results of the scraping.
        """
        if self.api_key is None:
            return "No API key provided"
        if urls is None:
            return "No URLs provided"

        client = ApifyClient(self.api_key)

        logger.debug(f"Scraping URLs: {urls}")

        # The actor expects start URLs as a list of {"url": ...} objects
        formatted_urls = [{"url": url} for url in urls]

        # pageFunction runs in the browser context for every page the actor visits
        page_function_string = """
            async function pageFunction(context) {
                const $ = context.jQuery;
                const pageTitle = $('title').first().text();
                const h1 = $('h1').first().text();
                const first_h2 = $('h2').first().text();
                const random_text_from_the_page = $('p').first().text();

                context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);

                return {
                    url: context.request.url,
                    pageTitle,
                    h1,
                    first_h2,
                    random_text_from_the_page
                };
            }
        """

        run_input = {
            "pageFunction": page_function_string,
            "startUrls": formatted_urls,
        }

        run = client.actor("apify/web-scraper").call(run_input=run_input, timeout_secs=timeout)

        # Collect the fields returned by pageFunction from the run's default dataset
        results: str = ""
        for item in client.dataset(run["defaultDatasetId"]).iterate_items():
            results += "Results for URL: " + item.get("url") + "\n"
            results += item.get("pageTitle") + "\n"
            results += item.get("h1") + "\n"
            results += item.get("first_h2") + "\n"
            results += item.get("random_text_from_the_page") + "\n"

        return results
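

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the toolkit): shows how the two
# tools can be called directly. It assumes the MY_APIFY_TOKEN environment
# variable holds a valid Apify API token and that the example URL is
# reachable; adjust both for your own setup.
if __name__ == "__main__":
    tools = ApifyTools(website_content_crawler=True, web_scraper=True)

    # Crawl a page and print the extracted text
    print(tools.website_content_crawler(urls=["https://docs.apify.com/"], timeout=120))

    # Scrape the same page with the web-scraper actor and print the fields
    # returned by the pageFunction defined above
    print(tools.web_scraper(urls=["https://docs.apify.com/"], timeout=120))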