from os import getenv
from typing import List, Optional

from phi.tools import Toolkit
from phi.utils.log import logger

try:
    from apify_client import ApifyClient
except ImportError:
    raise ImportError("`apify_client` not installed. Please install using `pip install apify-client`")
class ApifyTools(Toolkit):
    def __init__(
        self,
        api_key: Optional[str] = None,
        website_content_crawler: bool = True,
        web_scraper: bool = False,
    ):
        super().__init__(name="apify_tools")

        self.api_key = api_key or getenv("MY_APIFY_TOKEN")
        if not self.api_key:
            logger.error("No Apify API key provided")

        # Register only the tools that were enabled
        if website_content_crawler:
            self.register(self.website_content_crawler)
        if web_scraper:
            self.register(self.web_scraper)
    def website_content_crawler(self, urls: List[str], timeout: Optional[int] = 60) -> str:
        """
        Crawls a website using Apify's website-content-crawler actor.

        :param urls: The URLs to crawl.
        :param timeout: The timeout for the crawling.

        :return: The results of the crawling.
        """
        if self.api_key is None:
            return "No API key provided"
        if urls is None:
            return "No URLs provided"

        client = ApifyClient(self.api_key)

        logger.debug(f"Crawling URLs: {urls}")

        # The actor expects start URLs as a list of {"url": ...} objects
        formatted_urls = [{"url": url} for url in urls]
        run_input = {"startUrls": formatted_urls}

        run = client.actor("apify/website-content-crawler").call(run_input=run_input, timeout_secs=timeout)

        # Collect the crawled text from the run's default dataset
        results: str = ""
        for item in client.dataset(run["defaultDatasetId"]).iterate_items():
            results += "Results for URL: " + item.get("url") + "\n"
            results += item.get("text") + "\n"

        return results
    def web_scraper(self, urls: List[str], timeout: Optional[int] = 60) -> str:
        """
        Scrapes a website using Apify's web-scraper actor.

        :param urls: The URLs to scrape.
        :param timeout: The timeout for the scraping.

        :return: The results of the scraping.
        """
        if self.api_key is None:
            return "No API key provided"
        if urls is None:
            return "No URLs provided"

        client = ApifyClient(self.api_key)

        logger.debug(f"Scraping URLs: {urls}")

        # The actor expects start URLs as a list of {"url": ...} objects
        formatted_urls = [{"url": url} for url in urls]

        # pageFunction runs in the browser context for every page the actor visits
        page_function_string = """
            async function pageFunction(context) {
                const $ = context.jQuery;
                const pageTitle = $('title').first().text();
                const h1 = $('h1').first().text();
                const first_h2 = $('h2').first().text();
                const random_text_from_the_page = $('p').first().text();

                context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);

                return {
                    url: context.request.url,
                    pageTitle,
                    h1,
                    first_h2,
                    random_text_from_the_page
                };
            }
        """

        run_input = {
            "pageFunction": page_function_string,
            "startUrls": formatted_urls,
        }

        run = client.actor("apify/web-scraper").call(run_input=run_input, timeout_secs=timeout)

        # Collect the fields returned by pageFunction from the run's default dataset
        results: str = ""
        for item in client.dataset(run["defaultDatasetId"]).iterate_items():
            results += "Results for URL: " + item.get("url") + "\n"
            results += item.get("pageTitle") + "\n"
            results += item.get("h1") + "\n"
            results += item.get("first_h2") + "\n"
            results += item.get("random_text_from_the_page") + "\n"

        return results
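

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the toolkit): shows how the two
# tools can be called directly. It assumes the MY_APIFY_TOKEN environment
# variable holds a valid Apify API token and that the example URL is
# reachable; adjust both for your own setup.
if __name__ == "__main__":
    tools = ApifyTools(website_content_crawler=True, web_scraper=True)

    # Crawl a page and print the extracted text
    print(tools.website_content_crawler(urls=["https://docs.apify.com/"], timeout=120))

    # Scrape the same page with the web-scraper actor and print the fields
    # returned by the pageFunction defined above
    print(tools.web_scraper(urls=["https://docs.apify.com/"], timeout=120))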