import asyncio
import logging
from typing import List

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

logger = logging.getLogger(__name__)


class AsyncWebPageReader(BaseReader):
    """Asynchronous web page reader.

    Reads pages from the web asynchronously.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires the `html2text` package.
        limit (int): Maximum number of concurrent requests.
        dedupe (bool): Whether to remove exact-duplicate URLs from the input list.
        fail_on_error (bool): If True, raise a ValueError when a requested URL
            does not return status code 200.
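
    Example:
        Illustrative usage; the URL below is a placeholder and fetching it
        performs a real network request:

        >>> reader = AsyncWebPageReader(html_to_text=True)
        >>> documents = reader.load_data(["https://example.com"])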
"""

    def __init__(
        self,
        html_to_text: bool = False,
        limit: int = 10,
        dedupe: bool = True,
        fail_on_error: bool = False,
    ) -> None:
        """Initialize with parameters."""
        try:
            import html2text  # noqa: F401
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        try:
            import aiohttp  # noqa: F401
        except ImportError:
            raise ImportError(
                "`aiohttp` package not found, please run `pip install aiohttp`"
            )
        self._limit = limit
        self._html_to_text = html_to_text
        self._dedupe = dedupe
        self._fail_on_error = fail_on_error

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the input URLs.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: List of documents.
        """
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        if self._dedupe:
            urls = list(dict.fromkeys(urls))

        import aiohttp

        def chunked_http_client(limit: int):
            # A semaphore caps the number of in-flight requests at `limit`.
            semaphore = asyncio.Semaphore(limit)

            async def http_get(url: str, session: aiohttp.ClientSession):
                async with semaphore:
                    async with session.get(url) as response:
                        return response, await response.text()

            return http_get

        async def fetch_urls(urls: List[str]):
            http_client = chunked_http_client(self._limit)
            async with aiohttp.ClientSession() as session:
                tasks = [http_client(url, session) for url in urls]
                # return_exceptions=True keeps one failed request from cancelling the rest.
                return await asyncio.gather(*tasks, return_exceptions=True)

        documents = []
        responses = asyncio.run(fetch_urls(urls))
        for i, response_tuple in enumerate(responses):
            if not isinstance(response_tuple, tuple):
                # gather() returned an exception here (e.g. a connection error or invalid URL).
                raise ValueError(f"One of the inputs is not a valid url: {urls[i]}")

            response, raw_page = response_tuple
            if response.status != 200:
                logger.warning(f"error fetching page from {urls[i]}")
                logger.info(response)
                if self._fail_on_error:
                    raise ValueError(
                        f"error fetching page from {urls[i]}. server returned status:"
                        f" {response.status} and response {raw_page}"
                    )
                continue

            if self._html_to_text:
                import html2text

                response_text = html2text.html2text(raw_page)
            else:
                response_text = raw_page

            documents.append(
                Document(text=response_text, extra_info={"Source": str(response.url)})
            )

        return documents
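

# Minimal usage sketch (illustrative only): the URLs below are placeholders and
# load_data performs real network requests, so treat this as a rough demonstration
# of the reader rather than part of its implementation.
if __name__ == "__main__":
    reader = AsyncWebPageReader(html_to_text=True, limit=5)
    docs = reader.load_data(
        [
            "https://example.com",
            "https://example.org",
        ]
    )
    for doc in docs:
        # Each document carries its source URL in the metadata set above.
        print(doc.metadata.get("Source"), len(doc.text))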