import asyncio
import logging
from typing import List

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

logger = logging.getLogger(__name__)

class AsyncWebPageReader(BaseReader):
    """Asynchronous web page reader.

    Reads pages from the web asynchronously.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires the `html2text` package.
        limit (int): Maximum number of concurrent requests.
        dedupe (bool): Whether to remove exact-match duplicate URLs
            from the input list.
        fail_on_error (bool): Whether to raise a ValueError when a
            requested URL does not return status code 200.
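
    Example:
        A minimal usage sketch; the URL is an illustrative placeholder::

            reader = AsyncWebPageReader(html_to_text=True)
            documents = reader.load_data(["https://example.com"])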
""" | |

    def __init__(
        self,
        html_to_text: bool = False,
        limit: int = 10,
        dedupe: bool = True,
        fail_on_error: bool = False,
    ) -> None:
        """Initialize with parameters."""
        try:
            import html2text  # noqa: F401
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        try:
            import aiohttp  # noqa: F401
        except ImportError:
            raise ImportError(
                "`aiohttp` package not found, please run `pip install aiohttp`"
            )
        self._limit = limit
        self._html_to_text = html_to_text
        self._dedupe = dedupe
        self._fail_on_error = fail_on_error

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the input urls.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: List of documents.
        """
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        if self._dedupe:
            # dict.fromkeys drops exact duplicates while preserving order.
            urls = list(dict.fromkeys(urls))

        import aiohttp

        def chunked_http_client(limit: int):
            # A semaphore caps the number of requests in flight at `limit`.
            semaphore = asyncio.Semaphore(limit)

            async def http_get(url: str, session: aiohttp.ClientSession):
                async with semaphore:
                    async with session.get(url) as response:
                        return response, await response.text()

            return http_get

        async def fetch_urls(urls: List[str]):
            http_client = chunked_http_client(self._limit)
            async with aiohttp.ClientSession() as session:
                tasks = [http_client(url, session) for url in urls]
                # return_exceptions=True keeps one failed request from
                # cancelling the rest; failures come back as exception objects.
                return await asyncio.gather(*tasks, return_exceptions=True)

        documents = []
        responses = asyncio.run(fetch_urls(urls))
        for i, response_tuple in enumerate(responses):
            if not isinstance(response_tuple, tuple):
                # asyncio.gather returned an exception object for this URL.
                raise ValueError(
                    f"Error fetching page from {urls[i]}: {response_tuple!r}"
                )
            response, raw_page = response_tuple
            if response.status != 200:
                logger.warning(f"error fetching page from {urls[i]}")
                logger.info(response)
                if self._fail_on_error:
                    raise ValueError(
                        f"error fetching page from {urls[i]}. server returned status:"
                        f" {response.status} and response {raw_page}"
                    )
                continue
            if self._html_to_text:
                import html2text

                response_text = html2text.html2text(raw_page)
            else:
                response_text = raw_page
            documents.append(
                Document(text=response_text, extra_info={"Source": str(response.url)})
            )

        return documents
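

if __name__ == "__main__":
    # Minimal usage sketch, not part of the reader itself: the URLs below
    # are illustrative placeholders, and html_to_text=True assumes the
    # optional `html2text` package is installed.
    reader = AsyncWebPageReader(html_to_text=True, limit=5)
    docs = reader.load_data(
        [
            "https://example.com",
            "https://example.org",
        ]
    )
    for doc in docs:
        print(doc.extra_info["Source"], len(doc.text))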