Cédric KACZMAREK
first commit
70b87af
"""Simple Web scraper."""
from typing import List, Optional, Dict, Callable
import requests
from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.schema import Document
class SimpleWebPageReader(BasePydanticReader):
    """Simple web page reader.

    Fetches each URL with an HTTP GET and wraps the response body in a
    ``Document``.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires `html2text` package.
        metadata_fn (Optional[Callable[[str], Dict]]): A function that takes in
            a URL and returns a dictionary of metadata.
            Default is None.

    """

    is_remote: bool = True
    html_to_text: bool

    _metadata_fn: Optional[Callable[[str], Dict]] = PrivateAttr()

    def __init__(
        self,
        html_to_text: bool = False,
        metadata_fn: Optional[Callable[[str], Dict]] = None,
    ) -> None:
        """Initialize with parameters.

        Raises:
            ImportError: If ``html_to_text`` is True and the `html2text`
                package is not installed.
        """
        # Only require the optional dependency when it will actually be used,
        # so plain-HTML readers work without `html2text` installed.
        if html_to_text:
            try:
                import html2text  # noqa
            except ImportError as err:
                raise ImportError(
                    "`html2text` package not found, please run `pip install html2text`"
                ) from err

        # The base initializer must run before private attributes are
        # assigned: pydantic sets up private-attribute storage during
        # model initialization.
        super().__init__(html_to_text=html_to_text)
        self._metadata_fn = metadata_fn

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "SimpleWebPageReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the given URLs.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: One document per URL, in input order. Each
                document uses the URL as its id and carries the metadata
                returned by ``metadata_fn`` (empty when none was given).

        Raises:
            ValueError: If ``urls`` is not a list.

        """
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []
        for url in urls:
            # NOTE(review): no timeout is passed, so a stalled server can
            # block this call indefinitely; HTTP error statuses are not
            # checked either, so error pages are returned as documents.
            response = requests.get(url, headers=None).text
            if self.html_to_text:
                import html2text

                response = html2text.html2text(response)

            metadata: Optional[Dict] = None
            if self._metadata_fn is not None:
                metadata = self._metadata_fn(url)

            documents.append(Document(text=response, id_=url, metadata=metadata or {}))

        return documents