Spaces:

rizoa-auchan-hack
/

hack

Sleeping

hack/llama_index/readers/web/simple_web/base.py

Cédric KACZMAREK

first commit

70b87af over 1 year ago

2.19 kB

	"""Simple Web scraper."""
	from typing import List, Optional, Dict, Callable

	import requests

	from llama_index.core.bridge.pydantic import PrivateAttr
	from llama_index.core.readers.base import BasePydanticReader
	from llama_index.core.schema import Document


	class SimpleWebPageReader(BasePydanticReader):
	    """Simple web page reader.

	    Fetches each URL with an HTTP GET and wraps the response body in a
	    ``Document`` whose id is the URL.

	    Args:
	        html_to_text (bool): Whether to convert HTML to text.
	            Requires `html2text` package.
	        metadata_fn (Optional[Callable[[str], Dict]]): A function that takes in
	            a URL and returns a dictionary of metadata.
	            Default is None.
	        timeout (Optional[float]): Seconds to wait for each HTTP request
	            before giving up. ``None`` waits indefinitely. Default is 60.
	    """

	    is_remote: bool = True
	    html_to_text: bool

	    _metadata_fn: Optional[Callable[[str], Dict]] = PrivateAttr()
	    _timeout: Optional[float] = PrivateAttr()

	    def __init__(
	        self,
	        html_to_text: bool = False,
	        metadata_fn: Optional[Callable[[str], Dict]] = None,
	        timeout: Optional[float] = 60,
	    ) -> None:
	        """Initialize with parameters.

	        Raises:
	            ImportError: If ``html_to_text`` is True and the `html2text`
	                package is not installed.
	        """
	        # html2text is an optional dependency: only demand it when the
	        # caller actually asked for HTML-to-text conversion.
	        if html_to_text:
	            try:
	                import html2text  # noqa
	            except ImportError as err:
	                raise ImportError(
	                    "`html2text` package not found, please run `pip install html2text`"
	                ) from err

	        # The pydantic base initializer must run before private attributes
	        # are assigned; assigning first fails under pydantic v2.
	        super().__init__(html_to_text=html_to_text)
	        self._metadata_fn = metadata_fn
	        self._timeout = timeout

	    @classmethod
	    def class_name(cls) -> str:
	        """Return the name used to identify this reader class."""
	        return "SimpleWebPageReader"

	    def load_data(self, urls: List[str]) -> List[Document]:
	        """Load data from the given URLs.

	        Args:
	            urls (List[str]): List of URLs to scrape.

	        Returns:
	            List[Document]: One document per URL, in input order.

	        Raises:
	            ValueError: If ``urls`` is not a list.

	        """
	        if not isinstance(urls, list):
	            raise ValueError("urls must be a list of strings.")

	        documents = []
	        for url in urls:
	            # Bound the wait so a single unresponsive host cannot hang the
	            # whole load. NOTE: the HTTP status code is not checked here;
	            # the body of an error response is stored like any other page.
	            page_text = requests.get(url, timeout=self._timeout).text

	            if self.html_to_text:
	                # Imported lazily so the package stays optional.
	                import html2text

	                page_text = html2text.html2text(page_text)

	            metadata: Optional[Dict] = None
	            if self._metadata_fn is not None:
	                metadata = self._metadata_fn(url)

	            documents.append(
	                Document(text=page_text, id_=url, metadata=metadata or {})
	            )

	        return documents