import urllib.request
import xml.etree.ElementTree as ET
from typing import List, Optional
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_index.readers.web.async_web.base import AsyncWebPageReader


class SitemapReader(BaseReader):
    """Asynchronous sitemap reader for web.

    Reads pages from the web based on their sitemap.xml.

    Args:
        sitemap_url (string): Path to the sitemap.xml. e.g. https://gpt-index.readthedocs.io/sitemap.xml
        html_to_text (bool): Whether to convert HTML to text.
            Requires the `html2text` package.
        limit (int): Maximum number of concurrent requests.
    """

    # XML namespace used by sitemaps.org documents; findall/find need it
    # to resolve the `url` and `loc` tags below.
    xml_schema_sitemap = "http://www.sitemaps.org/schemas/sitemap/0.9"

    def __init__(self, html_to_text: bool = False, limit: int = 10) -> None:
        """Initialize with parameters."""
        self._async_loader = AsyncWebPageReader(html_to_text=html_to_text, limit=limit)
        self._html_to_text = html_to_text
        self._limit = limit

    def _load_sitemap(self, sitemap_url: str) -> bytes:
        # Fetch the raw sitemap.xml; urllib returns bytes, which
        # ElementTree.fromstring accepts directly.
        sitemap_url_request = urllib.request.urlopen(sitemap_url)
        return sitemap_url_request.read()

    def _parse_sitemap(self, raw_sitemap: bytes, filter_locs: Optional[str] = None) -> list:
        # Extract every <loc> entry from the sitemap, optionally keeping
        # only locations that contain the `filter_locs` substring.
        sitemap = ET.fromstring(raw_sitemap)
        sitemap_urls = []
        for url in sitemap.findall(f"{{{self.xml_schema_sitemap}}}url"):
            location = url.find(f"{{{self.xml_schema_sitemap}}}loc").text
            if filter_locs is None or filter_locs in location:
                sitemap_urls.append(location)
        return sitemap_urls

    def load_data(self, sitemap_url: str, filter: Optional[str] = None) -> List[Document]:
        # Download and parse the sitemap, then fetch all matching pages
        # concurrently with AsyncWebPageReader.
        sitemap = self._load_sitemap(sitemap_url=sitemap_url)
        sitemap_urls = self._parse_sitemap(sitemap, filter)
        return self._async_loader.load_data(urls=sitemap_urls)
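

# --- Usage sketch (illustrative, not part of the reader) -------------------
# A minimal example of how this reader is typically driven. The sitemap URL
# is the one from the docstring and the `filter` substring is only an
# assumed example; any public sitemap.xml works. `html_to_text=True` requires
# the `html2text` package to be installed.
if __name__ == "__main__":
    reader = SitemapReader(html_to_text=True, limit=5)
    documents = reader.load_data(
        sitemap_url="https://gpt-index.readthedocs.io/sitemap.xml",
        filter="/en/stable/",  # keep only URLs containing this substring
    )
    print(f"Loaded {len(documents)} documents")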