import urllib.request
import xml.etree.ElementTree as ET
from typing import List, Optional

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_index.readers.web.async_web.base import AsyncWebPageReader


class SitemapReader(BaseReader):
    """Asynchronous sitemap reader for web.

    Reads pages from the web based on their sitemap.xml.

    Args:
        sitemap_url (string): Path to the sitemap.xml. e.g. https://gpt-index.readthedocs.io/sitemap.xml
        html_to_text (bool): Whether to convert HTML to text.
            Requires `html2text` package.
        limit (int): Maximum number of concurrent requests.
    """

    # XML namespace used by sitemap.org-compliant sitemaps.
    xml_schema_sitemap = "http://www.sitemaps.org/schemas/sitemap/0.9"

    def __init__(self, html_to_text: bool = False, limit: int = 10) -> None:
        """Initialize with parameters."""
        self._async_loader = AsyncWebPageReader(html_to_text=html_to_text, limit=limit)
        self._html_to_text = html_to_text
        self._limit = limit

    def _load_sitemap(self, sitemap_url: str) -> bytes:
        """Fetch the raw sitemap document from `sitemap_url`.

        Returns:
            bytes: The raw XML payload of the sitemap.
        """
        # Context manager ensures the HTTP connection is closed even if
        # read() raises (the original leaked the response object).
        with urllib.request.urlopen(sitemap_url) as response:
            return response.read()

    def _parse_sitemap(self, raw_sitemap: bytes, filter_locs: Optional[str] = None) -> list:
        """Extract page URLs from a sitemap XML document.

        Args:
            raw_sitemap: Raw sitemap XML (bytes or str accepted by
                ElementTree.fromstring).
            filter_locs: If given, only URLs containing this substring
                are returned.

        Returns:
            list: The `<loc>` URLs found in the sitemap.
        """
        sitemap = ET.fromstring(raw_sitemap)
        sitemap_urls = []
        for url in sitemap.findall(f"{{{self.xml_schema_sitemap}}}url"):
            loc = url.find(f"{{{self.xml_schema_sitemap}}}loc")
            # Skip malformed <url> entries with no <loc> child (or an empty
            # one) instead of crashing with AttributeError on `.text`.
            if loc is None or loc.text is None:
                continue
            location = loc.text
            if filter_locs is None or filter_locs in location:
                sitemap_urls.append(location)
        return sitemap_urls

    def load_data(self, sitemap_url: str, filter: Optional[str] = None) -> List[Document]:
        """Load a Document for every page listed in the sitemap.

        Args:
            sitemap_url: URL of the sitemap.xml to read.
            filter: Optional substring; only page URLs containing it are
                loaded. (Name kept for backward compatibility despite
                shadowing the builtin.)

        Returns:
            List[Document]: One document per fetched page.
        """
        sitemap = self._load_sitemap(sitemap_url=sitemap_url)
        sitemap_urls = self._parse_sitemap(sitemap, filter)
        return self._async_loader.load_data(urls=sitemap_urls)