import urllib.request
import xml.etree.ElementTree as ET
from typing import List, Optional
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_index.readers.web.async_web.base import AsyncWebPageReader


class SitemapReader(BaseReader):
    """Asynchronous sitemap reader for web.

    Reads pages from the web based on their sitemap.xml.

    Args:
        sitemap_url (string): Path to the sitemap.xml. e.g. https://gpt-index.readthedocs.io/sitemap.xml
        html_to_text (bool): Whether to convert HTML to text.
            Requires the `html2text` package.
        limit (int): Maximum number of concurrent requests.
    """

    # XML namespace used by sitemaps.org documents; findall/find need it
    # to resolve the `url` and `loc` tags below.
    xml_schema_sitemap = "http://www.sitemaps.org/schemas/sitemap/0.9"

    def __init__(self, html_to_text: bool = False, limit: int = 10) -> None:
        """Initialize with parameters."""
        self._async_loader = AsyncWebPageReader(html_to_text=html_to_text, limit=limit)
        self._html_to_text = html_to_text
        self._limit = limit

    def _load_sitemap(self, sitemap_url: str) -> bytes:
        # Fetch the raw sitemap.xml; urllib returns bytes, which
        # ElementTree.fromstring accepts directly.
        sitemap_url_request = urllib.request.urlopen(sitemap_url)
        return sitemap_url_request.read()

    def _parse_sitemap(self, raw_sitemap: bytes, filter_locs: Optional[str] = None) -> list:
        # Extract every <loc> entry from the sitemap, optionally keeping
        # only locations that contain the `filter_locs` substring.
        sitemap = ET.fromstring(raw_sitemap)
        sitemap_urls = []
        for url in sitemap.findall(f"{{{self.xml_schema_sitemap}}}url"):
            location = url.find(f"{{{self.xml_schema_sitemap}}}loc").text
            if filter_locs is None or filter_locs in location:
                sitemap_urls.append(location)
        return sitemap_urls

    def load_data(self, sitemap_url: str, filter: Optional[str] = None) -> List[Document]:
        # Download and parse the sitemap, then fetch all matching pages
        # concurrently with AsyncWebPageReader.
        sitemap = self._load_sitemap(sitemap_url=sitemap_url)
        sitemap_urls = self._parse_sitemap(sitemap, filter)
        return self._async_loader.load_data(urls=sitemap_urls)
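

# --- Usage sketch (illustrative, not part of the reader) -------------------
# A minimal example of how this reader is typically driven. The sitemap URL
# is the one from the docstring and the `filter` substring is only an
# assumed example; any public sitemap.xml works. `html_to_text=True` requires
# the `html2text` package to be installed.
if __name__ == "__main__":
    reader = SitemapReader(html_to_text=True, limit=5)
    documents = reader.load_data(
        sitemap_url="https://gpt-index.readthedocs.io/sitemap.xml",
        filter="/en/stable/",  # keep only URLs containing this substring
    )
    print(f"Loaded {len(documents)} documents")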