from typing import Any, Dict, List, Optional

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


class KnowledgeBaseWebReader(BaseReader):
    """Knowledge base reader.

    Crawls and reads articles from a knowledge base/help center with Playwright.
    Tested on Zendesk and Intercom CMSes; it may work on others.
    It can run in headless mode, but Cloudflare may block it; run it headed to be safe.
    It times out occasionally; if it does, increase the default timeout.
    Requires the `playwright` package.

    Args:
        root_url (str): the base url of the knowledge base, with no trailing slash
            e.g. 'https://support.intercom.com'
        link_selectors (List[str]): list of css selectors to find links to articles while crawling
            e.g. ['.article-list a']
        article_path (str): the url path of articles on this domain so the crawler knows when to stop
            e.g. '/articles'
        title_selector (Optional[str]): css selector to find the title of the article
            e.g. '.article-title'
        subtitle_selector (Optional[str]): css selector to find the subtitle/description of the article
            e.g. '.article-subtitle'
        body_selector (Optional[str]): css selector to find the body of the article
            e.g. '.article-body'
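
    Example:
        A minimal usage sketch; the selectors below are hypothetical and must
        be adapted to the markup of the target help center::

            reader = KnowledgeBaseWebReader(
                root_url='https://support.intercom.com',
                link_selectors=['.article-list a'],
                article_path='/articles',
                title_selector='.article-title',
                body_selector='.article-body',
            )
            documents = reader.load_data()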
""" | |
    def __init__(
        self,
        root_url: str,
        link_selectors: List[str],
        article_path: str,
        title_selector: Optional[str] = None,
        subtitle_selector: Optional[str] = None,
        body_selector: Optional[str] = None,
    ) -> None:
        """Initialize with parameters."""
        self.root_url = root_url
        self.link_selectors = link_selectors
        self.article_path = article_path
        self.title_selector = title_selector
        self.subtitle_selector = subtitle_selector
        self.body_selector = body_selector

    def load_data(self) -> List[Document]:
        """Load data from the knowledge base."""
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            # Headed mode is less likely to be blocked by Cloudflare.
            browser = p.chromium.launch(headless=False)

            # Crawl
            article_urls = self.get_article_urls(
                browser,
                self.root_url,
                self.root_url,
            )

            # Scrape
            documents = []
            for url in article_urls:
                article = self.scrape_article(
                    browser,
                    url,
                )
                extra_info = {
                    "title": article["title"],
                    "subtitle": article["subtitle"],
                    "url": article["url"],
                }
                documents.append(
                    Document(text=article["body"], extra_info=extra_info)
                )

            browser.close()

            return documents

    def scrape_article(
        self,
        browser: Any,
        url: str,
    ) -> Dict[str, str]:
        """Scrape a single article url.

        Args:
            browser (Any): a Playwright Chromium browser.
            url (str): URL of the article to scrape.

        Returns:
            Dict[str, str]: a mapping of article attributes to their values.
        """
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(url, wait_until="domcontentloaded")

        def extract_text(selector: Optional[str]) -> str:
            # query_selector returns None when nothing matches, so guard
            # against missing elements instead of raising AttributeError.
            if not selector:
                return ""
            node = page.query_selector(selector)
            return node.evaluate("node => node.innerText") if node else ""

        title = extract_text(self.title_selector)
        subtitle = extract_text(self.subtitle_selector)
        body = extract_text(self.body_selector)

        page.close()
        print("scraped:", url)
        return {"title": title, "subtitle": subtitle, "body": body, "url": url}

    def get_article_urls(
        self, browser: Any, root_url: str, current_url: str
    ) -> List[str]:
        """Recursively crawl through the knowledge base to find a list of articles.

        Args:
            browser (Any): a Playwright Chromium browser.
            root_url (str): root URL of the knowledge base.
            current_url (str): current URL that is being crawled.

        Returns:
            List[str]: a list of URLs of found articles.
        """
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(current_url, wait_until="domcontentloaded")

        # If this is a leaf node, aka an article page, return itself
        if self.article_path in current_url:
            print("Found an article:", current_url)
            page.close()
            return [current_url]

        # Otherwise crawl this page and find all the articles linked from it
        article_urls = []
        links = []

        for link_selector in self.link_selectors:
            ahrefs = page.query_selector_all(link_selector)
            links.extend(ahrefs)

        for link in links:
            # Hrefs are assumed to be paths relative to root_url; skip
            # elements that have no href at all.
            href = link.get_attribute("href")
            if href is None:
                continue
            url = root_url + href
            article_urls.extend(self.get_article_urls(browser, root_url, url))

        page.close()

        return article_urls
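

# A minimal usage sketch, assuming an Intercom-style help center; the URL and
# selectors below are hypothetical and must be adapted to the target site.
if __name__ == "__main__":
    reader = KnowledgeBaseWebReader(
        root_url="https://support.intercom.com",  # hypothetical root url
        link_selectors=[".article-list a"],  # hypothetical link selector
        article_path="/articles",
        title_selector=".article-title",  # hypothetical
        body_selector=".article-body",  # hypothetical
    )
    documents = reader.load_data()
    for doc in documents:
        # Recent llama_index versions expose extra_info as `metadata`.
        print(doc.metadata["url"], "->", len(doc.text), "chars")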