"""Beautiful Soup Web scraper."""
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import urljoin
from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.schema import Document
logger = logging.getLogger(__name__)
def _substack_reader(soup: Any, **kwargs) -> Tuple[str, Dict[str, Any]]:
"""Extract text from Substack blog post."""
extra_info = {
"Title of this Substack post": soup.select_one("h1.post-title").getText(),
"Subtitle": soup.select_one("h3.subtitle").getText(),
"Author": soup.select_one("span.byline-names").getText(),
}
text = soup.select_one("div.available-content").getText()
return text, extra_info
def _readthedocs_reader(soup: Any, url: str, **kwargs) -> Tuple[str, Dict[str, Any]]:
"""Extract text from a ReadTheDocs documentation site."""
import requests
from bs4 import BeautifulSoup
links = soup.find_all("a", {"class": "reference internal"})
    rtd_links = [link["href"] for link in links]
    # Resolve relative links against the page URL.
    rtd_links = [
        link if link.startswith("http") else urljoin(url, link) for link in rtd_links
    ]
texts = []
for doc_link in rtd_links:
page_link = requests.get(doc_link)
soup = BeautifulSoup(page_link.text, "html.parser")
        # find() returns None when the page has no main content region, so
        # guard against that instead of relying on an exception (the original
        # `except IndexError` could never catch the resulting AttributeError).
        main_content = soup.find(attrs={"role": "main"})
        text = main_content.get_text() if main_content is not None else None
        if text:
            texts.append("\n".join([t for t in text.split("\n") if t]))
return "\n".join(texts), {}
def _readmedocs_reader(
soup: Any, url: str, include_url_in_text: bool = True
) -> Tuple[str, Dict[str, Any]]:
"""Extract text from a ReadMe documentation site."""
import requests
from bs4 import BeautifulSoup
links = soup.find_all("a")
    # Anchors without an href attribute would raise a KeyError, so filter
    # them out before collecting links.
    docs_links = [
        link["href"]
        for link in links
        if link.has_attr("href") and "/docs/" in link["href"]
    ]
    docs_links = list(set(docs_links))
    # Resolve relative links against the page URL.
    docs_links = [
        link if link.startswith("http") else urljoin(url, link) for link in docs_links
    ]
texts = []
for doc_link in docs_links:
page_link = requests.get(doc_link)
soup = BeautifulSoup(page_link.text, "html.parser")
        try:
            text = ""
            for element in soup.find_all("article", {"id": "content"}):
                for child in element.descendants:
                    if child.name == "a" and child.has_attr("href"):
                        if include_url_in_text:
                            # Use a local name so the `url` parameter is not
                            # shadowed inside the loop.
                            href = child.get("href")
                            if href is not None and "edit" in href:
                                text += child.text
                            else:
                                text += (
                                    f"{child.text} (Reference url: {doc_link}{href}) "
                                )
                    elif child.string and child.string.strip():
                        text += child.string.strip() + " "
        except Exception:
            logger.error(f"Could not extract text from {doc_link}")
            continue
        texts.append("\n".join([t for t in text.split("\n") if t]))
return "\n".join(texts), {}
def _gitbook_reader(
soup: Any, url: str, include_url_in_text: bool = True
) -> Tuple[str, Dict[str, Any]]:
"""Extract text from a ReadMe documentation site."""
import requests
from bs4 import BeautifulSoup
links = soup.find_all("a")
    # Anchors without an href attribute would raise a KeyError, so filter
    # them out before collecting links.
    docs_links = [
        link["href"]
        for link in links
        if link.has_attr("href") and "/docs/" in link["href"]
    ]
    docs_links = list(set(docs_links))
    # Resolve relative links against the page URL.
    docs_links = [
        link if link.startswith("http") else urljoin(url, link) for link in docs_links
    ]
texts = []
for doc_link in docs_links:
page_link = requests.get(doc_link)
soup = BeautifulSoup(page_link.text, "html.parser")
        # find() returns None when the page has no <main> element; skip such
        # pages instead of letting the join below raise a TypeError.
        main = soup.find("main")
        if main is None:
            logger.error(f"Could not extract text from {doc_link}")
            continue
        clean_text = ", ".join([tag.get_text() for tag in main])
        texts.append(clean_text)
return "\n".join(texts), {}
DEFAULT_WEBSITE_EXTRACTOR: Dict[
str, Callable[[Any, str], Tuple[str, Dict[str, Any]]]
] = {
"substack.com": _substack_reader,
"readthedocs.io": _readthedocs_reader,
"readme.com": _readmedocs_reader,
"gitbook.io": _gitbook_reader,
}
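

# A custom extractor follows the same contract as the readers above: it takes
# the parsed soup (plus url / include_url_in_text keyword arguments) and
# returns a (text, extra_info) tuple. A minimal sketch (the hostname and CSS
# selector below are hypothetical, not part of the shipped defaults):
#
#     def _my_blog_reader(soup: Any, **kwargs) -> Tuple[str, Dict[str, Any]]:
#         text = soup.select_one("div.post-body").getText()
#         return text, {"Source": "my-blog"}
#
#     reader = BeautifulSoupWebReader(
#         website_extractor={"myblog.example.com": _my_blog_reader}
#     )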
class BeautifulSoupWebReader(BasePydanticReader):
"""BeautifulSoup web page reader.
Reads pages from the web.
    Requires the `bs4` and `requests` packages.
Args:
website_extractor (Optional[Dict[str, Callable]]): A mapping of website
hostname (e.g. google.com) to a function that specifies how to
extract text from the BeautifulSoup obj. See DEFAULT_WEBSITE_EXTRACTOR.
"""
is_remote: bool = True
_website_extractor: Dict[str, Callable] = PrivateAttr()
    def __init__(self, website_extractor: Optional[Dict[str, Callable]] = None) -> None:
        super().__init__()
        # Private attributes can only be set after BaseModel.__init__ has run.
        self._website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR
@classmethod
def class_name(cls) -> str:
"""Get the name identifier of the class."""
return "BeautifulSoupWebReader"
def load_data(
self,
urls: List[str],
custom_hostname: Optional[str] = None,
include_url_in_text: Optional[bool] = True,
) -> List[Document]:
"""Load data from the urls.
Args:
urls (List[str]): List of URLs to scrape.
            custom_hostname (Optional[str]): Force a certain hostname when a
                website is displayed under custom URLs (e.g. Substack blogs).
            include_url_in_text (Optional[bool]): Whether to include the
                reference URL in the text of the document.
Returns:
List[Document]: List of documents.
"""
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
documents = []
for url in urls:
            try:
                page = requests.get(url)
            except Exception as e:
                # Chain the original exception so the root cause is preserved.
                raise ValueError(f"One of the inputs is not a valid url: {url}") from e
hostname = custom_hostname or urlparse(url).hostname or ""
soup = BeautifulSoup(page.content, "html.parser")
data = ""
extra_info = {"URL": url}
if hostname in self._website_extractor:
data, metadata = self._website_extractor[hostname](
soup=soup, url=url, include_url_in_text=include_url_in_text
)
extra_info.update(metadata)
else:
data = soup.getText()
documents.append(Document(text=data, id_=url, extra_info=extra_info))
return documents
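

if __name__ == "__main__":
    # Minimal usage sketch, assuming network access; the URL below is
    # illustrative only and any reachable page would do.
    reader = BeautifulSoupWebReader()
    docs = reader.load_data(urls=["https://docs.readthedocs.io/en/stable/"])
    for doc in docs:
        print(doc.metadata.get("URL"), len(doc.text))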