# buster/parser.py
import math
import os

import bs4
import pandas as pd
from bs4 import BeautifulSoup


def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
    """Return the concatenated text of `nodes`, rendering tables as
    GitHub-flavored markdown and skipping <script> contents."""
    section = []
    for node in nodes:
        if node.name == "table":
            node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
        elif node.name == "script":
            continue
        else:
            node_text = node.text
        section.append(node_text)

    section = "".join(section)
    return section
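

# A minimal usage sketch for parse_section; the HTML fragment below is
# illustrative only, not taken from any real documentation page.
def _demo_parse_section() -> None:
    html = "<div><p>Hello</p><script>x()</script><p> world</p></div>"
    soup = BeautifulSoup(html, "html.parser")
    # <script> bodies are dropped; the two <p> texts are concatenated.
    assert parse_section(list(soup.div.children)) == "Hello world"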


class Parser:
    def __init__(
        self,
        soup: BeautifulSoup,
        base_url: str,
        filename: str,
        min_section_length: int = 100,
        max_section_length: int = 2000,
    ):
        self.soup = soup
        self.base_url = base_url
        self.filename = filename
        self.min_section_length = min_section_length
        self.max_section_length = max_section_length

    def parse(self) -> tuple[list[str], list[str], list[str]]:
        ...

    def find_sections(self) -> bs4.element.ResultSet:
        ...

    def build_url(self, suffix: str) -> str:
        ...
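
# Parser is a small template class: each concrete parser supplies
# find_sections() to locate section anchors in the soup, build_url() to turn an
# anchor href into an absolute link, and parse() to emit three aligned lists
# (section texts, urls, names). SphinxParser and HuggingfaceParser below follow
# this contract.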


class SphinxParser(Parser):
    def parse(self) -> tuple[list[str], list[str], list[str]]:
        found = self.find_sections()

        sections = []
        urls = []
        names = []
        for i in range(len(found)):
            section_found = found[i]

            section_soup = section_found.parent.parent
            section_href = section_soup.find_all("a", href=True, class_="headerlink")

            # If the section has subsections, keep only the part before the first subsection
            if len(section_href) > 1 and section_soup.section is not None:
                section_siblings = list(section_soup.section.previous_siblings)[::-1]
                section = parse_section(section_siblings)
            else:
                section = parse_section(section_soup.children)

            # Remove special characters, plus newlines in some urls and section names.
            section = section.strip()
            url = section_found["href"].strip().replace("\n", "")
            # [:-1] drops the trailing headerlink character from the heading text
            name = section_found.parent.text.strip()[:-1].replace("\n", "")

            url = self.build_url(url)

            # If the text is too long, split it into chunks of roughly equal size,
            # e.g. 4500 chars with max_section_length=2000 -> 3 chunks of 1500.
            if len(section) > self.max_section_length:
                n_chunks = math.ceil(len(section) / float(self.max_section_length))
                separator_index = math.floor(len(section) / n_chunks)

                section_chunks = [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]
                url_chunks = [url] * n_chunks
                name_chunks = [name] * n_chunks

                sections.extend(section_chunks)
                urls.extend(url_chunks)
                names.extend(name_chunks)
            # If the text is not too short, add it as a single chunk
            elif len(section) > self.min_section_length:
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    def find_sections(self) -> bs4.element.ResultSet:
        return self.soup.find_all("a", href=True, class_="headerlink")

    def build_url(self, suffix: str) -> str:
        return self.base_url + self.filename + suffix
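

# A hedged usage sketch for SphinxParser. The fragment is a stripped-down
# imitation of Sphinx output (a section whose heading carries a "headerlink"
# anchor); the base_url and filename are placeholders.
def _demo_sphinx_parser() -> None:
    html = (
        '<section><h2>Install<a class="headerlink" href="#install">¶</a></h2>'
        "<p>" + "Run pip install to get started. " * 5 + "</p></section>"
    )
    soup = BeautifulSoup(html, "html.parser")
    parser = SphinxParser(soup, base_url="https://example.org/docs/", filename="install.html")
    sections, urls, names = parser.parse()
    # One section above the minimum length, anchored at base_url + filename + href.
    assert urls == ["https://example.org/docs/install.html#install"]
    assert names == ["Install"]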


class HuggingfaceParser(Parser):
    def parse(self) -> tuple[list[str], list[str], list[str]]:
        found = self.find_sections()

        sections = []
        urls = []
        names = []
        for i in range(len(found)):
            section_href = found[i].find("a", href=True, class_="header-link")

            # Gather everything between this heading and the next one
            section_nodes = []
            for element in found[i].find_next_siblings():
                if i + 1 < len(found) and element == found[i + 1]:
                    break
                section_nodes.append(element)
            section = parse_section(section_nodes)

            # Remove special characters, plus newlines in some urls and section names.
            section = section.strip()
            url = section_href["href"].strip().replace("\n", "")
            name = found[i].text.strip().replace("\n", "")

            url = self.build_url(url)

            # If the text is too long, split it into chunks of roughly equal size
            if len(section) > self.max_section_length:
                n_chunks = math.ceil(len(section) / float(self.max_section_length))
                separator_index = math.floor(len(section) / n_chunks)

                section_chunks = [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]
                url_chunks = [url] * n_chunks
                name_chunks = [name] * n_chunks

                sections.extend(section_chunks)
                urls.extend(url_chunks)
                names.extend(name_chunks)
            # If the text is not too short, add it as a single chunk
            elif len(section) > self.min_section_length:
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    def find_sections(self) -> bs4.element.ResultSet:
        return self.soup.find_all(["h1", "h2", "h3"], class_="relative group")

    def build_url(self, suffix: str) -> str:
        # splitext removes the .html extension
        return self.base_url + os.path.splitext(self.filename)[0] + suffix
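

# A hedged end-to-end sketch for HuggingfaceParser; the heading markup mimics
# the Hugging Face docs structure targeted above, but the fragment, base_url,
# and filename are all placeholders.
def _demo_huggingface_parser() -> None:
    html = (
        '<h2 class="relative group">Pipelines'
        '<a class="header-link" href="#pipelines"></a></h2>'
        "<p>" + "Explains the pipeline API. " * 6 + "</p>"
    )
    soup = BeautifulSoup(html, "html.parser")
    parser = HuggingfaceParser(soup, base_url="https://example.org/docs/", filename="pipelines.html")
    sections, urls, names = parser.parse()
    # build_url drops the .html extension before appending the anchor suffix.
    assert urls == ["https://example.org/docs/pipelines#pipelines"]
    assert names == ["Pipelines"]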