# Spaces: jerpint / buster-dev
# Runtime error
# File size: 5,402 Bytes
# f5ec40e

import io
import math
import os
from collections.abc import Iterable

import bs4
import pandas as pd
from bs4 import BeautifulSoup


def parse_section(nodes: "Iterable[bs4.element.PageElement]") -> str:
    """Concatenate the text of ``nodes`` into a single string.

    ``table`` nodes are converted to a markdown table (``tablefmt="github"``),
    ``script`` nodes are skipped, and every other node contributes its
    ``.text`` unchanged.

    Args:
        nodes: Any iterable (list, generator, ...) of parsed HTML nodes that
            expose ``name`` and ``text``; table nodes must also provide
            ``prettify()``.

    Returns:
        The collected texts joined without any separator.
    """
    parts = []
    for node in nodes:
        if node.name == "script":
            continue
        if node.name == "table":
            # Wrap the markup in StringIO: passing literal HTML to
            # ``pd.read_html`` is deprecated since pandas 2.1.
            table_html = io.StringIO(node.prettify())
            parts.append(pd.read_html(table_html)[0].to_markdown(index=False, tablefmt="github"))
        else:
            parts.append(node.text)
    return "".join(parts)


class Parser:
    """Common state and method stubs shared by the HTML documentation parsers.

    The three methods below have empty bodies here (they return ``None``);
    subclasses provide the actual implementations.
    """

    def __init__(
        self,
        soup: BeautifulSoup,
        base_url: str,
        filename: str,
        min_section_length: int = 100,
        max_section_length: int = 2000,
    ):
        """Store the parsed page, its location, and the section length bounds."""
        # Section length bounds, measured in characters of extracted text.
        self.min_section_length, self.max_section_length = min_section_length, max_section_length
        # Where the page lives: used to build the URL of each section.
        self.base_url, self.filename = base_url, filename
        # The parsed HTML document to extract sections from.
        self.soup = soup

    def parse(self) -> tuple[list[str], list[str], list[str]]:
        """Return three parallel lists: section texts, section URLs, section names."""

    def find_sections(self) -> bs4.element.ResultSet:
        """Return the elements of ``self.soup`` that mark the start of each section."""

    def build_url(self, suffix: str) -> str:
        """Return the full URL of a section from its link ``suffix``."""


class SphinxParser(Parser):
    """Parser for pages whose sections are marked by ``a.headerlink`` anchors."""

    def parse(self) -> tuple[list[str], list[str], list[str]]:
        """Extract the text, URL and name of every section of the page.

        Sections longer than ``max_section_length`` are split into chunks of
        (nearly) equal size that all share the section's URL and name. Sections
        whose length does not exceed ``min_section_length`` are dropped.

        Returns:
            Three parallel lists: section texts, section URLs, section names.
        """
        sections = []
        urls = []
        names = []
        for section_found in self.find_sections():
            # The anchor sits inside the heading, which sits inside the section container.
            section_soup = section_found.parent.parent
            section_href = section_soup.find_all("a", href=True, class_="headerlink")

            # If the section has subsections, keep only the part before the first subsection
            if len(section_href) > 1 and section_soup.section is not None:
                # previous_siblings yields nodes in reverse document order; restore it.
                section_siblings = list(section_soup.section.previous_siblings)[::-1]
                section = parse_section(section_siblings)
            else:
                section = parse_section(section_soup.children)

            # Remove special characters, plus newlines in some url and section names.
            section = section.strip()
            url = section_found["href"].strip().replace("\n", "")
            # [:-1] drops the heading's final character (presumably the headerlink glyph).
            name = section_found.parent.text.strip()[:-1].replace("\n", "")

            url = self.build_url(url)

            if len(section) > self.max_section_length:
                # Too long: split into chunks of (nearly) equal size. Rounding the
                # chunk size up and slicing over the whole string guarantees that
                # no trailing characters are lost and no chunk exceeds the maximum.
                n_chunks = math.ceil(len(section) / self.max_section_length)
                chunk_size = math.ceil(len(section) / n_chunks)
                section_chunks = [section[start : start + chunk_size] for start in range(0, len(section), chunk_size)]

                sections.extend(section_chunks)
                urls.extend([url] * len(section_chunks))
                names.extend([name] * len(section_chunks))
            elif len(section) > self.min_section_length:
                # Not too short: keep as a single chunk.
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    def find_sections(self) -> bs4.element.ResultSet:
        """Return every ``<a class="headerlink" href=...>`` anchor in the page."""
        return self.soup.find_all("a", href=True, class_="headerlink")

    def build_url(self, suffix: str) -> str:
        """Return ``base_url + filename + suffix``."""
        return self.base_url + self.filename + suffix


class HuggingfaceParser(Parser):
    """Parser for pages whose sections start at ``h1``/``h2``/``h3`` headings with class ``relative group``."""

    def parse(self) -> tuple[list[str], list[str], list[str]]:
        """Extract the text, URL and name of every section of the page.

        A section consists of the siblings following a heading, up to (but not
        including) the next heading returned by ``find_sections``. Sections
        longer than ``max_section_length`` are split into chunks of (nearly)
        equal size that all share the section's URL and name. Sections whose
        length does not exceed ``min_section_length`` are dropped.

        Returns:
            Three parallel lists: section texts, section URLs, section names.
        """
        found = self.find_sections()

        sections = []
        urls = []
        names = []
        for i, heading in enumerate(found):
            section_href = heading.find("a", href=True, class_="header-link")
            next_heading = found[i + 1] if i + 1 < len(found) else None

            section_nodes = []
            for element in heading.find_next_siblings():
                # Identity, not ``==``: bs4 equality compares markup, so an
                # unrelated sibling with identical markup would end the section early.
                if element is next_heading:
                    break
                section_nodes.append(element)
            section = parse_section(section_nodes)

            # Remove special characters, plus newlines in some url and section names.
            section = section.strip()
            url = section_href["href"].strip().replace("\n", "")
            name = heading.text.strip().replace("\n", "")

            url = self.build_url(url)

            if len(section) > self.max_section_length:
                # Too long: split into chunks of (nearly) equal size. Rounding the
                # chunk size up and slicing over the whole string guarantees that
                # no trailing characters are lost and no chunk exceeds the maximum.
                n_chunks = math.ceil(len(section) / self.max_section_length)
                chunk_size = math.ceil(len(section) / n_chunks)
                section_chunks = [section[start : start + chunk_size] for start in range(0, len(section), chunk_size)]

                sections.extend(section_chunks)
                urls.extend([url] * len(section_chunks))
                names.extend([name] * len(section_chunks))
            elif len(section) > self.min_section_length:
                # Not too short: keep as a single chunk.
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    def find_sections(self) -> bs4.element.ResultSet:
        """Return every ``h1``/``h2``/``h3`` element whose class is ``relative group``."""
        return self.soup.find_all(["h1", "h2", "h3"], class_="relative group")

    def build_url(self, suffix: str) -> str:
        """Return ``base_url`` + ``filename`` without its extension + ``suffix``."""
        # The splitext is to remove the .html extension
        return self.base_url + os.path.splitext(self.filename)[0] + suffix