Spaces:

garage-lab
/

MCP_WEB2JSON

Building

File size: 12,708 Bytes

251790a

import os
import re
import json
import pdfkit
import requests
import warnings
import tempfile
# import textract
import html2text
import inscriptis
import trafilatura
from pathlib import Path
from markdownify import markdownify
from json_repair import repair_json
from bs4 import BeautifulSoup, Comment
from html_chunking import get_html_chunks
from urllib.error import URLError, HTTPError
from html_to_markdown import convert_to_markdown
from readabilipy import simple_json_from_html_string
from docling.document_converter import DocumentConverter
from dateparser_scripts.update_supported_languages_and_locales import to_string


def clean_html(html_content: str) -> str:
    """
    Cleans up the given HTML content by:
      - Removing <script> and <style> tags and their content.
      - Removing HTML comments.
      - Extracting and returning the visible text with normalized whitespace.
    
    Args:
        html_content (str): The HTML content to clean.
    
    Returns:
        str: The cleaned, visible text from the HTML.
    """
    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Remove script and style elements
    # Remove unwanted tags
    for tag in soup(["script", "style", "img", "a", "table", "tr", "td", "th", "thead", "tbody",
                     "tfoot", "header", "footer", "link", "rel"]):
        tag.decompose()

    # Remove elements that do not contain any visible text
    for element in soup.find_all():
        # If the element has no text (after stripping whitespace), remove it
        if not element.get_text(strip=True):
            element.decompose()
    
    # Remove HTML comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    
    # Extract text and normalize whitespace
    # text = soup.get_text(separator=" ", strip=True)
    # clean_text = re.sub(r'\s+', ' ', text)
    
    # return clean_text
    return str(soup)


def print_content_extractors():
    print(
        [
            "Default: the plain text of the HTML page",
            "Inscriptis",
            "Trafilatura",
        ]
    )


class ContentExtractor:
    def get_text(self, html):
        return clean_html(html)

    # TODO: Clean this mess
    def url_to_html(self, url,clean=False):
        # Define custom headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.6",
            "Cache-Control": "max-age=0",
            "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1"
        }

        try:
            # Create a Request object with custom headers
            response = requests.get(url, headers=headers, timeout=10)

            html = None

            if response.status_code == 200:
                html = response.text
            else:
                print(f"Failed to retrieve HTML. Status code: {response.status_code}")
                return None

            if clean:    
                return self.get_text(html)
            
            return html

        except HTTPError as e:
            print(f"HTTP Error: {e.code} - {e.reason}")
            return None
        except URLError as e:
            print(f"URL Error: {e.reason}")
            return None
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None


class Inscriptis(ContentExtractor):
    def __init__(self):
        super()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Brave/119.0.0.0",
            "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
        }

        warnings.warn("\nBeware, put only clean links with no trackers, or it may produce unexpected results.")

    def get_text(self, html):
        """Extract text from HTML using inscriptis."""
        return inscriptis.get_text(html)

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text


class Docling(ContentExtractor):
    def __init__(self):
        super().__init__()

    # TODO: This is an unexpected behaviour but due to docling docs website being down, it's what works for now
    def get_text(self, text_content):
        result = None
        with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
            tmpfile.write(text_content)
            tmpfile.flush()
            tmpfile_path = tmpfile.name.replace("\\", "/")
            tmpfile_path = Path(tmpfile_path)
        try:
            converter = DocumentConverter()
            document = converter.convert(tmpfile_path).document
            tables = []
            for table_ix, table in enumerate(document.tables):
                table_text = table.export_to_markdown()
                tables.append(table_text)

            result = document.export_to_markdown()
            for table in tables:
                result += "\n\n" + table
        finally:
            os.remove(tmpfile_path)
        return result


class ReadabiliPy(ContentExtractor):
    def __init__(self):
        super().__init__()

    def get_text(self, html):
        content = simple_json_from_html_string(html, use_readability=True)
        json_object = json.dumps(content, indent=4)
        repaired = repair_json(json_object)
        return repaired


class Trafilatura(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        }

        warnings.warn("\nTrafilatura Content Extractor: Beware, put only clean links with no trackers, or it may produce unexpected results.")

        from copy import deepcopy
        from trafilatura.settings import DEFAULT_CONFIG
        config = deepcopy(DEFAULT_CONFIG)
        # config['DEFAULT']['MIN_EXTRACTED_SIZE'] = '5000' # Configurable but this value worked well for me
        self.config = config

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

    def get_text(self, html, output_format="markdown", min_extracted_size_char=20_000):
        # self.config['DEFAULT']['MIN_EXTRACTED_SIZE'] = f"{min_extracted_size_char}"
        # self.config['DEFAULT']['MIN_OUTPUT_SIZE'] = f"{min_extracted_size_char}"
        return trafilatura.extract(filecontent=html, favor_recall=True, config=self.config, output_format=output_format)
    

class Markdownify(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = markdownify(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])

        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Remove extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()

        return md


class HTML2Text(ContentExtractor):
    def get_text(self, html):
        converter = html2text.HTML2Text()
        converter.ignore_tables=True
        converter.ignore_links=True
        converter.ignore_images=True
        converter.ignore_mailto_links=True
        return converter.handle(html)
    

class HTML_TO_Markdown(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = convert_to_markdown(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])
    
        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Remove extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()

        return md


class PDFkitDocling(ContentExtractor):
    def get_text(self, html):
        soup = BeautifulSoup(html, "html.parser")

        # Remove <a>, <link>, <img>, and other unwanted tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()


        content = str(soup)

        # PDF path to save
        pdf_path = 'test.pdf'

        # Create PDF
        pdfkit.from_string(content, pdf_path)

        converter = DocumentConverter()

        return converter.convert(pdf_path).document.export_to_markdown()


class TrafilatraCHUNKS(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        # Remove <a>, <link>, <img>, and other unwanted tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()


        content = str(soup)

        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)

        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        

        combined_text = ""
        for chunk in cleaned:
            if chunk is None:
                continue
            combined_text += chunk + "\n"
        
        return combined_text


class TrafilaCHUNKSRobust(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()

        for tag in soup.find_all(lambda tag: tag.attrs and any("nav" in str(v) for v in tag.attrs.values())):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)

        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        
        combined_text = ""
        for chunk in cleaned:
            if chunk is None:
                continue
            combined_text += chunk + "\n"
        
        return combined_text

class TrafilaCHUNKSRobustV2(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)

        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        
        combined_text = ""
        for chunk in cleaned:
            if chunk is None:
                continue
            combined_text += chunk + "\n"
        
        return combined_text

# Very Bad lol
# class Textract(ContentExtractor):
#     def get_text(self, html):
#         with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
#             tmpfile.write(html)
#             tmpfile.flush()
#             tmpfile_path = tmpfile.name.replace("\\", "/")
#             tmpfile_path = Path(tmpfile_path)
#         try:
#             result = textract.process(tmpfile_path)
#         finally:
#             os.remove(tmpfile_path)
#         return result