import re
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional

import requests
from bs4 import BeautifulSoup, Comment
from htmlrag import clean_html


class HTMLCleaner:
    """Configurable HTML cleaner built on BeautifulSoup.

    Supported config keys:
        extra_remove_tags (list[str]): additional tags to remove entirely.
        strip_attrs (bool): drop all attributes from remaining tags.
        strip_links (bool): replace <a> tags with their text content.
        keep_tags (bool): return cleaned HTML instead of extracted text.
    """

    DEFAULT_REMOVE_TAGS = ["script", "style"]

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        # Allow custom tags to remove in addition to the defaults.
        self.remove_tags = set(self.DEFAULT_REMOVE_TAGS) | set(
            self.config.get("extra_remove_tags", [])
        )
    def _clean_html(self, html_content: str) -> str:
        """
        Cleans up the given HTML content by:
        - Removing specified tags and their content.
        - Stripping HTML comments.
        - Optionally stripping out all attributes.
        - Optionally flattening hyperlinks.
        - Removing empty tags.
        - Extracting and returning cleaned HTML or visible text.

        Args:
            html_content (str): The HTML content to clean.

        Returns:
            str: The cleaned HTML (if keep_tags=True) or normalized text.
        """
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove unwanted tags entirely.
        for tag_name in self.remove_tags:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Remove HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Strip attributes if requested.
        if self.config.get("strip_attrs", False):
            for tag in soup.find_all(True):
                tag.attrs = {}

        # Flatten hyperlinks if requested.
        if self.config.get("strip_links", False):
            for a in soup.find_all("a"):
                a.replace_with(a.get_text())

        # Remove empty tags (no text content). Void elements such as <br>,
        # <img>, and <hr> never contain text, so keep them explicitly.
        for tag in soup.find_all(True):
            if tag.name not in ("br", "img", "hr") and not tag.get_text(strip=True):
                tag.decompose()

        # Convert soup back to an HTML string if preserving tags.
        if self.config.get("keep_tags", False):
            html_str = str(soup)
            # Remove any empty lines.
            html_str = re.sub(r"(?m)^[ \t]*\n", "", html_str)
            return html_str.strip()

        # Extract visible text.
        text = soup.get_text(separator="\n", strip=True)
        # Remove empty lines.
        lines = [line for line in text.splitlines() if line.strip()]
        clean_text = "\n".join(lines)
        # Normalize whitespace within lines (keeping the line breaks themselves,
        # which the previous step just built).
        clean_text = re.sub(r"[ \t]+", " ", clean_text)
        return clean_text.strip()
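
# Illustrative usage sketch for HTMLCleaner (the input HTML is hypothetical,
# traced through the steps above):
#   cleaner = HTMLCleaner({"strip_attrs": True, "strip_links": True, "keep_tags": True})
#   cleaner._clean_html('<div class="x"><a href="/y">Home</a><script>1</script></div>')
#   -> '<div>Home</div>'    (with keep_tags=False the result is just 'Home')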


class Preprocessor(ABC):
    """
    Abstract base class for preprocessors.
    Defines the interface for transforming raw inputs into structured data.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        Initialize the preprocessor with optional configuration.

        Args:
            config: A dictionary of configuration settings.
                - keep_tags (bool): If True, keeps HTML tags in the output;
                  otherwise, cleans them.
        """
        self.config = config if config is not None else {"keep_tags": False}
    def _fetch_content(self, url: str) -> str:
        """
        Fetches the content of a web page from a URL.

        Args:
            url: The URL to fetch content from.

        Returns:
            The raw HTML content of the page.

        Raises:
            ValueError: If the URL cannot be fetched or processed.
        """
        try:
            # Set browser-like headers, which can help avoid being blocked
            # by some websites.
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.6",
                "Cache-Control": "max-age=0",
                "Sec-Ch-Ua": '"Not(A:Brand";v="99", "Brave";v="133", "Chromium";v="133"',
                "Sec-Ch-Ua-Mobile": "?0",
                "Sec-Ch-Ua-Platform": '"Windows"',
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-User": "?1",
                "Upgrade-Insecure-Requests": "1",
            }
            # Make the HTTP GET request with a timeout, and surface non-2xx
            # responses as errors so the docstring's contract holds.
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            # Catch any network-related errors (DNS, connection, timeout, HTTP
            # status, etc.) and re-raise them as a more user-friendly ValueError.
            raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}") from e

    @abstractmethod
    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (HTML, text, etc.) and apply preprocessing steps.

        Args:
            content: The raw data to preprocess, or a URL if is_url is True.
            is_url: If True, fetch the page at `content` before cleaning.

        Returns:
            The cleaned content as a string, ready for downstream tasks.
        """
        pass


class BasicPreprocessor(Preprocessor):
    """
    Base preprocessor with common functionality.
    Can be extended for specific preprocessing tasks.
    """

    # TODO: Might need to think of how to improve this later
    def _clean_html(self, html_content: str) -> str:
        """
        Cleans up the given HTML content by:
        - Removing <script> and <style> tags and their content.
        - Removing HTML comments.
        - Extracting and returning the visible text with normalized whitespace
          if keep_tags is False.

        Args:
            html_content (str): The HTML content to clean.

        Returns:
            str: The cleaned HTML (if keep_tags is True) or the visible text.
        """
        # Parse the HTML content.
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style elements.
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Remove HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        if self.config.get("keep_tags", False):
            # If keep_tags is True, return the cleaned HTML with tags intact.
            return str(soup)

        # Extract text and normalize whitespace.
        text = soup.get_text(separator=" ", strip=True)
        clean_text = re.sub(r"\s+", " ", text)
        return clean_text
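
    # Illustrative trace (hypothetical input): with keep_tags=False,
    #   _clean_html('<p>Hi <!--note--> <script>x()</script>there</p>')
    # yields 'Hi there' -- the comment and script are removed, the tags are
    # dropped, and whitespace is collapsed.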

    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (HTML, text, etc.) and apply preprocessing steps.

        Args:
            content: The raw data to preprocess, or a URL if is_url is True.
            is_url: If True, fetch the page at `content` before cleaning.

        Returns:
            The cleaned content as a string, ready for downstream tasks.
        """
        html_content = content
        if is_url:
            # Fetch content from the URL.
            html_content = self._fetch_content(content)

        # Clean the HTML: strip attributes and links, drop header/footer
        # sections, then run the result through htmlrag's clean_html pass.
        cleaner = HTMLCleaner({
            "keep_tags": self.config.get("keep_tags", False),
            "strip_attrs": True,
            "strip_links": True,
            "extra_remove_tags": ["header", "footer"],
        })
        clean = cleaner._clean_html(html_content=html_content)
        clean = clean_html(clean)
        # Return the cleaned content, stripped of leading/trailing whitespace.
        return clean.strip()
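

if __name__ == "__main__":
    # Minimal usage sketch: the sample HTML below is hypothetical, and running
    # this requires the htmlrag package that the module already imports.
    sample = (
        '<html><head><style>p {color: red}</style></head>'
        '<body><header>Site nav</header>'
        '<p class="intro">Hello, <a href="/w">world</a>!</p>'
        '<footer>(c) legal</footer></body></html>'
    )
    pre = BasicPreprocessor(config={"keep_tags": True})
    print(pre.preprocess(sample, is_url=False))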