Spaces:
Runtime error
Runtime error
from abc import ABC, abstractmethod | |
from typing import Optional, Dict, Any, Tuple | |
from .models import MarkdownGenerationResult | |
from .html2text import CustomHTML2Text | |
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter | |
import re | |
from urllib.parse import urljoin | |
# Pre-compiled once at import time; matches inline Markdown links and images:
#   optional leading '!'  - present for images
#   group 1               - link text / image alt (non-empty, contains no ']')
#   group 2               - destination (lazy match, contains no ')')
#   group 3               - optional title in double quotes after whitespace
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
def fast_urljoin(base: str, url: str) -> str:
    """Resolve ``url`` against ``base``, short-circuiting the common cases.

    Args:
        base: URL of the document the link was found in.
        url: Link target as written in that document.

    Returns:
        ``url`` unchanged when it starts with ``http://``, ``https://``,
        ``mailto:`` or ``//``; otherwise ``url`` resolved against ``base``.
    """
    if url.startswith(('http://', 'https://', 'mailto:', '//')):
        return url
    if url.startswith('/') and '://' not in base:
        # The base has no scheme/host for urljoin to anchor on, so keep the
        # plain concatenation, dropping one duplicate slash at the seam.
        if base.endswith('/'):
            return base[:-1] + url
        return base + url
    # A root-relative path ("/x") must replace the base's path rather than be
    # appended to it: ("https://h/a/b", "/x") -> "https://h/x". urljoin does
    # that, and also resolves ordinary relative references.
    return urljoin(base, url)
class MarkdownGenerationStrategy(ABC):
    """Abstract base class for markdown generation strategies.

    Concrete strategies must implement :meth:`generate_markdown`; the base
    class cannot be instantiated directly.

    Args:
        content_filter (Optional[RelevantContentFilter]): Stored as
            ``self.content_filter`` for use by subclasses.
        options (Optional[Dict[str, Any]]): Stored as ``self.options``;
            ``None`` (or any falsy value) becomes an empty dict.
    """

    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        self.content_filter = content_filter
        self.options = options or {}

    @abstractmethod
    def generate_markdown(self,
                          cleaned_html: str,
                          base_url: str = "",
                          html2text_options: Optional[Dict[str, Any]] = None,
                          content_filter: Optional[RelevantContentFilter] = None,
                          citations: bool = True,
                          **kwargs) -> MarkdownGenerationResult:
        """Generate markdown from cleaned HTML.

        Implementations may accept additional parameters, so callers should
        pass everything after ``cleaned_html`` by keyword.

        Args:
            cleaned_html (str): Cleaned HTML content to convert.
            base_url (str): Base URL made available to the implementation.
            html2text_options (Optional[Dict[str, Any]]): Per-call converter options.
            content_filter (Optional[RelevantContentFilter]): Per-call content filter.
            citations (bool): Whether citations are requested.

        Returns:
            MarkdownGenerationResult: Produced by the concrete implementation.
        """
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Default implementation of markdown generation strategy.

    How it works:
    1. Generate raw markdown from cleaned HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
    """

    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        super().__init__(content_filter, options)

    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        """
        Convert links in markdown to citations.

        Every match of ``LINK_PATTERN`` is replaced by its text followed by a
        ``⟨n⟩`` marker (``![text⟨n⟩]`` for images). Each distinct URL gets one
        number, assigned in order of first appearance, and one line in the
        references section.

        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL for URL joins. When empty, URLs are left as written.

        Returns:
            Tuple[str, str]: Converted markdown and references markdown. The
            references string always starts with the "## References" heading,
            even when no link was found.
        """
        link_map = {}   # resolved URL -> (citation number, description suffix)
        url_cache = {}  # URL as written -> URL joined with base_url
        parts = []
        last_end = 0
        counter = 1

        for match in LINK_PATTERN.finditer(markdown):
            parts.append(markdown[last_end:match.start()])
            text, url, title = match.groups()

            # Destinations may be written as "<url>" (CommonMark's bracketed
            # form; generate_markdown enables 'protect_links', which in
            # upstream html2text emits exactly that - presumably the same for
            # CustomHTML2Text). Unwrap so the URL is neither joined as a bogus
            # relative path nor cited separately from its bare form.
            if len(url) > 2 and url[0] == '<' and url[-1] == '>':
                url = url[1:-1]

            # Use cached URL if available, otherwise compute and cache
            if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
                if url not in url_cache:
                    url_cache[url] = fast_urljoin(base_url, url)
                url = url_cache[url]

            if url not in link_map:
                # The description comes from the first occurrence only.
                desc = []
                if title:
                    desc.append(title)
                if text and text != title:
                    desc.append(text)
                link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
                counter += 1

            num = link_map[url][0]
            if match.group(0).startswith('!'):
                parts.append(f"![{text}⟨{num}⟩]")
            else:
                parts.append(f"{text}⟨{num}⟩")
            last_end = match.end()

        parts.append(markdown[last_end:])
        converted_text = ''.join(parts)

        # Numbers are handed out in insertion order, so the dict is already
        # ordered by citation number and needs no sorting.
        references = ["\n\n## References\n\n"]
        references.extend(
            f"⟨{num}⟩ {url}{desc}\n"
            for url, (num, desc) in link_map.items()
        )
        return converted_text, ''.join(references)

    def generate_markdown(self,
                          cleaned_html: str,
                          base_url: str = "",
                          html2text_options: Optional[Dict[str, Any]] = None,
                          options: Optional[Dict[str, Any]] = None,
                          content_filter: Optional[RelevantContentFilter] = None,
                          citations: bool = True,
                          **kwargs) -> MarkdownGenerationResult:
        """
        Generate markdown with citations from cleaned HTML.

        How it works:
        1. Generate raw markdown from cleaned HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.

        This method does not raise: a failure in any step is reported as an
        "Error ..." message placed in the corresponding result field.

        Note:
            ``options`` precedes ``content_filter`` here, unlike the
            ``MarkdownGenerationStrategy.generate_markdown`` signature, so
            pass everything after ``cleaned_html`` by keyword.

        Args:
            cleaned_html (str): Cleaned HTML content. Falsy values become "";
                other non-string values are converted with ``str()``.
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
                Only the first non-empty of ``html2text_options``, ``options``
                and ``self.options`` is applied on top of the defaults.
            content_filter (Optional[RelevantContentFilter]): Content filter for generating
                fit markdown. Falls back to ``self.content_filter``.
            citations (bool): Whether to generate citations.

        Returns:
            MarkdownGenerationResult: Result containing raw markdown, markdown with citations,
            references markdown, fit markdown and fit HTML.
        """
        try:
            # Initialize HTML2Text with default options for better conversion
            h = CustomHTML2Text(baseurl=base_url)
            default_options = {
                'body_width': 0,  # Disable text wrapping
                'ignore_emphasis': False,
                'ignore_links': False,
                'ignore_images': False,
                'protect_links': True,
                'single_line_break': True,
                'mark_code': True,
                'escape_snob': False,
            }

            # Update with custom options if provided (first non-empty wins)
            if html2text_options:
                default_options.update(html2text_options)
            elif options:
                default_options.update(options)
            elif self.options:
                default_options.update(self.options)

            h.update_params(**default_options)

            # Ensure we have valid input
            if not cleaned_html:
                cleaned_html = ""
            elif not isinstance(cleaned_html, str):
                cleaned_html = str(cleaned_html)

            # Generate raw markdown
            try:
                raw_markdown = h.handle(cleaned_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {str(e)}"

            # NOTE(review): this file was recovered from a whitespace-collapsed
            # copy; confirm the number of leading spaces in the literal below.
            raw_markdown = raw_markdown.replace(' ```', '```')

            # Convert links to citations
            markdown_with_citations: str = raw_markdown
            references_markdown: str = ""
            if citations:
                try:
                    markdown_with_citations, references_markdown = self.convert_links_to_citations(
                        raw_markdown, base_url
                    )
                except Exception as e:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {str(e)}"

            # Generate fit markdown if content filter is provided
            fit_markdown: Optional[str] = ""
            filtered_html: Optional[str] = ""
            if content_filter or self.content_filter:
                try:
                    content_filter = content_filter or self.content_filter
                    filtered_html = content_filter.filter_content(cleaned_html)
                    # Each item returned by the filter is wrapped in its own
                    # <div> before being converted again.
                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
                    fit_markdown = h.handle(filtered_html)
                except Exception as e:
                    fit_markdown = f"Error generating fit markdown: {str(e)}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as e:
            # If anything fails, return empty strings with error message
            error_msg = f"Error in markdown generation: {str(e)}"
            return MarkdownGenerationResult(
                raw_markdown=error_msg,
                markdown_with_citations=error_msg,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )