File size: 9,386 Bytes
03c0888
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import re
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Tuple
from urllib.parse import urljoin, urlsplit

from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter

# Pre-compiled pattern for inline markdown links and images.
# It matches an optional leading "!" (image syntax), then "[text]", then
# "(url)" where the URL may be followed by whitespace and a double-quoted title.
# Capture groups: 1 = link text (must be non-empty), 2 = URL, 3 = title
# (None when no title is present).
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')

def fast_urljoin(base: str, url: str) -> str:
    """Resolve ``url`` against ``base``, short-circuiting the common cases.

    Args:
        base: URL of the document the link was found in.
        url: Link target as written in the document.

    Returns:
        ``url`` unchanged when it starts with ``http://``, ``https://``,
        ``mailto:`` or ``//``; otherwise ``url`` resolved against ``base``.
    """
    if url.startswith(('http://', 'https://', 'mailto:', '//')):
        return url
    if url.startswith('/'):
        # A root-relative path replaces the *entire* path of the base, so it
        # must be attached to the origin (scheme + host), not to the full base
        # URL. Appending it to e.g. ".../docs/page.html" yields a bogus link.
        origin = urlsplit(base)
        if origin.scheme and origin.netloc:
            return f"{origin.scheme}://{origin.netloc}{url}"
        # The base has no recognisable origin (e.g. "example.com/"): keep the
        # plain concatenation, avoiding a doubled slash.
        if base.endswith('/'):
            return base[:-1] + url
        return base + url
    return urljoin(base, url)

class MarkdownGenerationStrategy(ABC):
    """Interface that every markdown generation strategy implements.

    Instances carry an optional content filter and a dictionary of options;
    concrete subclasses supply ``generate_markdown``.
    """

    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        """Store the content filter and options on the instance.

        Args:
            content_filter: Filter kept on ``self.content_filter``; may be None.
            options: Options kept on ``self.options``; a falsy value (None or
                an empty dict) is replaced by a new empty dict.
        """
        self.options = options if options else {}
        self.content_filter = content_filter

    @abstractmethod
    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs
    ) -> MarkdownGenerationResult:
        """Generate markdown from cleaned HTML."""

class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Default implementation of markdown generation strategy.
    
    How it works:
    1. Generate raw markdown from cleaned HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.
    
    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
    """
    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        super().__init__(content_filter, options)
    
    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        """
        Convert inline markdown links and images to numbered citations.
        
        How it works:
        1. Find every link/image matched by LINK_PATTERN.
        2. Resolve non-absolute URLs against ``base_url`` (when one is given).
        3. Replace each link with ``text⟨n⟩`` and each image with
           ``![text⟨n⟩]``, where ``n`` is the number of the target URL.
           The same URL always receives the same number.
        4. Build a "## References" section with one ``⟨n⟩ url: description``
           line per distinct URL, in order of first appearance.
        
        Note:
        The references header is returned even when the markdown contains
        no links.
        
        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL for URL joins.
            
        Returns:
            Tuple[str, str]: Converted markdown and references markdown.
        """
        link_map = {}   # resolved URL -> (citation number, description suffix)
        url_cache = {}  # raw URL -> resolved URL, so each URL is joined once
        parts = []
        last_end = 0
        counter = 1
        
        for match in LINK_PATTERN.finditer(markdown):
            # Keep the text between the previous link and this one untouched.
            parts.append(markdown[last_end:match.start()])
            text, url, title = match.groups()
            
            # Use cached URL if available, otherwise compute and cache
            if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
                if url not in url_cache:
                    url_cache[url] = fast_urljoin(base_url, url)
                url = url_cache[url]
                
            if url not in link_map:
                # The description comes from the first occurrence of the URL:
                # the title (if any) followed by the link text when it differs.
                desc = []
                if title:
                    desc.append(title)
                if text and text != title:
                    desc.append(text)
                link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
                counter += 1
                
            num = link_map[url][0]
            # Wrap the number in a matching ⟨ ⟩ pair so the marker is
            # unambiguous and mirrors the entry in the references list.
            if match.group(0).startswith('!'):
                parts.append(f"![{text}⟨{num}⟩]")
            else:
                parts.append(f"{text}⟨{num}⟩")
            last_end = match.end()
        
        parts.append(markdown[last_end:])
        converted_text = ''.join(parts)
        
        # Pre-build reference strings. link_map is filled in increasing
        # citation-number order and dicts preserve insertion order, so no
        # sorting is required.
        references = ["\n\n## References\n\n"]
        references.extend(
            f"⟨{num}⟩ {url}{desc}\n"
            for url, (num, desc) in link_map.items()
        )
        
        return converted_text, ''.join(references)

    def generate_markdown(self, 
                         cleaned_html: str, 
                         base_url: str = "",
                         html2text_options: Optional[Dict[str, Any]] = None,
                         options: Optional[Dict[str, Any]] = None,
                         content_filter: Optional[RelevantContentFilter] = None,
                         citations: bool = True,
                         **kwargs) -> MarkdownGenerationResult:
        """
        Generate markdown with citations from cleaned HTML.
        
        How it works:
        1. Generate raw markdown from cleaned HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.
        
        This method does not raise: a failure in any step is reported as an
        error message inside the corresponding result field.
        
        Args:
            cleaned_html (str): Cleaned HTML content. Falsy values are treated
                as an empty string; non-string values are passed through ``str``.
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
                Falls back to the filter given at construction time.
            citations (bool): Whether to generate citations.
            
        Note:
            Only one options source is applied on top of the defaults: the
            first non-empty one of ``html2text_options``, ``options`` and
            ``self.options``. They are not merged with each other.
            
        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        """
        try:
            # Initialize HTML2Text with default options for better conversion
            h = CustomHTML2Text(baseurl=base_url)
            default_options = {
                'body_width': 0,  # Disable text wrapping
                'ignore_emphasis': False,
                'ignore_links': False,
                'ignore_images': False,
                'protect_links': True,
                'single_line_break': True,
                'mark_code': True,
                'escape_snob': False
            }
            
            # Update with custom options if provided (first non-empty source wins)
            if html2text_options:
                default_options.update(html2text_options)
            elif options:
                default_options.update(options)
            elif self.options:
                default_options.update(self.options)
            
            h.update_params(**default_options)

            # Ensure we have valid input
            if not cleaned_html:
                cleaned_html = ""
            elif not isinstance(cleaned_html, str):
                cleaned_html = str(cleaned_html)

            # Generate raw markdown
            try:
                raw_markdown = h.handle(cleaned_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {str(e)}"
            
            # Remove the four-space indent in front of code fences.
            raw_markdown = raw_markdown.replace('    ```', '```')

            # Convert links to citations
            markdown_with_citations: str = raw_markdown
            references_markdown: str = ""
            if citations:
                try:
                    markdown_with_citations, references_markdown = self.convert_links_to_citations(
                        raw_markdown, base_url
                    )
                except Exception as e:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {str(e)}"

            # Generate fit markdown if content filter is provided
            fit_markdown: Optional[str] = ""
            filtered_html: Optional[str] = ""
            if content_filter or self.content_filter:
                try:
                    content_filter = content_filter or self.content_filter
                    filtered_html = content_filter.filter_content(cleaned_html)
                    # Each item returned by the filter is wrapped in its own
                    # <div> before being converted back to markdown.
                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
                    fit_markdown = h.handle(filtered_html)
                except Exception as e:
                    fit_markdown = f"Error generating fit markdown: {str(e)}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as e:
            # If anything fails, return the error message in the markdown
            # fields and empty strings everywhere else.
            error_msg = f"Error in markdown generation: {str(e)}"
            return MarkdownGenerationResult(
                raw_markdown=error_msg,
                markdown_with_citations=error_msg,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )