Crawl4AI

Runtime error

File size: 9,386 Bytes

03c0888

import re
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Tuple
from urllib.parse import urljoin, urlsplit

from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter

# Pre-compiled pattern for inline markdown links and images: an optional
# leading "!", then "[text]", then "(url)" with an optional double-quoted title.
# Capture groups: 1 = link text, 2 = URL, 3 = title (None when no title is present).
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')

def fast_urljoin(base: str, url: str) -> str:
    """Join ``url`` onto ``base``, short-circuiting the common cases.

    Args:
        base: URL the link was found on. May be empty or lack a scheme.
        url: Link target, absolute or relative.

    Returns:
        ``url`` unchanged when it starts with ``http://``, ``https://``,
        ``mailto:`` or ``//``. A root-relative ``url`` (leading ``/``) is
        anchored at the scheme and host of ``base``. Anything else is
        resolved with ``urllib.parse.urljoin``.
    """
    if url.startswith(('http://', 'https://', 'mailto:', '//')):
        return url
    if url.startswith('/'):
        # A root-relative path replaces the base's entire path (and query /
        # fragment), so it must be attached to the origin, not appended to
        # the full page URL.
        try:
            parts = urlsplit(base)
        except ValueError:
            # Malformed base (e.g. broken IPv6 literal): fall through to the
            # plain concatenation below instead of raising.
            parts = None
        if parts is not None and parts.scheme and parts.netloc:
            return f"{parts.scheme}://{parts.netloc}{url}"
        # No recognizable origin in the base: keep simple concatenation,
        # avoiding a doubled slash at the seam.
        if base.endswith('/'):
            return base[:-1] + url
        return base + url
    return urljoin(base, url)

class MarkdownGenerationStrategy(ABC):
    """Interface for strategies that turn cleaned HTML into markdown.

    The constructor stores an optional content filter and an options dict on
    the instance; concrete subclasses must implement ``generate_markdown``.
    """

    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        # A missing or empty options argument becomes a fresh empty dict.
        self.options = options if options else {}
        self.content_filter = content_filter

    @abstractmethod
    def generate_markdown(self,
                         cleaned_html: str,
                         base_url: str = "",
                         html2text_options: Optional[Dict[str, Any]] = None,
                         content_filter: Optional[RelevantContentFilter] = None,
                         citations: bool = True,
                         **kwargs) -> MarkdownGenerationResult:
        """Produce a ``MarkdownGenerationResult`` from ``cleaned_html``.

        Abstract: the base implementation does nothing and returns ``None``.
        """

class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Standard markdown generation strategy.

    Pipeline:
    1. Convert the cleaned HTML to raw markdown with ``CustomHTML2Text``.
    2. Optionally rewrite inline links as numbered citations plus a reference list.
    3. If a content filter is available, build "fit" HTML/markdown from its output.
    4. Bundle everything into a ``MarkdownGenerationResult``.

    Args:
        content_filter (Optional[RelevantContentFilter]): Filter used to produce fit markdown.
        options (Optional[Dict[str, Any]]): Fallback conversion options. Defaults to None.
    """

    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        super().__init__(content_filter=content_filter, options=options)

    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        """
        Replace inline markdown links with numbered citation markers.

        Every match of ``LINK_PATTERN`` is replaced by its link text followed by
        ``⟨n⟩`` (images become ``![text⟨n⟩]``). Each distinct resolved URL gets
        one number, assigned in order of first appearance, and one line in the
        generated reference list.

        Args:
            markdown (str): Markdown text to process.
            base_url (str): When non-empty, URLs not starting with ``http://``,
                ``https://`` or ``mailto:`` are joined onto it.

        Returns:
            Tuple[str, str]: The rewritten markdown and the references section.
            The references section always starts with the "## References"
            heading, even when no links were found.
        """
        absolute_prefixes = ('http://', 'https://', 'mailto:')
        citations = {}   # resolved URL -> (citation number, description suffix)
        resolved = {}    # raw URL -> URL joined onto base_url
        chunks = []
        cursor = 0
        next_number = 1

        for m in LINK_PATTERN.finditer(markdown):
            # Copy through the untouched text between the previous link and this one.
            chunks.append(markdown[cursor:m.start()])
            cursor = m.end()
            text, url, title = m.groups()

            # Resolve relative URLs once per distinct raw URL.
            if base_url and not url.startswith(absolute_prefixes):
                if url not in resolved:
                    resolved[url] = fast_urljoin(base_url, url)
                url = resolved[url]

            if url not in citations:
                # The description comes from the first occurrence only:
                # title first, then the link text if it differs from the title.
                labels = []
                if title:
                    labels.append(title)
                if text and text != title:
                    labels.append(text)
                suffix = (": " + " - ".join(labels)) if labels else ""
                citations[url] = (next_number, suffix)
                next_number += 1

            number = citations[url][0]
            if m.group(0).startswith('!'):
                chunks.append(f"![{text}⟨{number}⟩]")
            else:
                chunks.append(f"{text}⟨{number}⟩")

        # Trailing text after the last link (or the whole input if none matched).
        chunks.append(markdown[cursor:])

        ordered = sorted(citations.items(), key=lambda item: item[1][0])
        reference_lines = [
            f"⟨{number}⟩ {url}{suffix}\n"
            for url, (number, suffix) in ordered
        ]
        references_markdown = "\n\n## References\n\n" + ''.join(reference_lines)

        return ''.join(chunks), references_markdown

    def generate_markdown(self,
                         cleaned_html: str,
                         base_url: str = "",
                         html2text_options: Optional[Dict[str, Any]] = None,
                         options: Optional[Dict[str, Any]] = None,
                         content_filter: Optional[RelevantContentFilter] = None,
                         citations: bool = True,
                         **kwargs) -> MarkdownGenerationResult:
        """
        Convert cleaned HTML to markdown, with optional citations and fit markdown.

        Failures are never raised to the caller: each stage catches its own
        exceptions and stores an error message in the corresponding output
        field, and any other exception yields a result whose raw markdown and
        citation markdown both hold the error message.

        Args:
            cleaned_html (str): Cleaned HTML content. Falsy values become ``""``;
                non-string values are passed through ``str()``.
            base_url (str): Base URL given to the converter and used for URL joins.
            html2text_options (Optional[Dict[str, Any]]): Converter overrides (highest priority).
            options (Optional[Dict[str, Any]]): Overrides used only when
                ``html2text_options`` is empty.
            content_filter (Optional[RelevantContentFilter]): Filter for fit markdown;
                falls back to the instance's filter.
            citations (bool): Whether to convert links to citations.

        Returns:
            MarkdownGenerationResult: Raw markdown, markdown with citations,
            references markdown, fit markdown and fit HTML.
        """
        try:
            converter = CustomHTML2Text(baseurl=base_url)
            settings = dict(
                body_width=0,  # Disable text wrapping
                ignore_emphasis=False,
                ignore_links=False,
                ignore_images=False,
                protect_links=True,
                single_line_break=True,
                mark_code=True,
                escape_snob=False,
            )

            # Only the first non-empty override source is applied:
            # call-level html2text_options, then call-level options, then instance options.
            overrides = html2text_options or options or self.options
            if overrides:
                settings.update(overrides)

            converter.update_params(**settings)

            # Normalise the input to a string.
            if not cleaned_html:
                cleaned_html = ""
            elif isinstance(cleaned_html, str):
                pass
            else:
                cleaned_html = str(cleaned_html)

            # Stage 1: raw markdown.
            try:
                raw_markdown = converter.handle(cleaned_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {e!s}"

            raw_markdown = raw_markdown.replace('    ```', '```')

            # Stage 2: citations (defaults apply when citations are disabled).
            markdown_with_citations: str = raw_markdown
            references_markdown: str = ""
            if citations:
                try:
                    markdown_with_citations, references_markdown = self.convert_links_to_citations(
                        raw_markdown, base_url
                    )
                except Exception as e:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {e!s}"

            # Stage 3: fit markdown, only when some content filter is available.
            fit_markdown: Optional[str] = ""
            filtered_html: Optional[str] = ""
            active_filter = content_filter or self.content_filter
            if active_filter:
                try:
                    content_filter = active_filter
                    fragments = content_filter.filter_content(cleaned_html)
                    filtered_html = fragments
                    # Wrap each returned fragment in its own <div> before converting.
                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in fragments)
                    fit_markdown = converter.handle(filtered_html)
                except Exception as e:
                    fit_markdown = f"Error generating fit markdown: {e!s}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as e:
            # Last-resort fallback: report the error through the markdown fields.
            error_msg = f"Error in markdown generation: {e!s}"
            return MarkdownGenerationResult(
                raw_markdown=error_msg,
                markdown_with_citations=error_msg,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )