File size: 6,117 Bytes
03c0888
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# ## Issue #236
# - **Last Updated:** 2024-11-11 01:42:14
# - **Title:** [user data crawling opens two windows, unable to control correct user browser](https://github.com/unclecode/crawl4ai/issues/236)
# - **State:** open

# Standard-library modules used by the path setup below and by the tests.
import os, sys, time
# Append the directory one level above this file's directory to sys.path —
# presumably so `crawl4ai` can be imported from a source checkout without
# being installed (confirm against the repository layout).
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
# Directory containing this file (a relative __file__ is resolved against the
# current working directory).
# NOTE(review): the same value is assigned again after the imports below.
__location__ = os.path.realpath(    os.path.join(os.getcwd(), os.path.dirname(__file__)))
# NOTE(review): `os` and `time` are already imported above, and `asyncio` is
# not referenced anywhere in the visible code.
import asyncio
import os
import time
from typing import Dict, Any
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Directory containing this file; the tests build their data/ and output/
# paths relative to it.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

def print_test_result(name: str, result: Dict[str, Any], execution_time: float):
    """Print a banner and the timing for a test, then save its text outputs.

    Args:
        name: Human-readable test name. It is shown in the banner and, in
            lower case, used as the prefix of every output file name.
        result: Mapping of output label to content. Only ``str`` values are
            saved; each one is written to
            ``<__location__>/output/<name.lower()>_<key>.md``.
        execution_time: Elapsed time in seconds, printed with four decimals.

    Raises:
        OSError: If the output directory or one of the files cannot be
            created or written.
    """
    print(f"\n{'='*20} {name} {'='*20}")
    print(f"Execution time: {execution_time:.4f} seconds")

    text_outputs = {
        key: content for key, content in result.items() if isinstance(content, str)
    }
    if not text_outputs:
        # Nothing to save, so leave the filesystem untouched.
        return

    # Create the output directory on demand; without this the first write
    # fails with FileNotFoundError when the directory does not exist yet.
    output_dir = os.path.join(__location__, "output")
    os.makedirs(output_dir, exist_ok=True)

    for key, content in text_outputs.items():
        output_path = os.path.join(output_dir, f"{name.lower()}_{key}.md")
        # Write explicitly as UTF-8: the generated markdown contains non-ASCII
        # characters (e.g. the citation brackets) that a platform default
        # encoding may be unable to encode.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)

def test_basic_markdown_conversion():
    """Convert the Wikipedia HTML fixture and check the three markdown outputs.

    Reads ``data/wikipedia.html`` from this file's directory, runs
    ``DefaultMarkdownGenerator.generate_markdown`` on it with a Wikipedia base
    URL, hands the outputs and the elapsed time to ``print_test_result``, and
    then asserts that the raw markdown, the markdown with citations and the
    references are all non-empty, that citations use ``⟨`` brackets, and that
    the references contain a ``## References`` heading.
    """
    # Read the fixture explicitly as UTF-8 (presumably its encoding, as for any
    # saved Wikipedia page) instead of relying on the platform's default
    # encoding, which can fail or mis-decode on non-UTF-8 locales.
    fixture_path = os.path.join(__location__, "data", "wikipedia.html")
    with open(fixture_path, "r", encoding="utf-8") as f:
        cleaned_html = f.read()

    generator = DefaultMarkdownGenerator()

    # Time only the conversion itself, not the fixture read or the file output.
    start_time = time.perf_counter()
    result = generator.generate_markdown(
        cleaned_html=cleaned_html,
        base_url="https://en.wikipedia.org"
    )
    execution_time = time.perf_counter() - start_time

    print_test_result("Basic Markdown Conversion", {
        'raw': result.raw_markdown,
        'with_citations': result.markdown_with_citations,
        'references': result.references_markdown
    }, execution_time)

    # Basic assertions
    assert result.raw_markdown, "Raw markdown should not be empty"
    assert result.markdown_with_citations, "Markdown with citations should not be empty"
    assert result.references_markdown, "References should not be empty"
    assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets"
    assert "## References" in result.references_markdown, "Should contain references section"

def test_relative_links():
    """Check that link targets show up as absolute URLs in the references.

    Feeds a snippet with relative and absolute links to the generator using
    ``https://en.wikipedia.org`` as the base URL and asserts that each
    expected absolute URL appears in the references markdown.
    """
    source_text = """
    Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com).
    Also an [image](/images/test.png) and another [page](/wiki/Banana).
    """

    outcome = DefaultMarkdownGenerator().generate_markdown(
        cleaned_html=source_text,
        base_url="https://en.wikipedia.org",
    )

    expected_urls = (
        "https://en.wikipedia.org/wiki/Apple",
        "https://example.com",
        "https://en.wikipedia.org/images/test.png",
    )
    for url in expected_urls:
        assert url in outcome.references_markdown

def test_duplicate_links():
    """Check that a repeated link target is cited with one shared number.

    The snippet links to ``/test`` twice and to ``/other`` once; the test
    asserts that the ``⟨1⟩`` marker occurs exactly twice in the markdown
    with citations.
    """
    source_text = """
    Here's a [link](/test) and another [link](/test) and a [different link](/other).
    """

    converter = DefaultMarkdownGenerator()
    outcome = converter.generate_markdown(
        base_url="https://example.com",
        cleaned_html=source_text,
    )

    # Both occurrences of the same target should carry the first citation marker.
    assert outcome.markdown_with_citations.count("⟨1⟩") == 2, "Same link should use same citation number"

def test_link_descriptions():
    """Check that link titles and link texts are carried into the references.

    Uses one link with an explicit title and one without, then asserts that
    the title of the first and the text of the second both appear in the
    references markdown.
    """
    source_text = """
    Here's a [link with title](/test "Test Title") and a [link with description](/other) to test.
    """

    outcome = DefaultMarkdownGenerator().generate_markdown(
        base_url="https://example.com",
        cleaned_html=source_text,
    )

    assert "Test Title" in outcome.references_markdown, "Link title should be in references"
    assert "link with description" in outcome.references_markdown, "Link text should be in references"

def test_performance_large_document():
    """Time repeated markdown generation on the large Wikipedia fixture.

    Reads ``data/wikipedia.md`` from this file's directory, runs
    ``generate_markdown`` on it five times with one generator instance, and
    prints the average, minimum and maximum wall-clock time. The function
    makes no assertions; it only reports timings.
    """
    # Read the fixture explicitly as UTF-8 (presumably its encoding) rather
    # than relying on the platform's default encoding.
    fixture_path = os.path.join(__location__, "data", "wikipedia.md")
    with open(fixture_path, "r", encoding="utf-8") as f:
        markdown = f.read()

    # Test with multiple iterations
    iterations = 5
    times = []

    generator = DefaultMarkdownGenerator()

    for _ in range(iterations):
        start_time = time.perf_counter()
        # The return value is intentionally discarded; only the duration matters.
        generator.generate_markdown(
            cleaned_html=markdown,
            base_url="https://en.wikipedia.org"
        )
        times.append(time.perf_counter() - start_time)

    avg_time = sum(times) / len(times)
    print(f"\n{'='*20} Performance Test {'='*20}")
    print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds")
    print(f"Min time: {min(times):.4f} seconds")
    print(f"Max time: {max(times):.4f} seconds")

def test_image_links():
    """Check image handling in the cited markdown and in the references.

    The snippet holds two images (one with a title) and a regular link; the
    test asserts that ``![`` still appears in the markdown with citations and
    that the image title appears in the references markdown.
    """
    source_text = """
    Here's an ![image](/image.png "Image Title") and another ![image](/other.jpg).
    And a regular [link](/page).
    """

    outcome = DefaultMarkdownGenerator().generate_markdown(
        base_url="https://example.com",
        cleaned_html=source_text,
    )

    assert "![" in outcome.markdown_with_citations, "Image markdown syntax should be preserved"
    assert "Image Title" in outcome.references_markdown, "Image title should be in references"

if __name__ == "__main__":
    # Script entry point: announce the run, then execute every test in the
    # order listed; the first failing assertion aborts the run.
    print("Running markdown generation strategy tests...")

    for run_test in (
        test_basic_markdown_conversion,
        test_relative_links,
        test_duplicate_links,
        test_link_descriptions,
        test_performance_large_document,
        test_image_links,
    ):
        run_test()