# Spaces: Runtime error
# ## Issue #236
# - **Last Updated:** 2024-11-11 01:42:14
# - **Title:** [user data crawling opens two windows, unable to control correct user browser](https://github.com/unclecode/crawl4ai/issues/236)
# - **State:** open
import asyncio
import os
import sys
import time
from typing import Any, Dict

# Put the parent of this file's directory on sys.path. This has to run before
# the crawl4ai import below (presumably so the in-repo package is importable
# when the file is executed directly -- confirm against the repo layout).
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator  # noqa: E402

# Directory containing this file; the data fixtures and the output folder are
# resolved relative to it.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
def print_test_result(name: str, result: Dict[str, Any], execution_time: float):
    """Print a banner and timing for a test and save its string results to disk.

    Each string value in ``result`` is written to
    ``<__location__>/output/<name.lower()>_<key>.md``. Non-string values are
    skipped.

    Args:
        name: Test name shown in the banner and, lower-cased, used as the
            file-name prefix.
        result: Mapping of result label to content.
        execution_time: Elapsed time in seconds, printed with four decimals.
    """
    print(f"\n{'='*20} {name} {'='*20}")
    print(f"Execution time: {execution_time:.4f} seconds")

    # Create the output directory up front so the first write does not fail
    # with FileNotFoundError when the folder is missing.
    output_dir = os.path.join(__location__, "output")
    os.makedirs(output_dir, exist_ok=True)

    # Save markdown to files
    for key, content in result.items():
        if isinstance(content, str):
            path = os.path.join(output_dir, f"{name.lower()}_{key}.md")
            # Explicit UTF-8: the generated markdown can contain non-ASCII
            # characters (e.g. the citation brackets), which the platform
            # default encoding may not be able to represent.
            with open(path, "w", encoding="utf-8") as f:
                f.write(content)
# # Print first few lines of each markdown version | |
# for key, content in result.items(): | |
# if isinstance(content, str): | |
# preview = '\n'.join(content.split('\n')[:3]) | |
# print(f"\n{key} (first 3 lines):") | |
# print(preview) | |
# print(f"Total length: {len(content)} characters") | |
def test_basic_markdown_conversion():
    """Test basic markdown conversion with links.

    Reads the ``data/wikipedia.html`` fixture, runs it through
    ``DefaultMarkdownGenerator.generate_markdown``, saves the three outputs
    via ``print_test_result`` and checks that none of them is empty and that
    the citation and references markers are present.
    """
    # Read the fixture as UTF-8 explicitly so the test does not depend on the
    # platform's default encoding.
    with open(__location__ + "/data/wikipedia.html", "r", encoding="utf-8") as f:
        cleaned_html = f.read()

    generator = DefaultMarkdownGenerator()

    # Time only the conversion itself, not the fixture read or the file output.
    start_time = time.perf_counter()
    result = generator.generate_markdown(
        cleaned_html=cleaned_html,
        base_url="https://en.wikipedia.org",
    )
    execution_time = time.perf_counter() - start_time

    print_test_result(
        "Basic Markdown Conversion",
        {
            'raw': result.raw_markdown,
            'with_citations': result.markdown_with_citations,
            'references': result.references_markdown,
        },
        execution_time,
    )

    # Basic assertions
    assert result.raw_markdown, "Raw markdown should not be empty"
    assert result.markdown_with_citations, "Markdown with citations should not be empty"
    assert result.references_markdown, "References should not be empty"
    assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets"
    assert "## References" in result.references_markdown, "Should contain references section"
def test_relative_links():
    """Check that relative links are resolved against the base URL.

    The input mixes relative and absolute links; each expected absolute URL
    must appear in the generated references section.
    """
    source_text = (
        "\n"
        "Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com).\n"
        "Also an [image](/images/test.png) and another [page](/wiki/Banana).\n"
    )

    outcome = DefaultMarkdownGenerator().generate_markdown(
        cleaned_html=source_text,
        base_url="https://en.wikipedia.org",
    )
    references = outcome.references_markdown

    # Checked in this order; the first missing URL fails the test.
    expected_urls = (
        "https://en.wikipedia.org/wiki/Apple",
        "https://example.com",
        "https://en.wikipedia.org/images/test.png",
    )
    for expected_url in expected_urls:
        assert expected_url in references
def test_duplicate_links():
    """Check that a repeated link target reuses a single citation number.

    The input links to ``/test`` twice and to ``/other`` once, so the marker
    ``⟨1⟩`` is expected exactly twice in the cited markdown.
    """
    source_text = (
        "\n"
        "Here's a [link](/test) and another [link](/test) and a [different link](/other).\n"
    )

    outcome = DefaultMarkdownGenerator().generate_markdown(
        cleaned_html=source_text,
        base_url="https://example.com",
    )

    # Count how often the first citation marker occurs in the cited markdown.
    first_marker_count = outcome.markdown_with_citations.count("⟨1⟩")
    assert first_marker_count == 2, "Same link should use same citation number"
def test_link_descriptions():
    """Check that link titles and link texts are carried into the references.

    One link has an explicit title and the other has only its link text;
    both strings must appear in the references section.
    """
    source_text = (
        "\n"
        "Here's a [link with title](/test \"Test Title\") and a "
        "[link with description](/other) to test.\n"
    )

    outcome = DefaultMarkdownGenerator().generate_markdown(
        cleaned_html=source_text,
        base_url="https://example.com",
    )
    references = outcome.references_markdown

    assert "Test Title" in references, "Link title should be in references"
    assert "link with description" in references, "Link text should be in references"
def test_performance_large_document():
    """Test performance with large document.

    Reads the ``data/wikipedia.md`` fixture, converts it several times with
    one generator instance and prints the average, minimum and maximum
    wall-clock time. Nothing is asserted; the test only reports timings.
    """
    # Read the fixture as UTF-8 explicitly so the test does not depend on the
    # platform's default encoding.
    with open(__location__ + "/data/wikipedia.md", "r", encoding="utf-8") as f:
        markdown = f.read()

    # Test with multiple iterations
    iterations = 5
    times = []
    generator = DefaultMarkdownGenerator()

    for _ in range(iterations):
        start_time = time.perf_counter()
        # The return value is not needed; only the elapsed time is recorded.
        generator.generate_markdown(
            cleaned_html=markdown,
            base_url="https://en.wikipedia.org",
        )
        times.append(time.perf_counter() - start_time)

    avg_time = sum(times) / len(times)
    print(f"\n{'='*20} Performance Test {'='*20}")
    print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds")
    print(f"Min time: {min(times):.4f} seconds")
    print(f"Max time: {max(times):.4f} seconds")
def test_image_links():
    """Test handling of image links.

    The input contains two images (one with a title) and one regular link.
    The image syntax must survive in the cited markdown and the image title
    must show up in the references section.
    """
    # The fixture needs real image markup, including one image titled
    # "Image Title"; the assertions below look for both "![" and that title.
    markdown = (
        "\n"
        "Here's an ![image](/image.png \"Image Title\") and another ![image](/other.png).\n"
        "And a regular [link](/page).\n"
    )

    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com",
    )

    assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved"
    assert "Image Title" in result.references_markdown, "Image title should be in references"
if __name__ == "__main__":
    print("Running markdown generation strategy tests...")
    # Run the tests one after another in a fixed order; an exception raised by
    # any of them stops the run at that point.
    for run_test in (
        test_basic_markdown_conversion,
        test_relative_links,
        test_duplicate_links,
        test_link_descriptions,
        test_performance_large_document,
        test_image_links,
    ):
        run_test()