File size: 3,233 Bytes
03c0888
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import sys
import pytest
import asyncio
import json

# Append the directory one level above this file's own directory to
# sys.path, ahead of the crawl4ai import that follows.
_this_file = os.path.abspath(__file__)
parent_dir = os.path.dirname(os.path.dirname(_this_file))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler

@pytest.mark.asyncio
async def test_extract_markdown():
    """Crawl the NBC News business page and check the Markdown output.

    The crawl must report success, and ``result.markdown`` must be a
    non-empty ``str``.
    """
    page_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        # NOTE(review): bypass_cache=True presumably forces a fresh fetch;
        # confirm against AsyncWebCrawler.arun.
        result = await crawler.arun(url=page_url, bypass_cache=True)

        assert result.success
        # Truthiness first, then the concrete type, then an explicit length.
        assert result.markdown
        assert isinstance(result.markdown, str)
        assert len(result.markdown) >= 1

@pytest.mark.asyncio
async def test_extract_cleaned_html():
    """Crawl the NBC News business page and check the cleaned HTML output.

    The crawl must report success, and ``result.cleaned_html`` must be a
    non-empty ``str``.
    """
    page_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=page_url, bypass_cache=True)

        assert result.success
        # Truthiness first, then the concrete type, then an explicit length.
        assert result.cleaned_html
        assert isinstance(result.cleaned_html, str)
        assert len(result.cleaned_html) >= 1

@pytest.mark.asyncio
async def test_extract_media():
    """Crawl the NBC News business page and check the media payload.

    ``result.media`` must be a non-empty ``dict`` with an ``"images"``
    list, and every image entry must contain the keys ``"src"``,
    ``"alt"`` and ``"type"``.
    """
    page_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=page_url, bypass_cache=True)

        assert result.success
        assert result.media

        extracted = result.media
        assert isinstance(extracted, dict)
        assert "images" in extracted
        assert isinstance(extracted["images"], list)

        # Each image entry must expose all three keys, checked in this order.
        for entry in extracted["images"]:
            for required_key in ("src", "alt", "type"):
                assert required_key in entry

@pytest.mark.asyncio
async def test_extract_links():
    """Crawl the NBC News business page and check the extracted links.

    ``result.links`` must be a non-empty ``dict`` holding ``"internal"``
    and ``"external"`` lists, and every link entry in either list must
    contain the keys ``"href"`` and ``"text"``.
    """
    page_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=page_url, bypass_cache=True)

        assert result.success
        assert result.links

        found = result.links
        assert isinstance(found, dict)

        # Both groups must be present before their types are inspected.
        for group in ("internal", "external"):
            assert group in found
        for group in ("internal", "external"):
            assert isinstance(found[group], list)

        # Internal entries are checked first, then external ones.
        for entry in found["internal"] + found["external"]:
            for required_key in ("href", "text"):
                assert required_key in entry

@pytest.mark.asyncio
async def test_extract_metadata():
    """Crawl the NBC News business page and check the page metadata.

    ``result.metadata`` must be a non-empty ``dict`` containing a
    ``"title"`` entry whose value is a ``str``.
    """
    page_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=page_url, bypass_cache=True)

        assert result.success
        assert result.metadata

        info = result.metadata
        assert isinstance(info, dict)
        assert "title" in info

        page_title = info["title"]
        assert isinstance(page_title, str)

@pytest.mark.asyncio
async def test_css_selector_extraction():
    """Crawl with a heading-only CSS selector and check the Markdown.

    The crawl is run with ``css_selector="h1, h2, h3"``; the resulting
    Markdown must be non-empty and contain the substrings ``"#"``,
    ``"##"`` and ``"###"``.
    """
    page_url = "https://www.nbcnews.com/business"
    heading_selector = "h1, h2, h3"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=page_url,
            bypass_cache=True,
            css_selector=heading_selector,
        )

        assert result.success
        assert result.markdown

        # NOTE(review): these are plain substring checks, so a "###" match
        # already implies the "#" and "##" ones; nothing here verifies that
        # the markers start a line.
        for marker in ["#", "##", "###"]:
            assert marker in result.markdown

# Debugging entry point: when executed as a script, run pytest on this
# file only, in verbose mode.
if __name__ == "__main__":
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)