Spaces:
Building
Building
File size: 3,233 Bytes
03c0888 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import os
import sys
import pytest
import asyncio
import json
# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from crawl4ai.async_webcrawler import AsyncWebCrawler
@pytest.mark.asyncio
async def test_extract_markdown():
    """Crawl the NBC News business page and check the markdown output.

    The test passes when the crawl reports success and ``result.markdown``
    is a non-empty ``str``.
    """
    target_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        # bypass_cache=True is passed so the crawler is asked not to serve
        # this request from its cache.
        result = await crawler.arun(url=target_url, bypass_cache=True)

        assert result.success
        markdown = result.markdown
        assert markdown
        assert isinstance(markdown, str)
        assert len(markdown) > 0
@pytest.mark.asyncio
async def test_extract_cleaned_html():
    """Crawl the NBC News business page and check the cleaned HTML output.

    The test passes when the crawl reports success and
    ``result.cleaned_html`` is a non-empty ``str``.
    """
    target_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=target_url, bypass_cache=True)

        assert result.success
        cleaned = result.cleaned_html
        assert cleaned
        assert isinstance(cleaned, str)
        assert len(cleaned) > 0
@pytest.mark.asyncio
async def test_extract_media():
    """Crawl the NBC News business page and check the extracted media.

    Expects ``result.media`` to be a non-empty dict with an ``"images"``
    list, where every image entry contains the keys ``"src"``, ``"alt"``
    and ``"type"``.
    """
    target_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=target_url, bypass_cache=True)

        assert result.success
        assert result.media

        media = result.media
        assert isinstance(media, dict)
        assert "images" in media

        images = media["images"]
        assert isinstance(images, list)

        # Each image entry must expose these keys; checked in this order.
        required_keys = ("src", "alt", "type")
        for image in images:
            for key in required_keys:
                assert key in image
@pytest.mark.asyncio
async def test_extract_links():
    """Crawl the NBC News business page and check the extracted links.

    Expects ``result.links`` to be a non-empty dict with ``"internal"`` and
    ``"external"`` lists, where every link entry contains the keys
    ``"href"`` and ``"text"``.
    """
    target_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=target_url, bypass_cache=True)

        assert result.success
        assert result.links

        links = result.links
        assert isinstance(links, dict)
        assert "internal" in links
        assert "external" in links

        internal = links["internal"]
        external = links["external"]
        assert isinstance(internal, list)
        assert isinstance(external, list)

        # Internal links are validated first, then external ones.
        for link in [*internal, *external]:
            assert "href" in link
            assert "text" in link
@pytest.mark.asyncio
async def test_extract_metadata():
    """Crawl the NBC News business page and check the extracted metadata.

    Expects ``result.metadata`` to be a non-empty dict whose ``"title"``
    entry is a ``str``.
    """
    target_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=target_url, bypass_cache=True)

        assert result.success
        assert result.metadata

        metadata = result.metadata
        assert isinstance(metadata, dict)
        assert "title" in metadata

        title = metadata["title"]
        assert isinstance(title, str)
@pytest.mark.asyncio
async def test_css_selector_extraction():
    """Crawl with a heading-only CSS selector and check the markdown.

    The crawl is restricted with ``css_selector="h1, h2, h3"``; the test
    then expects the markdown to contain the markers ``#``, ``##`` and
    ``###``.
    """
    target_url = "https://www.nbcnews.com/business"
    heading_selector = "h1, h2, h3"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=target_url,
            bypass_cache=True,
            css_selector=heading_selector,
        )

        assert result.success
        markdown = result.markdown
        assert markdown

        # NOTE(review): these are plain substring checks, so any text that
        # contains "###" also satisfies the "#" and "##" checks; the test
        # does not verify that each heading level appears on its own line.
        for marker in ("#", "##", "###"):
            assert marker in markdown
# Entry point for debugging: lets this file be executed directly instead of
# through the pytest command line. Runs only this module, in verbose mode.
if __name__ == "__main__":
    pytest_args = [__file__, "-v"]
    pytest.main(pytest_args)