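"""Edge-case tests for crawl4ai's AsyncWebCrawler (tests/async/test_edge_cases.py)."""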
import os
import re
import sys
import pytest
import json
from bs4 import BeautifulSoup
import asyncio

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler

# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content

# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content

# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()


# Add this test to your existing test file
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
    first_commit = ""
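
    # Registered below as the 'on_execution_started' hook: after each JS-driven
    # page change it polls the first commit's h4 text until it differs from the
    # previously recorded value, i.e. until the next page of commits has rendered.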
    async def on_execution_started(page):
        nonlocal first_commit
        try:
            # Wait until the first commit's h4 text ('li.Box-sc-g0xbh4-0 h4')
            # differs from the previously recorded first commit.
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """
        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # The URL is only navigated on the first request
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")

            # Record the first commit's h4 text so the hook can detect when the
            # next page has actually loaded.
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r'\s+', '', first_commit)

            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

        # Assertions
        assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
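
# A typical way to run only this test from the repository root (assumes pytest
# and the Playwright browsers used by crawl4ai are installed):
#   pytest tests/async/test_edge_cases.py::test_typescript_commits_multi_page -v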