import os
import re
import sys
import pytest
import json
from bs4 import BeautifulSoup
import asyncio

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler
# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content

# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content

# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()
# Add this test to your existing test file
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
    first_commit = ""

    async def on_execution_started(page):
        nonlocal first_commit
        try:
            # Check whether the page's first commit <h4> text differs from the last
            # commit we recorded (use document.querySelector('li.Box-sc-g0xbh4-0 h4'))
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # Only use URL for the first page
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")
            # Record the first commit's <h4> text so the hook can detect when the next page has loaded
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r'\s+', '', first_commit)
            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

        # Assertions
        assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])