import os
import re
import sys
import pytest
import json
from bs4 import BeautifulSoup
import asyncio
# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from crawl4ai.async_webcrawler import AsyncWebCrawler
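
# These tests exercise AsyncWebCrawler against live sites; the active test below
# drives session-based, multi-page crawling of the TypeScript commit history on GitHub.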
# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content

# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content

# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()
# Add this test to your existing test file
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
    first_commit = ""

    async def on_execution_started(page):
        nonlocal first_commit
        try:
            # Poll the first commit's <h4> text (document.querySelector('li.Box-sc-g0xbh4-0 h4'))
            # until it differs from the previously recorded first commit, i.e. new content has loaded
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # The URL only matters for the first page; later pages reuse the session
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0  # Use js_only for subsequent pages
            )
            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")

            # Record the first commit's <h4> text so the hook can detect when the next page has loaded
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r'\s+', '', first_commit)

            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

        # Assertions
        assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])