import os
import re
import sys
import pytest
import json
from bs4 import BeautifulSoup
import asyncio
# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler

# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content

# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content

# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()

# Add this test to your existing test file
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
    first_commit = ""
    async def on_execution_started(page):
        nonlocal first_commit 
        try:
            # Poll the first commit's h4 text ('li.Box-sc-g0xbh4-0 h4') until it differs
            # from the previously seen commit, i.e. the next page of results has rendered
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")


    async with AsyncWebCrawler(verbose=True) as crawler:
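        # Register the hook so each page's HTML is only captured after the visible
        # commit list has actually changed (see on_execution_started above)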
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # URL is passed every time; with js_only=True later calls reuse the live session instead of re-navigating
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")
            # Record the first commit's h4 text so the hook can detect when the next page has loaded
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r'\s+', '', first_commit)
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

        # Assertions
        assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"
        
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")                      

# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])