webscrapper

Sleeping

Arafath10 committed on Oct 2, 2024

Commit

ac10c70

verified ·

1 Parent(s): bdb4bd8

Update scraper.py

Files changed (1) hide show

scraper.py CHANGED Viewed

@@ -11,12 +11,27 @@ class Scraper:
     async def power_scrapper_2(url):
         async with async_playwright() as p:
             browser = await p.chromium.launch(headless=True)
-            page = await browser.new_page()
-            # Route to block images, videos, and CSS
-            await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())
-            await page.goto(url)
             # Get the title
             title = await page.title()

     async def power_scrapper_2(url):
         async with async_playwright() as p:
             browser = await p.chromium.launch(headless=True)
+            # Create a new browser context with a realistic user-agent
+            context = await browser.new_context(
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+            )
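+            # Set additional browser-like headers to reduce bot detection (NOTE: a "Connection" header alone does not force HTTP/1.1; the protocol version is negotiated by the browser)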
+            await context.set_extra_http_headers({
+                "Accept-Language": "en-US,en;q=0.9",
+                "Upgrade-Insecure-Requests": "1",
+                "Connection": "keep-alive"  # Force HTTP/1.1 instead of HTTP/2
+            })
+            # Open a new page
+            page = await context.new_page()
+            # Route to block images, videos, and CSS to speed up page load
+            await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet"] else route.continue_())
+            # Navigate to the page with an extended timeout and alternate loading strategy
+            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
+            # Wait for a specific element (like the title or an H1 header) to ensure the page is loaded
             # Get the title
             title = await page.title()