Spaces:
Sleeping
Sleeping
Update scraper.py
Browse files - scraper.py +19 -4
scraper.py
CHANGED
@@ -11,12 +11,27 @@ class Scraper:
|
|
11 |
async def power_scrapper_2(url):
|
12 |
async with async_playwright() as p:
|
13 |
browser = await p.chromium.launch(headless=True)
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
# Route to block images, videos, and CSS
|
17 |
-
await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet"
|
18 |
|
19 |
-
|
|
|
|
|
20 |
|
21 |
# Get the title
|
22 |
title = await page.title()
|
|
|
11 |
async def power_scrapper_2(url):
|
12 |
async with async_playwright() as p:
|
13 |
browser = await p.chromium.launch(headless=True)
|
14 |
+
# Create a new browser context with a realistic user-agent
|
15 |
+
context = await browser.new_context(
|
16 |
+
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
17 |
+
)
|
18 |
+
|
19 |
+
# Set extra headers sent with every request from this context (NOTE: these headers do not by themselves force HTTP/1.1)
|
20 |
+
await context.set_extra_http_headers({
|
21 |
+
"Accept-Language": "en-US,en;q=0.9",
|
22 |
+
"Upgrade-Insecure-Requests": "1",
|
23 |
+
"Connection": "keep-alive" # NOTE: this header alone does not force HTTP/1.1; the protocol is negotiated by the browser and server
|
24 |
+
})
|
25 |
+
|
26 |
+
# Open a new page
|
27 |
+
page = await context.new_page()
|
28 |
|
29 |
+
# Route to block images, videos, and CSS to speed up page load
|
30 |
+
await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet"] else route.continue_())
|
31 |
|
32 |
+
# Navigate to the page with a 60-second timeout, returning as soon as the DOMContentLoaded event fires
|
33 |
+
await page.goto(url, wait_until='domcontentloaded', timeout=60000)
|
34 |
+
# TODO: wait for a specific element (like the title or an H1 header) to ensure the page is loaded; no such wait is performed yet
|
35 |
|
36 |
# Get the title
|
37 |
title = await page.title()
|