Arafath10 committed on
Commit
ac10c70
1 Parent(s): bdb4bd8

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +19 -4
scraper.py CHANGED
@@ -11,12 +11,27 @@ class Scraper:
11
  async def power_scrapper_2(url):
12
  async with async_playwright() as p:
13
  browser = await p.chromium.launch(headless=True)
14
- page = await browser.new_page()
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- # Route to block images, videos, and CSS
17
- await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())
18
 
19
- await page.goto(url)
 
 
20
 
21
  # Get the title
22
  title = await page.title()
 
11
  async def power_scrapper_2(url):
12
  async with async_playwright() as p:
13
  browser = await p.chromium.launch(headless=True)
14
+ # Create a new browser context with a realistic user-agent
15
+ context = await browser.new_context(
16
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
17
+ )
18
+
19
+ # Set additional headers to force HTTP/1.1 and avoid detection
20
+ await context.set_extra_http_headers({
21
+ "Accept-Language": "en-US,en;q=0.9",
22
+ "Upgrade-Insecure-Requests": "1",
23
+ "Connection": "keep-alive" # Force HTTP/1.1 instead of HTTP/2
24
+ })
25
+
26
+ # Open a new page
27
+ page = await context.new_page()
28
 
29
+ # Route to block images, videos, and CSS to speed up page load
30
+ await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet"] else route.continue_())
31
 
32
+ # Navigate to the page with an extended timeout and alternate loading strategy
33
+ await page.goto(url, wait_until='domcontentloaded', timeout=60000)
34
+ # Wait for a specific element (like the title or an H1 header) to ensure the page is loaded
35
 
36
  # Get the title
37
  title = await page.title()