Arafath10 committed on
Commit
706f4ae
·
verified ·
1 Parent(s): 599b9d7

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +13 -11
scraper.py CHANGED
@@ -4,6 +4,8 @@ import asyncio
4
  from playwright.async_api import async_playwright
5
  from bs4 import BeautifulSoup
6
  import requests
 
 
7
 
8
 
9
 
@@ -13,26 +15,26 @@ class Scraper:
13
  async with async_playwright() as p:
14
  browser = await p.chromium.launch(headless=True)
15
  page = await browser.new_page()
 
 
 
 
16
  await page.goto(url)
17
-
18
  # Get the title
19
  #title = await page.title()
20
-
21
  # Get all links
22
  page_url = await page.evaluate("""() => {
23
  return Array.from(document.querySelectorAll('a')).map(a => a.href);
24
  }""")
25
-
26
- # Get page content (paragraphs, headers)
27
  page_content = await page.evaluate("""() => {
28
- let elements = Array.from(document.querySelectorAll('body *'));
29
- return elements.map(element => element.innerText).join('\\n');
30
  }""")
31
-
32
- # Print the results
33
- # print(f"Title: {title}")
34
- # print(f"Links: {links}")
35
- # print(f"Content: {content}")
36
 
37
  await browser.close()
38
  return page_url, page_content
 
4
  from playwright.async_api import async_playwright
5
  from bs4 import BeautifulSoup
6
  import requests
7
+ import time
8
+
9
 
10
 
11
 
 
15
  async with async_playwright() as p:
16
  browser = await p.chromium.launch(headless=True)
17
  page = await browser.new_page()
18
+
19
+ # Route to block images, videos, and CSS
20
+ await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet"] else route.continue_())
21
+
22
  await page.goto(url)
23
+
24
  # Get the title
25
  #title = await page.title()
26
+
27
  # Get all links
28
  page_url = await page.evaluate("""() => {
29
  return Array.from(document.querySelectorAll('a')).map(a => a.href);
30
  }""")
31
+
32
+ # Get page content (text from paragraphs and headers)
33
  page_content = await page.evaluate("""() => {
34
+ let elements = Array.from(document.querySelectorAll('body *'));
35
+ return elements.map(element => element.innerText).join('\\n');
36
  }""")
37
+
 
 
 
 
38
 
39
  await browser.close()
40
  return page_url, page_content