Spaces:
Sleeping
Sleeping
Update scraper.py
Browse files
- scraper.py +13 -11
scraper.py
CHANGED
@@ -4,6 +4,8 @@ import asyncio
|
|
4 |
from playwright.async_api import async_playwright
|
5 |
from bs4 import BeautifulSoup
|
6 |
import requests
|
|
|
|
|
7 |
|
8 |
|
9 |
|
@@ -13,26 +15,26 @@ class Scraper:
|
|
13 |
async with async_playwright() as p:
|
14 |
browser = await p.chromium.launch(headless=True)
|
15 |
page = await browser.new_page()
|
|
|
|
|
|
|
|
|
16 |
await page.goto(url)
|
17 |
-
|
18 |
# Get the title
|
19 |
#title = await page.title()
|
20 |
-
|
21 |
# Get all links
|
22 |
page_url = await page.evaluate("""() => {
|
23 |
return Array.from(document.querySelectorAll('a')).map(a => a.href);
|
24 |
}""")
|
25 |
-
|
26 |
-
# Get page content (paragraphs)
|
27 |
page_content = await page.evaluate("""() => {
|
28 |
-
|
29 |
-
|
30 |
}""")
|
31 |
-
|
32 |
-
# Print the results
|
33 |
-
# print(f"Title: {title}")
|
34 |
-
# print(f"Links: {links}")
|
35 |
-
# print(f"Content: {content}")
|
36 |
|
37 |
await browser.close()
|
38 |
return page_url, page_content
|
|
|
4 |
from playwright.async_api import async_playwright
|
5 |
from bs4 import BeautifulSoup
|
6 |
import requests
|
7 |
+
import time
|
8 |
+
|
9 |
|
10 |
|
11 |
|
|
|
15 |
async with async_playwright() as p:
|
16 |
browser = await p.chromium.launch(headless=True)
|
17 |
page = await browser.new_page()
|
18 |
+
|
19 |
+
# Route to block images, videos, and CSS
|
20 |
+
await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet"] else route.continue_())
|
21 |
+
|
22 |
await page.goto(url)
|
23 |
+
|
24 |
# Get the title
|
25 |
#title = await page.title()
|
26 |
+
|
27 |
# Get all links
|
28 |
page_url = await page.evaluate("""() => {
|
29 |
return Array.from(document.querySelectorAll('a')).map(a => a.href);
|
30 |
}""")
|
31 |
+
|
32 |
+
# Get page content (innerText of every element under <body>, joined with newlines)
|
33 |
page_content = await page.evaluate("""() => {
|
34 |
+
let elements = Array.from(document.querySelectorAll('body *'));
|
35 |
+
return elements.map(element => element.innerText).join('\\n');
|
36 |
}""")
|
37 |
+
|
|
|
|
|
|
|
|
|
38 |
|
39 |
await browser.close()
|
40 |
return page_url, page_content
|