Update scraper.py
scraper.py CHANGED (+7 -7)
@@ -6,9 +6,6 @@ from bs4 import BeautifulSoup
 import requests
 import time
 
-
-
-
 class Scraper:
     @staticmethod
     async def power_scrapper_2(url):
@@ -32,7 +29,10 @@ class Scraper:
             # Get page content (text from paragraphs and headers)
             page_content = await page.evaluate("""() => {
                 let elements = Array.from(document.querySelectorAll('body *'));
-                return elements
+                return elements
+                    .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
+                    .map(element => element.innerText.trim())
+                    .join('\\n');
             }""")
 
 
@@ -99,7 +99,7 @@ class Scraper:
     async def scrape(url):
         try:
             headers = {'User-Agent': 'Mozilla/5.0'}
-            response = requests.get(url,timeout=
+            response = requests.get(url,timeout=3)
             soup = BeautifulSoup(response.content, 'html.parser')
 
             title = Scraper.get_title(soup)
@@ -109,8 +109,8 @@ class Scraper:
             if not links:
                 print("Running alternative scrapper")
                 links, text_content = await Scraper.power_scrapper_2(url)
+
             return {"title": title, "URL": links, "Content": text_content}
         except:
-            print("Running alternative scrapper second time")
             title,links, text_content = await Scraper.power_scrapper_2(url)
-            return {"title": title, "URL": links, "Content": text_content}
+            return {"title": title, "URL": links, "Content": text_content}
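
Taken together, the last two hunks give scrape() a two-tier flow: a cheap requests fetch with a 3-second timeout first, then the headless power_scrapper_2 fallback when the fetch fails or yields no links. A runnable sketch of that flow follows; the get_title-style helpers and power_scrapper_2 are not shown in the diff, so the stand-ins here are hypothetical:

    # Sketch of the fallback flow this commit implies; the helpers are
    # illustrative stand-ins, not the Space's own implementations.
    import requests
    from bs4 import BeautifulSoup

    async def power_scrapper_2(url):
        # Hypothetical stub for the Playwright-based fallback sketched
        # above; returns (links, text_content) for illustration.
        return [], ""

    async def scrape(url):
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}  # defined but, as committed, never passed
            # timeout=3 (the value this commit pins) keeps the cheap path
            # from hanging before the headless fallback gets its turn.
            response = requests.get(url, timeout=3)
            soup = BeautifulSoup(response.content, 'html.parser')
            title = soup.title.string if soup.title else ""  # stand-in for Scraper.get_title
            links = [a['href'] for a in soup.find_all('a', href=True)]
            text_content = soup.get_text(separator='\n', strip=True)
            if not links:
                print("Running alternative scrapper")
                links, text_content = await power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}
        except Exception:
            # Any failure above (timeout, connection error, parse error)
            # drops straight to the headless fallback. The committed file
            # unpacks three values here; the two-value stub keeps this
            # sketch self-consistent, so the title is left empty.
            links, text_content = await power_scrapper_2(url)
            return {"title": "", "URL": links, "Content": text_content}

Two details in the committed file itself: the new requests.get(url,timeout=3) call never passes the headers dict defined just above it, and the two power_scrapper_2 call sites unpack different arities (two values inside the try block, three in the except block), so one of the two will raise a ValueError unless the helper varies its return shape.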