Arafath10 committed on
Commit
bdb4bd8
·
verified ·
1 Parent(s): ef24fee

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +7 -7
scraper.py CHANGED
@@ -6,9 +6,6 @@ from bs4 import BeautifulSoup
6
  import requests
7
  import time
8
 
9
-
10
-
11
-
12
  class Scraper:
13
  @staticmethod
14
  async def power_scrapper_2(url):
@@ -32,7 +29,10 @@ class Scraper:
32
  # Get page content (text from paragraphs and headers)
33
  page_content = await page.evaluate("""() => {
34
  let elements = Array.from(document.querySelectorAll('body *'));
35
- return elements.map(element => element.innerText).join('\\n');
 
 
 
36
  }""")
37
 
38
 
@@ -99,7 +99,7 @@ class Scraper:
99
  async def scrape(url):
100
  try:
101
  headers = {'User-Agent': 'Mozilla/5.0'}
102
- response = requests.get(url,timeout=5)
103
  soup = BeautifulSoup(response.content, 'html.parser')
104
 
105
  title = Scraper.get_title(soup)
@@ -109,8 +109,8 @@ class Scraper:
109
  if not links:
110
  print("Running alternative scrapper")
111
  links, text_content = await Scraper.power_scrapper_2(url)
 
112
  return {"title": title, "URL": links, "Content": text_content}
113
  except:
114
- print("Running alternative scrapper second time")
115
  title,links, text_content = await Scraper.power_scrapper_2(url)
116
- return {"title": title, "URL": links, "Content": text_content}
 
6
  import requests
7
  import time
8
 
 
 
 
9
  class Scraper:
10
  @staticmethod
11
  async def power_scrapper_2(url):
 
29
  # Get page content (text from paragraphs and headers)
30
  page_content = await page.evaluate("""() => {
31
  let elements = Array.from(document.querySelectorAll('body *'));
32
+ return elements
33
+ .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
34
+ .map(element => element.innerText.trim())
35
+ .join('\\n');
36
  }""")
37
 
38
 
 
99
  async def scrape(url):
100
  try:
101
  headers = {'User-Agent': 'Mozilla/5.0'}
102
+ response = requests.get(url,timeout=3)
103
  soup = BeautifulSoup(response.content, 'html.parser')
104
 
105
  title = Scraper.get_title(soup)
 
109
  if not links:
110
  print("Running alternative scrapper")
111
  links, text_content = await Scraper.power_scrapper_2(url)
112
+
113
  return {"title": title, "URL": links, "Content": text_content}
114
  except:
 
115
  title,links, text_content = await Scraper.power_scrapper_2(url)
116
+ return {"title": title, "URL": links, "Content": text_content}