Arafath10 committed on
Commit
b928ab9
1 Parent(s): 9097392

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +33 -1
scraper.py CHANGED
@@ -8,6 +8,38 @@ import requests
8
 
9
 
10
  class Scraper:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  @staticmethod
12
  async def power_scrapper(url):
13
  async with async_playwright() as p:
@@ -76,6 +108,6 @@ class Scraper:
76
 
77
  if not links:
78
  print("Running alternative scrapper")
79
- links, text_content = await Scraper.power_scrapper(url)
80
 
81
  return {"title": title, "URL": links, "Content": text_content}
 
8
 
9
 
10
  class Scraper:
11
@staticmethod
async def power_scrapper_2(url):
    """Load ``url`` in headless Chromium and collect its links and text.

    Args:
        url: Address handed to ``page.goto``.

    Returns:
        A ``(page_url, page_content)`` tuple. ``page_url`` is the list of
        ``href`` values of every ``<a>`` element on the page.
        ``page_content`` is a list of ``{"tag": ..., "text": ...}`` dicts,
        one for each ``p`` and ``h1``-``h6`` element.

    Any error raised while navigating or evaluating the page propagates to
    the caller, but only after the browser has been closed.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            # Collect the href of every anchor on the page.
            page_url = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")

            # Collect paragraph and heading text, tagged with the element name.
            page_content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('p, h1, h2, h3, h4, h5, h6'));
                return elements.map(element => ({
                    tag: element.tagName,
                    text: element.innerText
                }));
            }""")
        finally:
            # Close the browser even if goto/evaluate raised, so a failed
            # scrape does not skip the explicit shutdown.
            await browser.close()
        return page_url, page_content
42
+
43
  @staticmethod
44
  async def power_scrapper(url):
45
  async with async_playwright() as p:
 
108
 
109
  if not links:
110
  print("Running alternative scrapper")
111
+ links, text_content = await Scraper.power_scrapper_2(url)
112
 
113
  return {"title": title, "URL": links, "Content": text_content}