Update main.py
main.py CHANGED
@@ -36,10 +36,11 @@ async def power_scrapper(url):
 
         # Extract all links
         links = await page.query_selector_all('a')
-
+        page_url = []
+        page_content = []
         for link in links:
             href = await link.get_attribute('href')
-            result.append(
+            result.append(href)
 
         # Extract all text content
         elements = await page.query_selector_all('body *')
@@ -47,10 +48,10 @@ async def power_scrapper(url):
         for element in elements:
             text_content = await element.text_content()
             if text_content and text_content.strip():
-
+                page_content.append(text_content.strip())
 
         await browser.close()
-        return
+        return page_url,page_content
 
 
 def get_links(soup):
@@ -89,6 +90,6 @@ async def get_data(url: str):
 
     if links==[]:
         print("running alternative scrapper")
-        links = await power_scrapper(url)
+        links,text_content = await power_scrapper(url)
 
-    return ({"title": title ,
+    return ({"title": title ,"URL":links,"Content":text_content})
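Review note: the patched power_scrapper introduces page_url and page_content and returns page_url, but the link loop still appends to result, which is neither initialized nor returned, so page_url would always come back empty (and the append itself would raise a NameError). A minimal corrected sketch, assuming Playwright's async API; the launch/goto lines are assumptions, since the diff only shows the extraction body:

from playwright.async_api import async_playwright

async def power_scrapper(url):
    async with async_playwright() as p:
        # Launch and navigation are assumptions; they fall outside the hunks shown.
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)

        # Extract all links
        links = await page.query_selector_all('a')
        page_url = []
        page_content = []
        for link in links:
            href = await link.get_attribute('href')
            page_url.append(href)  # the committed diff appends to `result` instead

        # Extract all visible text content
        elements = await page.query_selector_all('body *')
        for element in elements:
            text_content = await element.text_content()
            if text_content and text_content.strip():
                page_content.append(text_content.strip())

        await browser.close()
        return page_url, page_content

Appending to page_url rather than result is the one behavioral change in this sketch; it is what makes the new two-value return actually carry the scraped links to the call site in get_data.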
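Review note: the new unconditional return in get_data references text_content, which is only bound inside the links==[] fallback branch, so any page where the primary scraper does find links would raise a NameError. A hedged sketch of the function's tail with a defensive default; the default and the placeholder bindings are assumptions, not part of the commit:

async def get_data(url: str):
    # title and links come from the primary scraper earlier in the function
    # (elided in the diff; placeholders here only so the sketch runs standalone).
    title, links = "", []

    text_content = []  # defensive default; an assumption, not in the committed code
    if links == []:
        print("running alternative scrapper")
        links, text_content = await power_scrapper(url)

    return {"title": title, "URL": links, "Content": text_content}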