import requests
from bs4 import BeautifulSoup
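# NOTE (editor's reading, not confirmed by the author): TOKEN_CUT_OFF is only
# referenced by the commented-out aggregation at the end of process_webpage;
# it appears intended to cap how many crawled page texts are joined into the
# final output.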
TOKEN_CUT_OFF = 2500
def process_webpage(url: str):
    # A set to keep track of visited pages
    visited_pages = set()
    text_list = []

    # A function to recursively get all child pages
    def get_child_pages(url):
        # Make a GET request to the page and get the HTML content
        response = requests.get(url)
        html_content = response.content
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")
        # Get all the text content from the relevant HTML tags
        text_content = ""
        for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
            for element in soup.find_all(tag):
                text_content += element.get_text() + " "
        # Add the page to the set of visited pages
        text_content = f"page {url} contains: " + text_content
        visited_pages.add(url)
        # Find all the child links and recursively get their text content
        for link in soup.find_all("a"):
            href = link.get("href")
            if href and href not in visited_pages and url in href:
                get_child_pages(href)
        text_list.append(text_content)

    # Get the text content of the landing page
    # get_child_pages(url)
    # Make a GET request to the page and get the HTML content
    response = requests.get(url)
    html_content = response.content
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # Get all the text content from the relevant HTML tags
    text_content = ""
    for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
        for element in soup.find_all(tag):
            text_content += element.get_text() + " "
    # # make main page as first item
    # text_list.reverse()
    # text_list_cut_off = text_list[:TOKEN_CUT_OFF]
    # page_content = "\n".join(text_list_cut_off)
    # # Print the text content of the landing page and all child pages
    # print(page_content)
    # return page_content
    print(text_content)
    return text_content
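
# A minimal hardening sketch (not part of the original script): requests.get()
# above is called with no timeout or status check, so a hung page can stall the
# crawl and an error page gets scraped like any other. A bounded fetch could
# look like the helper below; it is not wired into process_webpage.
def fetch_html_safely(url: str, timeout: float = 10.0) -> bytes:
    """Fetch a page with a bounded timeout, raising on HTTP error status."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content
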
if __name__ == '__main__':
    process_webpage(url="https://www.meet-drift.ai/")