curfox_chatbot / user_guide_sync.py
Arafath10's picture
Update user_guide_sync.py
cf0dcbd verified
raw
history blame
1.43 kB
import requests
from bs4 import BeautifulSoup
# Base URL of the documentation index page. get_base_links() downloads this
# page and keeps only the links that mention this URL.
url = 'https://help.storemate.cloud/docs/'
def get_web_data(valid_links):
    """Scrape each linked page and save its title and body under ``user_guide/``.

    For every URL in *valid_links* the page is downloaded, its first ``<h1>``
    is used as the title and the first ``<div>`` that follows the heading as
    the section body. The file ``user_guide/<title>.txt`` then receives the
    title, a newline, the stripped section text and a final
    ``more detail link : <url>`` line pointing back to the page.

    Pages that answer with an HTTP error status, or that have no ``<h1>`` or
    no ``<div>`` after it, are skipped so one bad page does not abort the
    remaining ones.

    Args:
        valid_links: Iterable of page URLs; each is passed unchanged to
            ``requests.get``.

    Returns:
        None. The result is the set of files written to disk.

    Raises:
        requests.RequestException: On network-level failures such as a
            connection error or a timeout.
        OSError: If the output directory or a file cannot be written.
    """
    import os  # local import so the block does not depend on file-level imports

    output_dir = "user_guide"
    # Make sure the target directory exists before the first write.
    os.makedirs(output_dir, exist_ok=True)

    for page_url in valid_links:
        # A timeout keeps one unresponsive page from hanging the whole sync.
        response = requests.get(page_url, timeout=30)
        if not response.ok:
            # Do not store the text of a 4xx/5xx error page as guide content.
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # The first <h1> is the page title; the body is the first <div> after it.
        heading = soup.find('h1')
        if heading is None:
            continue
        section = heading.find_next('div')
        if section is None:
            continue

        title = heading.get_text()
        section_text = section.get_text().strip()
        section_text = section_text + f"\nmore detail link : {page_url}"

        # A path separator inside the title would otherwise point the write
        # at a different (usually non-existent) directory.
        safe_name = title.replace("/", "_").replace("\\", "_")

        # The context manager closes the file even if write() fails; explicit
        # UTF-8 avoids platform-dependent encoding errors on non-ASCII text.
        with open(f"{output_dir}/{safe_name}.txt", "w", encoding="utf-8") as file:
            file.write(f"{title}\n{section_text}")
def get_base_links():
    """Collect the documentation links from the index page and scrape them.

    Downloads the page at the module-level ``url``, gathers every ``<a>`` tag
    that has an ``href``, keeps the hrefs that contain ``url`` (each one only
    once, in first-seen order) and passes them to ``get_web_data``.

    Returns:
        The status string ``"data updated"``.

    Raises:
        requests.RequestException: If the index page cannot be fetched,
            including an HTTP error status (via ``raise_for_status``).
    """
    # A timeout prevents an unresponsive server from blocking forever.
    response = requests.get(url, timeout=30)
    # Fail loudly rather than report "data updated" after parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Match on the href itself, not the serialized tag: an anchor whose text
    # or other attributes mention the docs URL may still point elsewhere.
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))

    # dict.fromkeys removes duplicates while preserving order, so a page that
    # is linked several times is fetched and written only once.
    valid_links = list(dict.fromkeys(href for href in hrefs if url in href))

    get_web_data(valid_links)
    return "data updated"
def update_user_guide():
    """Refresh the locally stored user-guide files from the online docs.

    Delegates to ``get_base_links()`` and discards its status string, so
    this function always returns ``None``.
    """
    get_base_links()
    return None