Create user_guide_sync.py
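Adds a small scraper that requests the StoreMate docs index at https://help.storemate.cloud/docs/, collects every link that points back into the docs, then fetches each article and writes its title and body text (plus a "more detail link" back to the source page) to a local .txt file.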
user_guide_sync.py  ADDED  (+48 -0)
@@ -0,0 +1,48 @@
import requests
from bs4 import BeautifulSoup

# Base URL of the documentation index to scrape
url = 'https://help.storemate.cloud/docs/'


def get_web_data(valid_links):
    for link in valid_links:
        # Send a GET request to each article URL
        response = requests.get(link)

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # The article title is the first <h1> on the page
        title = soup.find('h1').get_text()

        # The article body is the first <div> following the <h1>
        section = soup.find('h1').find_next('div')

        # Extract the text content from the section and append the source link
        section_text = section.get_text().strip()
        section_text = section_text + f"\nmore detail link : {link}"

        # Write one text file per article, named after its title;
        # the context manager ensures the file is flushed and closed
        with open(f"{title}.txt", "w") as file:
            file.write(f"{title}\n{section_text}")


def get_base_links():
    # Send a GET request to the docs index page
    response = requests.get(url)

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all <a> tags with href attributes
    links = soup.find_all('a', href=True)

    # Keep only links whose href points back into the docs site
    valid_links = []
    for link in links:
        if url in link['href']:
            valid_links.append(link['href'])

    get_web_data(valid_links)

    return "data updated"
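For context, a minimal way to run the sync end to end might look like the sketch below. The __main__ guard is an assumption: the original file defines no entry point, so the Space presumably calls get_base_links() from elsewhere.

# Hypothetical entry point (not part of the original file):
# crawl the docs index, then write one .txt file per article.
if __name__ == "__main__":
    status = get_base_links()
    print(status)  # prints "data updated" once all articles are saved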