Update user_guide_sync.py
user_guide_sync.py CHANGED (+23 -20)
@@ -19,29 +19,32 @@ from llama_index import StorageContext, load_index_from_storage
 
 
 # URL of the page to scrape
-url = 'https://help.storemate.cloud/docs/'
+url = 'https://help.storemate.cloud/docs/reports/'
 
 
 def get_web_data(valid_links):
     for url in valid_links:
-        # Send a GET request to the URL
-        response = requests.get(url)
-
-        # Parse the page content with BeautifulSoup
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Find the title and section content
-        title = soup.find('h1').get_text()
-
-        # Find the section with the title "Renew Package Subscription"
-        section = soup.find('h1').find_next('div')
-        # Extract the text content from the section
-        section_text = section.get_text().strip()
-        section_text = section_text + f"\nmore detail link : {url}"
-
-        file = open(f"user_guide/{title}.txt","w")
-        file.write(f"{title}\n{section_text}")
-        file.close()
+        try:
+            # Send a GET request to the URL
+            response = requests.get(url)
+
+            # Parse the page content with BeautifulSoup
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Find the title and section content
+            title = soup.find('h1').get_text()
+
+            # Find the section with the title "Renew Package Subscription"
+            section = soup.find('h1').find_next('div')
+            # Extract the text content from the section
+            section_text = section.get_text().strip()
+            section_text = section_text + f"\nmore detail link : {url}"
+
+            file = open(f"user_guide/{title}.txt","w")
+            file.write(f"{title}\n{section_text}")
+            file.close()
+        except:
+            pass
     print("data collected")
 
 
@@ -58,7 +61,7 @@ def get_base_links():
     valid_links = []
     # Extract and print all the URLs
     for link in links:
-        if
+        if "https://help.storemate.cloud/docs/" in str(link):
             valid_links.append(link['href'])
 
     print("base links collected")
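
Note on the first hunk: the committed `except: pass` silently swallows every failure (network errors, pages without an `<h1>`, titles that are invalid as file names), and `open` without a context manager can leak the file handle if `write` raises. A minimal hardened sketch of the same routine, assuming the `requests` and `BeautifulSoup` imports already present in the file; the `timeout` value and the `safe_title` sanitization are illustrative additions, not part of this commit:

import re
import requests
from bs4 import BeautifulSoup

def get_web_data(valid_links):
    for url in valid_links:
        try:
            # Fail fast instead of hanging on an unresponsive server
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Title and the section that follows it, as in the original
            title = soup.find('h1').get_text()
            section = soup.find('h1').find_next('div')
            section_text = section.get_text().strip()
            section_text += f"\nmore detail link : {url}"

            # Illustrative sanitization: h1 text may contain characters
            # that are not valid in file names
            safe_title = re.sub(r'[\\/*?:"<>|]', '_', title).strip()

            # Context manager closes the file even if write() raises
            with open(f"user_guide/{safe_title}.txt", "w", encoding="utf-8") as f:
                f.write(f"{title}\n{section_text}")
        except (requests.RequestException, AttributeError) as err:
            # Report the skipped page instead of failing silently
            print(f"skipped {url}: {err}")
    print("data collected")

Catching only `requests.RequestException` and `AttributeError` keeps genuine bugs visible while still skipping pages that lack the expected structure.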
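
Note on the second hunk: `str(link)` serializes the entire `<a>` tag, so the new filter matches the docs URL anywhere in the tag's HTML (href, link text, or any other attribute), and the following `link['href']` raises `KeyError` for anchors that have no `href` at all. A self-contained sketch of a stricter variant; the sample `html` string is invented for illustration:

from bs4 import BeautifulSoup

# Invented sample markup for illustration
html = '''
<a href="https://help.storemate.cloud/docs/reports/">Reports</a>
<a href="https://help.storemate.cloud/pricing/">Pricing</a>
<a>no href</a>
'''
links = BeautifulSoup(html, 'html.parser').find_all('a')

valid_links = []
for link in links:
    # Test the href attribute itself rather than the tag's full HTML;
    # .get() avoids a KeyError on anchors without an href
    href = link.get('href', '')
    if href.startswith('https://help.storemate.cloud/docs/'):
        valid_links.append(href)

print(valid_links)  # ['https://help.storemate.cloud/docs/reports/']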