Arafath10 committed (verified)
Commit 0f5f90c · Parent(s): c79d50f

Update user_guide_sync.py

Files changed (1): user_guide_sync.py (+23 -20)
user_guide_sync.py CHANGED
@@ -19,29 +19,32 @@ from llama_index import StorageContext, load_index_from_storage
 
 
 # URL of the page to scrape
-url = 'https://help.storemate.cloud/docs/'
+url = 'https://help.storemate.cloud/docs/reports/'
 
 
 def get_web_data(valid_links):
     for url in valid_links:
-        # Send a GET request to the URL
-        response = requests.get(url)
-
-        # Parse the page content with BeautifulSoup
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Find the title and section content
-        title = soup.find('h1').get_text()
-
-        # Find the section with the title "Renew Package Subscription"
-        section = soup.find('h1').find_next('div')
-        # Extract the text content from the section
-        section_text = section.get_text().strip()
-        section_text = section_text + f"\nmore detail link : {url}"
-
-        file = open(f"user_guide/{title}.txt","w")
-        file.write(f"{title}\n{section_text}")
-        file.close()
+        try:
+            # Send a GET request to the URL
+            response = requests.get(url)
+
+            # Parse the page content with BeautifulSoup
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Find the title and section content
+            title = soup.find('h1').get_text()
+
+            # Find the section with the title "Renew Package Subscription"
+            section = soup.find('h1').find_next('div')
+            # Extract the text content from the section
+            section_text = section.get_text().strip()
+            section_text = section_text + f"\nmore detail link : {url}"
+
+            file = open(f"user_guide/{title}.txt","w")
+            file.write(f"{title}\n{section_text}")
+            file.close()
+        except:
+            pass
     print("data collected")
 
 
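The first hunk wraps the body of get_web_data in a bare try/except: pass, so a page that fails to download or lacks an <h1> is skipped instead of aborting the whole sync (note that the loop variable url shadows the module-level url). Below is a minimal sketch of the same loop with narrower handling; the timeout, raise_for_status(), and with open(...) context manager are hardening suggestions, not part of this commit:

import requests
from bs4 import BeautifulSoup

def get_web_data(valid_links):
    for url in valid_links:
        try:
            # Fetch and parse the page
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # The page title names the output file; the first <div>
            # after the <h1> holds the section body
            title = soup.find('h1').get_text()
            section_text = soup.find('h1').find_next('div').get_text().strip()
            section_text += f"\nmore detail link : {url}"

            # Context manager closes the file even if write() raises
            with open(f"user_guide/{title}.txt", "w") as file:
                file.write(f"{title}\n{section_text}")
        except (requests.RequestException, AttributeError):
            # AttributeError covers pages missing the expected <h1>/<div>;
            # skip the page rather than abort the whole sync
            continue
    print("data collected")

Catching only requests.RequestException and AttributeError keeps the commit's skip-on-failure behavior while still surfacing unrelated bugs.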
 
@@ -58,7 +61,7 @@ def get_base_links():
     valid_links = []
     # Extract and print all the URLs
     for link in links:
-        if url in str(link):
+        if "https://help.storemate.cloud/docs/" in str(link):
             valid_links.append(link['href'])
 
     print("base links collected")
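The second hunk hardcodes the docs prefix in the link filter instead of testing against the module-level url, which this commit narrowed to /docs/reports/; without the literal string, every link outside the reports section would now be dropped. A sketch of how get_base_links plausibly fits together, assuming (the diff does not show this) that links comes from soup.find_all('a', href=True) on the docs index page and that the function returns valid_links:

import requests
from bs4 import BeautifulSoup

DOCS_PREFIX = "https://help.storemate.cloud/docs/"  # literal used by the commit

def get_base_links():
    # Assumed setup: fetch the docs index and collect its anchors;
    # only the filtering loop below appears in the diff
    response = requests.get(DOCS_PREFIX)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    valid_links = []
    for link in links:
        # Keep links under the docs tree regardless of the module-level
        # `url`, which now points at the narrower /docs/reports/ page
        if DOCS_PREFIX in str(link):
            valid_links.append(link['href'])

    print("base links collected")
    return valid_links  # assumed; the diff does not show the return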