import os
import concurrent.futures

import requests
from bs4 import BeautifulSoup

from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import StorageContext, load_index_from_storage

# llama_index's default models call the OpenAI API, so the key must be set, e.g.
# os.environ["OPENAI_API_KEY"] = "<your key>"

# Base URL of the documentation index page used to collect article links
base_url = 'https://help.storemate.cloud/docs/reports/'


def fetch_web_data(url):
    try:
        # Send a GET request to the URL and fail fast on HTTP errors
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # The first <h1> holds the article title
        title = soup.find('h1').get_text().strip()

        # The article body is the second <div> that follows the title heading
        section = soup.find('h1').find_next('div').find_next('div')

        # Extract the text content from the section
        section_text = section.get_text().strip()
        section_text = section_text + f"\nMore detail link: {url}"

        # Save the data into a text file
        with open(f"user_guide/{title}.txt", "w") as file:
            file.write(f"{title}\n{section_text}")
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")


def get_base_links():
    # Send a GET request to the base URL and fail fast on HTTP errors
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all <a> tags with href attributes
    links = soup.find_all('a', href=True)

    # Collect all valid links
    valid_links = []
    for link in links:
        href = link['href']
        if href.startswith("https://help.storemate.cloud/docs/"):
            valid_links.append(href)

    print("Base links collected")

    # Use ThreadPoolExecutor to fetch web data in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(fetch_web_data, valid_links)
    

def update_user_guide():
    # Re-scrape the help articles, then rebuild and persist the vector index
    get_base_links()
    # try:
    #     storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    #     index = load_index_from_storage(storage_context=storage_context)
    #     print("loaded")
    # except:     
    documents = SimpleDirectoryReader("user_guide").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist("llama_index")
    print("index created")
    return "done"