import os

import requests
from bs4 import BeautifulSoup

import google.generativeai as genai
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
# from langchain_experimental.agents.agent_toolkits import create_csv_agent

# Note: langchain and llama_index both export an `OpenAI` class; only the
# llama_index one is imported here to avoid shadowing.
from llama_index.llms import OpenAI
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import StorageContext, load_index_from_storage


# The OpenAI API key is expected to be set in the environment before use, e.g.
# os.environ["OPENAI_API_KEY"] = "<your key>"



# Root page of the documentation; get_base_links() scrapes it for article links
url = 'https://help.storemate.cloud/docs/reports/'


def get_web_data(valid_links):
    # Make sure the output directory exists before writing article files
    os.makedirs("user_guide", exist_ok=True)
    for url in valid_links:
        try:
            # Send a GET request to the URL
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            # Parse the page content with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # The article title is the first <h1>; the following <div> holds the body
            title = soup.find('h1').get_text().strip()
            section = soup.find('h1').find_next('div')

            # Extract the text content and append the source link for reference
            section_text = section.get_text().strip()
            section_text = section_text + f"\nmore detail link : {url}"

            # Write each article to its own text file (replace "/" so the title is a valid filename)
            filename = title.replace("/", "-")
            with open(f"user_guide/{filename}.txt", "w", encoding="utf-8") as file:
                file.write(f"{title}\n{section_text}")
        except Exception as e:
            # Skip pages that fail to download or parse, but report them
            print(f"skipped {url}: {e}")
    print("data collected")


def get_base_links():
    # Send a GET request to the root documentation URL
    response = requests.get(url, timeout=30)

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all <a> tags with href attributes
    links = soup.find_all('a', href=True)

    valid_links = []
    # Keep only links that point into the documentation section
    for link in links:
        if "https://help.storemate.cloud/docs/" in link['href']:
            valid_links.append(link['href'])

    print("base links collected")

    get_web_data(valid_links)

    

def update_user_guide():
    get_base_links()
    # Optionally reload a previously persisted index instead of rebuilding:
    # try:
    #     storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    #     index = load_index_from_storage(storage_context=storage_context)
    #     print("loaded")
    # except:
    # Build a fresh vector index over the scraped user-guide files and persist it to disk
    documents = SimpleDirectoryReader("user_guide").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist("llama_index")
    print("index created")
    return "done"