from datetime import date, timedelta

import bs4
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import config

DATA_URL = "https://www.sikafinance.com/marches/actualites_bourse_brvm"

embeddings_model = GoogleGenerativeAIEmbeddings(
    model=config.GOOGLE_EMBEDDING_MODEL
)  # type: ignore

# Headless Chrome driver shared by the scraping functions below.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)


def scrap_articles(url=DATA_URL, num_days_past=5):
    """Scrape article titles, dates and links published over the last `num_days_past` days."""
    today = date.today()
    driver.get(url)
    all_articles = []
    for i in range(num_days_past + 1):
        past_date = today - timedelta(days=i)
        date_str = past_date.strftime("%Y-%m-%d")

        # Fill in the date filter and submit the search form.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "dateActu"))
        )
        text_box = driver.find_element(By.ID, "dateActu")
        text_box.clear()
        text_box.send_keys(date_str)
        submit_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "btn"))
        )
        submit_btn.click()

        # Collect the article rows returned for that date.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "tabQuotes"))
        )
        dates = driver.find_elements(By.CLASS_NAME, "sp1")
        table = driver.find_element(By.ID, "tabQuotes")
        titles = table.find_elements(By.TAG_NAME, "a")

        articles = []
        for title_el, date_el in zip(titles, dates):
            art = {
                "title": title_el.text.strip(),
                "date": date_el.text,
                "link": title_el.get_attribute("href"),
            }
            articles.append(art)
        all_articles += articles

    # driver.quit()
    return all_articles


def set_metadata(documents, metadatas):
    """Replace the metadata of each langchain Document with the scraped article info."""
    for idx, doc in enumerate(documents):
        doc.metadata = metadatas[idx]
    print("Metadata successfully changed")
    print(documents[0].metadata)


def process_docs(
    articles, persist_directory, embeddings_model, chunk_size=500, chunk_overlap=0
):
    """Scrape the content of every article URL and index it in the vector DB."""
    article_urls = [a["link"] for a in articles]
    print("Starting to scrape ..")

    # Only keep the article body, signature and header blocks of each page.
    loader = WebBaseLoader(
        web_paths=article_urls,
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("inarticle txtbig", "dt_sign", "innerUp")
            )
        ),
    )
    print("Scraping done, loading documents ..")
    docs = loader.load()

    # Update metadata: attach the scraped title, date and link to each document.
    set_metadata(documents=docs, metadatas=articles)
    # print("Successfully loaded to document")

    # This text splitter is used to create the child documents
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n"]
    )

    # The vectorstore to use to index the child chunks
    vectorstore = Chroma(
        persist_directory=persist_directory + "vectorstore",
        collection_name="full_documents",
        embedding_function=embeddings_model,
    )

    # The storage layer for the parent documents
    fs = LocalFileStore(persist_directory + "docstore")
    store = create_kv_docstore(fs)

    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
    )
    retriever.add_documents(docs, ids=None)
    print(len(docs), "documents added")
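

# Illustrative sketch (not part of the original pipeline): once `process_docs`
# has persisted the child-chunk vectorstore and the parent docstore, a retriever
# can be rebuilt from the same `persist_directory` to query the indexed articles.
# The function name `load_retriever` and the example query are assumptions added
# here for illustration.
def load_retriever(persist_directory, embeddings_model):
    vectorstore = Chroma(
        persist_directory=persist_directory + "vectorstore",
        collection_name="full_documents",
        embedding_function=embeddings_model,
    )
    store = create_kv_docstore(LocalFileStore(persist_directory + "docstore"))
    return ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=RecursiveCharacterTextSplitter(
            chunk_size=500, chunk_overlap=0, separators=["\n"]
        ),
    )


# Example usage (assumes the ingestion below has already been run):
#   retriever = load_retriever(config.STORAGE_PATH, embeddings_model)
#   parent_docs = retriever.get_relevant_documents("latest BRVM market news")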


if __name__ == "__main__":
    try:
        data = scrap_articles(DATA_URL, num_days_past=config.NUM_DAYS_PAST)
        process_docs(data, config.STORAGE_PATH, embeddings_model)
    finally:
        # Always release the browser, even if scraping or indexing fails.
        driver.quit()