"""Scrape BRVM market news from sikafinance.com and index it in a Chroma vector store."""

import os
from datetime import date, timedelta

import bs4
from langchain.indexes import SQLRecordManager, index
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import config

DATA_URL = "https://www.sikafinance.com/marches/actualites_bourse_brvm"

embeddings_model = GoogleGenerativeAIEmbeddings(
    model=config.GOOGLE_EMBEDDING_MODEL
)  # type: ignore

# Headless Chrome driver used to drive the news search form
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)


def scrap_articles(
    url="https://www.sikafinance.com/marches/actualites_bourse_brvm",
    num_days_past=5,
):
    """Collect title, date and link of every article published over the last `num_days_past` days."""
    today = date.today()
    driver.get(url)
    all_articles = []
    for day_offset in range(num_days_past + 1):
        past_date = today - timedelta(days=day_offset)
        date_str = past_date.strftime("%Y-%m-%d")

        # Fill the date filter and submit the search form
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "dateActu"))
        )
        text_box = driver.find_element(By.ID, "dateActu")
        text_box.clear()  # reset the field so dates do not accumulate across iterations
        text_box.send_keys(date_str)
        submit_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "btn"))
        )
        submit_btn.click()

        # Each result row exposes its date in a .sp1 element and its title/link in a <td><a> element
        dates = driver.find_elements(By.CLASS_NAME, "sp1")
        titles = driver.find_elements(By.XPATH, "//td/a")
        articles = [
            {
                "title": title_el.text.strip(),
                "date": date_el.text,
                "link": title_el.get_attribute("href"),
            }
            for title_el, date_el in zip(titles, dates)
        ]
        all_articles += articles
    # driver.quit()
    return all_articles


def set_metadata(documents, metadatas):
    """Replace the metadata of LangChain Document objects with the scraped article info."""
    for doc, metadata in zip(documents, metadatas):
        doc.metadata = metadata
    print("Metadata successfully updated")
    print(documents[0].metadata)


def process_docs(
    articles, persist_directory, embeddings_model, chunk_size=1000, chunk_overlap=100
):
    """Scrape the content of every article URL and store it in a vector DB."""
    article_urls = [a["link"] for a in articles]

    print("Starting to scrape article pages ..")
    # Keep only the article body, signature and header blocks of each page
    loader = WebBaseLoader(
        web_paths=article_urls,
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("inarticle txtbig", "dt_sign", "innerUp")
            )
        ),
    )
    docs = loader.load()
    print("Documents successfully loaded")

    # Attach the scraped metadata (title, date, link) to each document
    set_metadata(documents=docs, metadatas=articles)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n"]
    )
    splits = text_splitter.split_documents(docs)

    # Create the storage path if it doesn't exist
    if not os.path.exists(persist_directory):
        os.makedirs(persist_directory)

    # Open (or create) the persistent Chroma collection. The chunks are added through
    # the indexing API below so the record manager can skip unchanged articles on
    # subsequent runs instead of embedding them again.
    doc_search = Chroma(
        embedding_function=embeddings_model,
        persist_directory=persist_directory,
    )

    namespace = "chromadb/my_documents"
    record_manager = SQLRecordManager(
        namespace, db_url="sqlite:///record_manager_cache.sql"
    )
    record_manager.create_schema()
    index_result = index(
        splits,
        record_manager,
        doc_search,
        cleanup="incremental",
        source_id_key="link",
    )
    print(f"Indexing stats: {index_result}")
    return doc_search


if __name__ == "__main__":
    data = scrap_articles(DATA_URL, num_days_past=2)
    vectordb = process_docs(data, config.STORAGE_PATH, embeddings_model)
    ret = vectordb.as_retriever()
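

# Usage sketch (an assumption, not part of the pipeline above): once the script has run,
# the retriever `ret` can be queried like any LangChain retriever. The query string and
# the printed metadata keys below are illustrative only.
#
#     results = ret.get_relevant_documents("Dernières actualités de la BRVM")
#     for doc in results:
#         print(doc.metadata["title"], "->", doc.metadata["link"])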