import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

from chroma_utils import save_handbook_to_chroma

# Scheme + host that relative hrefs scraped from the pages are joined onto.
SITE_ROOT = "https://www.churchofjesuschrist.org"

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Matches heading tag names (h1, h2, ...). Raw string: "\d" in a normal
# string literal is an invalid escape sequence.
_HEADING_TAG = re.compile(r"h\d+")

# Captures the three-letter language code from a "lang=xxx" query parameter.
_LANG_PARAM = re.compile(r'lang=([a-z]{3})')


def _fetch_soup(url):
    """Download *url* and return its content parsed with html.parser."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, 'html.parser')


def get_chapters(base_url):
    """
    Gets all chapter URLs from the General Handbook main page by finding
    links in the doc-map structure.

    Only chapter-level URLs are returned: section anchors and the original
    query string are stripped from every href, and the language taken from
    base_url is appended again.

    Args:
        base_url (str): URL of the handbook main page. A ``lang=xxx`` query
            parameter is honoured; 'eng' is used when none is found.

    Returns:
        list: Unique chapter URLs, in the order they first appear on the page
    """
    soup = _fetch_soup(base_url)

    # Extract language from base_url if present
    lang_match = _LANG_PARAM.search(base_url)
    lang = lang_match.group(1) if lang_match else 'eng'

    urls = []
    for doc_map in soup.find_all("ul", class_="doc-map"):
        for link in doc_map.find_all("a", class_="list-tile"):
            href = link.get('href')
            if not href:
                continue
            # Remove any section anchors and query parameters
            base_href = href.split('?')[0].split('#')[0]
            # Construct full URL from relative path, including language
            urls.append(f"{SITE_ROOT}{base_href}?lang={lang}")

    # Remove duplicates while preserving order (dicts keep insertion order)
    return list(dict.fromkeys(urls))


def get_sections(url):
    """
    Gets all sections from a chapter page with their titles, URLs and text
    content. Only processes sections within the body-block div.

    Sections without a <header> element are skipped, since the title and
    URL are both derived from the header.

    Args:
        url (str): URL of the chapter page

    Returns:
        list: One dict per section with the keys 'title', 'url' and 'text'.
            Empty when the page has no body-block div.
    """
    soup = _fetch_soup(url)

    # Find the body-block div first
    body_block = soup.find("div", class_="body-block")
    if not body_block:
        # Always return a list so callers get one consistent type.
        return []

    result = []
    for section in body_block.find_all("section"):
        header = section.find("header")
        if not header:
            continue

        # Get section title from the first heading tag inside the header
        heading = header.find(_HEADING_TAG)
        title = heading.text if heading is not None else ""

        # Get section URL from the header link, falling back to the
        # chapter URL plus the section's id as an anchor
        link = header.find("a", class_="cross-ref")
        if link:
            section_url = f"{SITE_ROOT}{link['href']}"
        else:
            section_url = f"{url}#{section.get('id')}"

        # Get section text, excluding the title number paragraph
        text = " ".join(
            p.text
            for p in section.find_all("p")
            if "title-number" not in (p.get("class") or [])
        )

        result.append({
            'title': title,
            'url': section_url,
            'text': text
        })

    return result


def update_handbook_data(handbook_url):
    """
    Scrapes every chapter of the handbook and stores the result in Chroma.

    Progress is printed as a percentage after each chapter. Chapters that
    yield no sections are left out of the saved data.

    Args:
        handbook_url (str): URL of the handbook main page
    """
    chapters_urls = get_chapters(handbook_url)
    total_chapters = len(chapters_urls)

    handbook_data = []
    for i, chapter_url in enumerate(chapters_urls):
        chapter_sections = get_sections(chapter_url)
        if chapter_sections:
            # One entry per chapter: the list of that chapter's sections
            handbook_data.append(chapter_sections)
        print(f"Progress: {int(((i+1)/total_chapters)*100)}%")

    save_handbook_to_chroma(handbook_data)


if __name__ == '__main__':
    update_handbook_data(
        "https://www.churchofjesuschrist.org/study/manual/general-handbook?lang=spa")