Spaces:
Sleeping
Sleeping
import re | |
import requests | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
from chroma_utils import save_handbook_to_chroma | |
def get_chapters(base_url, *, timeout=30):
    """
    Gets all chapter URLs from the General Handbook main page by finding
    links in the doc-map structure.

    Only chapter-level URLs are returned: section anchors and query
    parameters are stripped from every link, and the language taken from
    ``base_url`` is re-appended as ``?lang=<code>``.

    Args:
        base_url (str): URL of the handbook main page. A ``lang=xxx``
            query parameter (three lowercase letters) selects the language
            of the returned URLs; ``eng`` is used when it is absent.
        timeout (float): Seconds to wait for the HTTP response before
            giving up. Keyword-only.

    Returns:
        list: Chapter URLs, de-duplicated, in the order they appear on the page.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    site_root = "https://www.churchofjesuschrist.org"

    # Without a timeout a stalled connection would block forever, and
    # without the status check an error page would be parsed as the index.
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract language from base_url if present
    lang_match = re.search(r'lang=([a-z]{3})', base_url)
    lang = lang_match.group(1) if lang_match else 'eng'

    # Find all links within doc-map class elements
    urls = []
    for doc_map in soup.find_all("ul", class_="doc-map"):
        for link in doc_map.find_all("a", class_="list-tile"):
            href = link.get('href')
            if not href:
                continue
            # Remove any section anchors and query parameters
            base_href = href.split('?')[0].split('#')[0]
            if not base_href:
                # The link was only "#anchor" or "?query": it does not
                # point at a chapter page of its own.
                continue
            # urljoin resolves site-relative paths against the site root
            # and leaves already-absolute links untouched, so the host is
            # never prefixed twice.
            urls.append(f"{urljoin(site_root, base_href)}?lang={lang}")

    # Remove duplicates while preserving order
    return list(dict.fromkeys(urls))
def get_sections(url, *, timeout=30):
    """
    Gets all sections from a chapter page with their titles, URLs and text content.
    Only processes sections within the body-block div.

    Args:
        url (str): URL of the chapter page
        timeout (float): Seconds to wait for the HTTP response before
            giving up. Keyword-only.

    Returns:
        list: One dict per ``<section>`` that has a ``<header>``, with the
            keys ``'title'``, ``'url'`` and ``'text'``. An empty list is
            returned when the page has no body-block div.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    site_root = "https://www.churchofjesuschrist.org"

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the body-block div first
    body_block = soup.find("div", class_="body-block")
    if body_block is None:
        # Same (empty) type as the normal return value, so callers can
        # always iterate the result.
        return []

    # Compiled once; matches heading tags such as h1, h2, ...
    heading_pattern = re.compile(r"h\d+")

    result = []
    for section in body_block.find_all("section"):
        header = section.find("header")
        if header is None:
            # Title and link both come from the header; a section without
            # one cannot be described, so it is skipped.
            continue

        # Get section title
        heading = header.find(heading_pattern)
        title = heading.text if heading is not None else ""

        # Get section URL from header link, falling back to the section's
        # own id as a fragment, and finally to the bare chapter URL.
        link = header.find("a", class_="cross-ref")
        href = link.get('href') if link is not None else None
        section_id = section.get('id')
        if href:
            section_url = urljoin(site_root, href)
        elif section_id:
            section_url = f"{url}#{section_id}"
        else:
            section_url = url

        # Get section text, excluding the title-number paragraph
        text = " ".join(
            p.text
            for p in section.find_all("p")
            if "title-number" not in (p.get("class") or [])
        )

        result.append({
            'title': title,
            'url': section_url,
            'text': text,
        })
    return result
def update_handbook_data(handbook_url):
    """
    Collects the sections of every handbook chapter and passes them to
    save_handbook_to_chroma.

    Args:
        handbook_url (str): URL of the handbook main page, forwarded to
            get_chapters to discover the chapter URLs.

    The collected data is a list with one entry per chapter; each entry is
    whatever get_sections returned for that chapter. Chapters for which
    get_sections returns nothing are left out. A progress percentage is
    printed after each chapter.
    """
    chapter_links = get_chapters(handbook_url)
    chapter_count = len(chapter_links)

    collected = []
    # Counting from 1 lets the counter double as "chapters done so far".
    for done, chapter_link in enumerate(chapter_links, start=1):
        sections = get_sections(chapter_link)
        if sections:
            collected.append(sections)
        print(f"Progress: {int((done/chapter_count)*100)}%")

    save_handbook_to_chroma(collected)
if __name__ == '__main__':
    # Script entry point: refresh the stored handbook data from the
    # handbook index page requested with lang=spa.
    handbook_index_url = (
        "https://www.churchofjesuschrist.org/study/manual/general-handbook?lang=spa"
    )
    update_handbook_data(handbook_index_url)