handbookreader / scrap.py
danielRamon's picture
refactor ♻️: Update Dockerfile and code structure for improved search result layout
585c783
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

from chroma_utils import save_handbook_to_chroma
def get_chapters(base_url, timeout=30):
    """
    Gets all chapter URLs from the General Handbook main page.

    Links are collected from ``a.list-tile`` elements found inside
    ``ul.doc-map`` lists. Section anchors and query strings are stripped so
    only chapter-level URLs are returned, each with a ``lang`` query parameter.

    Args:
        base_url (str): URL of the handbook main page. If it contains a
            ``lang=xxx`` parameter (three lowercase letters), that language
            is propagated to the returned URLs; otherwise 'eng' is used.
        timeout (float): Seconds to wait for the HTTP response. Pass None to
            wait indefinitely.

    Returns:
        list: Unique chapter URLs, in the order they first appear on the page.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(base_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract language from base_url if present
    lang_match = re.search(r'lang=([a-z]{3})', base_url)
    lang = lang_match.group(1) if lang_match else 'eng'

    site_root = "https://www.churchofjesuschrist.org"
    urls = []
    for doc_map in soup.find_all("ul", class_="doc-map"):
        for link in doc_map.find_all("a", class_="list-tile"):
            href = link.get('href')
            if not href:
                continue
            # Remove any section anchors and query parameters
            base_href = href.split('?')[0].split('#')[0]
            if not base_href:
                # The link was only an in-page anchor or a query string, so
                # it does not point at a chapter.
                continue
            # urljoin keeps already-absolute hrefs intact and inserts the
            # missing slash for hrefs that are not root-relative.
            urls.append(f"{urljoin(site_root, base_href)}?lang={lang}")

    # Remove duplicates while preserving order
    return list(dict.fromkeys(urls))
def get_sections(url, timeout=30):
    """
    Gets all sections from a chapter page with their titles, URLs and text content.

    Only ``<section>`` elements inside the ``body-block`` div that contain a
    ``<header>`` are processed.

    Args:
        url (str): URL of the chapter page
        timeout (float): Seconds to wait for the HTTP response. Pass None to
            wait indefinitely.

    Returns:
        list: One dict per section with the keys 'title', 'url' and 'text'.
            Empty if the page has no body-block div.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the body-block div first
    body_block = soup.find("div", class_="body-block")
    if not body_block:
        # Same (list) type as the normal return value.
        return []

    heading_tag = re.compile(r"h\d+")
    result = []
    # NOTE: find_all searches recursively, so nested sections are visited as
    # well, and an outer section's text includes its inner sections' paragraphs.
    for section in body_block.find_all("section"):
        header = section.find("header")
        if not header:
            continue

        # Get section title from the first heading tag in the header
        heading = header.find(heading_tag)
        title = heading.text if heading else ""

        # Get section URL from the header link, falling back to the section
        # anchor on the chapter page (or the chapter page itself when the
        # section has no id).
        link = header.find("a", class_="cross-ref")
        href = link.get('href') if link else None
        if href:
            section_url = f"https://www.churchofjesuschrist.org{href}"
        else:
            section_id = section.get('id')
            section_url = f"{url}#{section_id}" if section_id else url

        # Get section text, excluding the title number paragraph
        text = " ".join(
            p.text for p in section.find_all("p")
            if "title-number" not in (p.get("class") or [])
        )

        result.append({
            'title': title,
            'url': section_url,
            'text': text,
        })
    return result
def update_handbook_data(handbook_url):
    """
    Scrapes every chapter of the handbook and passes the result on for storage.

    Args:
        handbook_url (str): URL of the handbook main page, given to get_chapters.

    Returns:
        None. The collected data (one list of sections per chapter) is handed
        to save_handbook_to_chroma.
    """
    chapter_links = get_chapters(handbook_url)
    chapter_count = len(chapter_links)
    collected = []
    for done, link in enumerate(chapter_links, start=1):
        sections = get_sections(link)
        # Chapters that yield no sections are left out of the stored data
        if sections:
            collected.append(sections)
        percent = int((done / chapter_count) * 100)
        print(f"Progress: {percent}%")
    save_handbook_to_chroma(collected)
if __name__ == '__main__':
    # Script entry point: scrape the handbook at this URL (lang=spa) and store it.
    handbook_url = "https://www.churchofjesuschrist.org/study/manual/general-handbook?lang=spa"
    update_handbook_data(handbook_url)