Commit 6998bc6 · 1 parent: 20a2075

Init commit

Files changed:
- .gitignore +3 -0
- Dockerfile +18 -0
- app.py +23 -0
- chroma_utils.py +67 -0
- requirements.txt +8 -0
- scrap.py +106 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+.vscode/launch.json
+__pycache__/chroma_utils.cpython-311.pyc
+chroma_data/
Dockerfile ADDED
@@ -0,0 +1,18 @@
+# Use a slim Python image as the base to keep the image small
+FROM python:3.10.15-slim
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements first to leverage Docker layer caching
+COPY requirements.txt .
+
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy Python source files (multi-file COPY needs a directory destination)
+COPY *.py ./
+
+# Start Streamlit app
+CMD ["streamlit", "run", "app.py", "--server.port", "2000"]
+
app.py ADDED
@@ -0,0 +1,23 @@
+import streamlit as st
+from chroma_utils import ask_chroma
+
+st.title("Encuentra en el Manual")
+
+question = st.text_input("Haz una pregunta para encontrar en el manual")
+
+if question:
+    results = ask_chroma(question, k=5)
+    for result in results:
+        # st.page_link(result.metadata['url'],
+        #              label=f"{result.metadata['title']}", help=f"{result.page_content[:250]} ...")
+        with st.container(border=True):
+            st.markdown(
+                f"## [{result.metadata['title']}]({result.metadata['url']})")
+            if len(result.page_content) > 500:
+                st.markdown(f"{result.page_content[:500]}...")
+            else:
+                st.markdown(result.page_content)
+        # with st.expander(f"**Titulo:** {result.metadata['title']}"):
+        #     st.markdown(f"**Resumen:** {result.page_content[:250]} ...")
+        #     st.markdown(
+        #         f"**Enlace:** [{result.metadata['title']}]({result.metadata['url']})")
chroma_utils.py ADDED
@@ -0,0 +1,67 @@
+from langchain_chroma import Chroma
+from langchain_ollama import OllamaEmbeddings
+from langchain_core.documents import Document
+
+embed = OllamaEmbeddings(
+    model="jina/jina-embeddings-v2-base-es")  # Embedding model shared by indexing and querying
+
+
+def save_handbook_to_chroma(handbook_data: list) -> bool:
+    """
+    Saves the entire handbook data to Chroma with embeddings.
+
+    Args:
+        handbook_data (list): List of chapters, each a list of dicts with the title, URL, and text of one section.
+
+    Returns:
+        bool: True if the handbook is saved correctly, False otherwise.
+    """
+    documents = []
+    for chapter in handbook_data:
+        for section in chapter:
+            document = Document(
+                page_content=section.get('text', ''),
+                metadata={
+                    'title': section.get('title', ''),
+                    'url': section.get('url', '')
+                }
+            )
+            documents.append(document)
+    print("Saving handbook to Chroma. This process can take a long time.")
+    try:
+        ids = [str(i) for i in range(1, len(documents) + 1)]
+        Chroma.from_documents(
+            documents=documents, embedding=embed, persist_directory="./chroma_data", ids=ids)
+        return True
+    except Exception as e:
+        print(f"Error saving handbook to Chroma: {e}")
+        return False
+
+
+def ask_chroma(question: str, k: int = 3) -> list:
+    """
+    Asks Chroma a question and returns the top k most similar results.
+
+    Args:
+        question (str): The question to ask Chroma.
+        k (int): The number of most similar results to return. Default is 3.
+
+    Returns:
+        list: The top k most similar Documents, or an empty list on error.
+    """
+    try:
+        vectorstore = Chroma(
+            embedding_function=embed,  # Reuse the same embedding model used for indexing
+            persist_directory="./chroma_data"
+        )
+        results = vectorstore.similarity_search(question, k)
+        return results
+    except Exception as e:
+        print(f"Error asking Chroma: {e}")
+        return []
+
+
+# similars = ask_chroma(
+#     "¿Quienes asisten al consejo de barrio?", 2)
+# for similar in similars:
+#     print(similar.page_content + "\n" * 3)
requirements.txt ADDED
@@ -0,0 +1,8 @@
+requests
+beautifulsoup4
+pandas
+langchain
+langchain-ollama
+langchain-chroma
+streamlit
+
scrap.py ADDED
@@ -0,0 +1,106 @@
+import re
+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+from chroma_utils import save_handbook_to_chroma
+
+
+def get_chapters(base_url):
+    """
+    Gets all URLs from the General Handbook main page by finding links in the doc-map structure.
+    Only returns chapter-level URLs without section anchors.
+
+    Returns:
+        list: List of URLs for all chapters in the handbook
+    """
+    response = requests.get(base_url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Extract language from base_url if present
+    lang_match = re.search(r'lang=([a-z]{3})', base_url)
+    lang = lang_match.group(1) if lang_match else 'eng'
+
+    # Find all links within doc-map class elements
+    doc_maps = soup.find_all("ul", class_="doc-map")
+    urls = []
+
+    for doc_map in doc_maps:
+        links = doc_map.find_all("a", class_="list-tile")
+        for link in links:
+            href = link.get('href')
+            if href:
+                # Remove any section anchors and query parameters
+                base_href = href.split('?')[0].split('#')[0]
+                # Construct full URL from relative path, including language if present
+                full_url = f"https://www.churchofjesuschrist.org{base_href}?lang={lang}"
+                urls.append(full_url)
+
+    # Remove duplicates while preserving order
+    unique_urls = list(dict.fromkeys(urls))
+
+    return unique_urls
+
+
+def get_sections(url):
+    """
+    Gets all sections from a chapter page with their titles, URLs and text content.
+    Only processes sections within the body-block div.
+
+    Args:
+        url (str): URL of the chapter page
+
+    Returns:
+        list: List of dicts with the title, URL and text content of each section
+    """
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find the body-block div first
+    body_block = soup.find("div", class_="body-block")
+    if not body_block:
+        return []
+
+    sections = body_block.find_all("section")
+    result = []
+    for section in sections:
+        # Get section title
+        header = section.find("header")
+        if header:
+            heading = header.find(re.compile(r"h\d+"))
+            title = heading.text if heading else ""
+
+            # Get section URL from header link
+            link = header.find("a", class_="cross-ref")
+            section_id = section.get('id')
+            section_url = f"https://www.churchofjesuschrist.org{link['href']}" if link else f"{url}#{section_id}"
+
+            # Get section text
+            paragraphs = section.find_all("p")
+            # Exclude title-number paragraphs
+            text = [p.text for p in paragraphs
+                    if not p.get("class") or "title-number" not in p["class"]]
+            text = " ".join(text)
+
+            result.append({
+                'title': title,
+                'url': section_url,
+                'text': text
+            })
+
+    return result
+
+
+def update_handbook_data(handbook_url):
+    chapters_urls = get_chapters(handbook_url)
+    total_chapters = len(chapters_urls)
+    handbook_data = []
+    for i, chapter_url in enumerate(chapters_urls):
+        chapter_sections = get_sections(chapter_url)
+        if chapter_sections:
+            handbook_data.append(chapter_sections)
+        print(f"Progress: {int(((i + 1) / total_chapters) * 100)}%")
+    save_handbook_to_chroma(handbook_data)
+
+
+update_handbook_data(
+    "https://www.churchofjesuschrist.org/study/manual/general-handbook?lang=spa")