danielRamon committed
Commit 6998bc6 · 1 Parent(s): 20a2075

Init commit

Files changed (6)
  1. .gitignore +3 -0
  2. Dockerfile +18 -0
  3. app.py +23 -0
  4. chroma_utils.py +71 -0
  5. requirements.txt +8 -0
  6. scrap.py +106 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ .vscode/launch.json
+ __pycache__/chroma_utils.cpython-311.pyc
+ chroma_data/
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ # Use a slim Python base image for smaller size
+ FROM python:3.10.15-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements first to leverage Docker cache
+ COPY requirements.txt .
+
+ # Install dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy Python source files
+ COPY *.py .
+
+ # Start Streamlit app
+ CMD ["streamlit", "run", "app.py", "--server.port", "2000"]
+
app.py ADDED
@@ -0,0 +1,23 @@
+ import streamlit as st
+ from chroma_utils import ask_chroma
+
+ st.title("Encuentra en el Manual")
+
+ question = st.text_input("Haz una pregunta para encontrar en el manual")
+
+ if question:
+     results = ask_chroma(question, k=5)
+     for i, result in enumerate(results):
+         # st.page_link(result.metadata['url'],
+         #              label=f"{result.metadata['title']}", help=f"{result.page_content[:250]} ...", )
+         with st.container(border=True):
+             st.markdown(
+                 f"## [{result.metadata['title']}]({result.metadata['url']}) ")
+             if len(result.page_content) > 500:
+                 st.markdown(f"{result.page_content[:500]}...")
+             else:
+                 st.markdown(result.page_content)
+         # with st.expander(f"**Titulo:** {result.metadata['title']}"):
+         #     st.markdown(f"**Resumen:** {result.page_content[:250]} ...")
+         #     st.markdown(
+         #         f"**Enlace:** [{result.metadata['title']}]({result.metadata['url']})")
chroma_utils.py ADDED
@@ -0,0 +1,71 @@
+ from langchain_chroma import Chroma
+ from langchain_ollama import OllamaEmbeddings
+ from langchain_core.documents import Document
+
+ embed = OllamaEmbeddings(
+     model="jina/jina-embeddings-v2-base-es")  # Initialize embeddings
+
+
+ def save_handbook_to_chroma(handbook_data: list) -> bool:
+     """
+     Saves the entire handbook data to Chroma with embeddings.
+
+     Args:
+         handbook_data (list): List of dictionaries containing title, URL, and text content of each section.
+
+     Returns:
+         bool: True if the handbook is saved correctly, False otherwise.
+     """
+     embeddings = OllamaEmbeddings(
+         model="llama3.1",
+     )
+
+     documents = []
+     for chapter in handbook_data:
+         for section in chapter:
+             document = Document(
+                 page_content=section.get('text', ''),
+                 metadata={
+                     'title': section.get('title', ''),
+                     'url': section.get('url', '')
+                 }
+             )
+             documents.append(document)
+     print("Saving handbook to Chroma. This process can take a long time.")
+     try:
+         ids = [str(i) for i in range(1, len(documents) + 1)]
+         Chroma.from_documents(
+             documents=documents, embedding=embed, persist_directory="./chroma_data", ids=ids)
+         return True
+     except Exception as e:
+         print(f"Error saving handbook to Chroma: {e}")
+         return False
+
+
+ def ask_chroma(question: str, k: int = 3) -> list:
+     """
+     Asks Chroma a question and returns the top k most similar results.
+
+     Args:
+         question (str): The question to ask Chroma.
+         k (int): The number of most similar results to return. Default is 3.
+
+     Returns:
+         list: A list of the top k most similar Documents.
+     """
+     try:
+         vectorstore = Chroma(
+             embedding_function=embed,  # Provide the embedding function
+             persist_directory="./chroma_data"
+         )
+         results = vectorstore.similarity_search(question, k)
+         return results
+     except Exception as e:
+         print(f"Error asking Chroma: {e}")
+         return []
+
+
+ # similars = ask_chroma(
+ #     "¿Quienes asisten al consejo de barrio?", 2)
+ # for similar in similars:
+ #     print(similar.page_content+"\n"*3)
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ requests
+ beautifulsoup4
+ pandas
+ langchain
+ langchain-ollama
+ langchain-chroma
+ streamlit
+
scrap.py ADDED
@@ -0,0 +1,106 @@
+ import re
+ import requests
+ import pandas as pd
+ from bs4 import BeautifulSoup
+ from chroma_utils import save_handbook_to_chroma
+
+
+ def get_chapters(base_url):
+     """
+     Gets all URLs from the General Handbook main page by finding links in the doc-map structure.
+     Only returns chapter-level URLs without section anchors.
+
+     Returns:
+         list: List of URLs for all chapters in the handbook
+     """
+     response = requests.get(base_url)
+     soup = BeautifulSoup(response.content, 'html.parser')
+
+     # Extract language from base_url if present
+     lang_match = re.search(r'lang=([a-z]{3})', base_url)
+     lang = lang_match.group(1) if lang_match else 'eng'
+
+     # Find all links within doc-map class elements
+     doc_maps = soup.find_all("ul", class_="doc-map")
+     urls = []
+
+     for doc_map in doc_maps:
+         links = doc_map.find_all("a", class_="list-tile")
+         for link in links:
+             href = link.get('href')
+             if href:
+                 # Remove any section anchors and query parameters
+                 base_href = href.split('?')[0].split('#')[0]
+                 # Construct full URL from relative path, including language if present
+                 full_url = f"https://www.churchofjesuschrist.org{base_href}?lang={lang}"
+                 urls.append(full_url)
+
+     # Remove duplicates while preserving order
+     unique_urls = list(dict.fromkeys(urls))
+
+     return unique_urls
+
+
+ def get_sections(url):
+     """
+     Gets all sections from a chapter page with their titles, URLs and text content.
+     Only processes sections within the body-block div.
+
+     Args:
+         url (str): URL of the chapter page
+
+     Returns:
+         list: List of dicts with each section's title, URL and text content
+     """
+     response = requests.get(url)
+     soup = BeautifulSoup(response.content, 'html.parser')
+
+     # Find the body-block div first
+     body_block = soup.find("div", class_="body-block")
+     if not body_block:
+         return []
+
+     sections = body_block.find_all("section")
+     result = []
+     for section in sections:
+         # Get section title
+         header = section.find("header")
+         if header:
+             title = header.find(re.compile(
+                 r"h\d+")).text if header.find(re.compile(r"h\d+")) else ""
+
+             # Get section URL from header link
+             link = header.find("a", class_="cross-ref")
+             section_id = section.get('id')
+             section_url = f"https://www.churchofjesuschrist.org{link['href']}" if link else f"{url}#{section_id}"
+
+             # Get section text
+             paragraphs = section.find_all("p")
+             # Exclude title number paragraph
+             text = [p.text for p in paragraphs if not p.get(
+                 "class") or "title-number" not in p["class"]]
+             text = " ".join(text)
+
+             result.append({
+                 'title': title,
+                 'url': section_url,
+                 'text': text
+             })
+
+     return result
+
+
+ def update_handbook_data(handbook_url):
+     chapters_urls = get_chapters(handbook_url)
+     total_chapters = len(chapters_urls)
+     handbook_data = []
+     for i, chapter_url in enumerate(chapters_urls):
+         chapter_sections = get_sections(chapter_url)
+         if chapter_sections:
+             handbook_data.append(chapter_sections)
+         print(f"Progress: {int(((i+1)/total_chapters)*100)}%")
+     save_handbook_to_chroma(handbook_data)
+
+
+ update_handbook_data(
+     "https://www.churchofjesuschrist.org/study/manual/general-handbook?lang=spa")