Commit 6998bc6 · 1 parent: 20a2075

Init commit

Files changed:
- .gitignore +3 -0
- Dockerfile +18 -0
- app.py +23 -0
- chroma_utils.py +67 -0
- requirements.txt +8 -0
- scrap.py +106 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+.vscode/launch.json
+__pycache__/chroma_utils.cpython-311.pyc
+chroma_data/
Dockerfile ADDED
@@ -0,0 +1,18 @@
+# Use a slim Python image as the base to keep the image small
+FROM python:3.10.15-slim
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements first to leverage Docker layer caching
+COPY requirements.txt .
+
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy Python source files (multi-file COPY needs a directory destination)
+COPY *.py ./
+
+# Start Streamlit app
+CMD ["streamlit", "run", "app.py", "--server.port", "2000"]
+
app.py ADDED
@@ -0,0 +1,23 @@
+import streamlit as st
+from chroma_utils import ask_chroma
+
+st.title("Encuentra en el Manual")
+
+question = st.text_input("Haz una pregunta para encontrar en el manual")
+
+if question:
+    results = ask_chroma(question, k=5)
+    for result in results:
+        # st.page_link(result.metadata['url'],
+        #              label=f"{result.metadata['title']}", help=f"{result.page_content[:250]} ...")
+        with st.container(border=True):
+            st.markdown(
+                f"## [{result.metadata['title']}]({result.metadata['url']})")
+            if len(result.page_content) > 500:
+                st.markdown(f"{result.page_content[:500]}...")
+            else:
+                st.markdown(result.page_content)
+        # with st.expander(f"**Titulo:** {result.metadata['title']}"):
+        #     st.markdown(f"**Resumen:** {result.page_content[:250]} ...")
+        #     st.markdown(
+        #         f"**Enlace:** [{result.metadata['title']}]({result.metadata['url']})")
chroma_utils.py ADDED
@@ -0,0 +1,67 @@
+from langchain_chroma import Chroma
+from langchain_ollama import OllamaEmbeddings
+from langchain_core.documents import Document
+
+embed = OllamaEmbeddings(
+    model="jina/jina-embeddings-v2-base-es")  # Embedding model shared by indexing and querying
+
+
+def save_handbook_to_chroma(handbook_data: list) -> bool:
+    """
+    Saves the entire handbook data to Chroma with embeddings.
+
+    Args:
+        handbook_data (list): List of chapters, each a list of dicts with the title, URL, and text of one section.
+
+    Returns:
+        bool: True if the handbook is saved correctly, False otherwise.
+    """
+    documents = []
+    for chapter in handbook_data:
+        for section in chapter:
+            document = Document(
+                page_content=section.get('text', ''),
+                metadata={
+                    'title': section.get('title', ''),
+                    'url': section.get('url', '')
+                }
+            )
+            documents.append(document)
+    print("Saving handbook to Chroma. This process can take a long time.")
+    try:
+        ids = [str(i) for i in range(1, len(documents) + 1)]
+        Chroma.from_documents(
+            documents=documents, embedding=embed, persist_directory="./chroma_data", ids=ids)
+        return True
+    except Exception as e:
+        print(f"Error saving handbook to Chroma: {e}")
+        return False
+
+
+def ask_chroma(question: str, k: int = 3) -> list:
+    """
+    Asks Chroma a question and returns the top k most similar results.
+
+    Args:
+        question (str): The question to ask Chroma.
+        k (int): The number of most similar results to return. Default is 3.
+
+    Returns:
+        list: The top k most similar Documents, or an empty list on error.
+    """
+    try:
+        vectorstore = Chroma(
+            embedding_function=embed,  # Reuse the same embedding model used for indexing
+            persist_directory="./chroma_data"
+        )
+        results = vectorstore.similarity_search(question, k)
+        return results
+    except Exception as e:
+        print(f"Error asking Chroma: {e}")
+        return []
+
+
+# similars = ask_chroma(
+#     "¿Quienes asisten al consejo de barrio?", 2)
+# for similar in similars:
+#     print(similar.page_content + "\n" * 3)
requirements.txt ADDED
@@ -0,0 +1,8 @@
+requests
+beautifulsoup4
+pandas
+langchain
+langchain-ollama
+langchain-chroma
+streamlit
+
scrap.py ADDED
@@ -0,0 +1,106 @@
+import re
+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+from chroma_utils import save_handbook_to_chroma
+
+
+def get_chapters(base_url):
+    """
+    Gets all URLs from the General Handbook main page by finding links in the doc-map structure.
+    Only returns chapter-level URLs without section anchors.
+
+    Returns:
+        list: List of URLs for all chapters in the handbook
+    """
+    response = requests.get(base_url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Extract language from base_url if present
+    lang_match = re.search(r'lang=([a-z]{3})', base_url)
+    lang = lang_match.group(1) if lang_match else 'eng'
+
+    # Find all links within doc-map class elements
+    doc_maps = soup.find_all("ul", class_="doc-map")
+    urls = []
+
+    for doc_map in doc_maps:
+        links = doc_map.find_all("a", class_="list-tile")
+        for link in links:
+            href = link.get('href')
+            if href:
+                # Remove any section anchors and query parameters
+                base_href = href.split('?')[0].split('#')[0]
+                # Construct full URL from relative path, including language if present
+                full_url = f"https://www.churchofjesuschrist.org{base_href}?lang={lang}"
+                urls.append(full_url)
+
+    # Remove duplicates while preserving order
+    unique_urls = list(dict.fromkeys(urls))
+
+    return unique_urls
+
+
+def get_sections(url):
+    """
+    Gets all sections from a chapter page with their titles, URLs and text content.
+    Only processes sections within the body-block div.
+
+    Args:
+        url (str): URL of the chapter page
+
+    Returns:
+        list: List of dicts with the title, URL and text content of each section
+    """
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find the body-block div first
+    body_block = soup.find("div", class_="body-block")
+    if not body_block:
+        return []
+
+    sections = body_block.find_all("section")
+    result = []
+    for section in sections:
+        # Get section title
+        header = section.find("header")
+        if header:
+            heading = header.find(re.compile(r"h\d+"))
+            title = heading.text if heading else ""
+
+            # Get section URL from header link
+            link = header.find("a", class_="cross-ref")
+            section_id = section.get('id')
+            section_url = f"https://www.churchofjesuschrist.org{link['href']}" if link else f"{url}#{section_id}"
+
+            # Get section text
+            paragraphs = section.find_all("p")
+            # Exclude title-number paragraphs
+            text = [p.text for p in paragraphs
+                    if not p.get("class") or "title-number" not in p["class"]]
+            text = " ".join(text)
+
+            result.append({
+                'title': title,
+                'url': section_url,
+                'text': text
+            })
+
+    return result
+
+
+def update_handbook_data(handbook_url):
+    chapters_urls = get_chapters(handbook_url)
+    total_chapters = len(chapters_urls)
+    handbook_data = []
+    for i, chapter_url in enumerate(chapters_urls):
+        chapter_sections = get_sections(chapter_url)
+        if chapter_sections:
+            handbook_data.append(chapter_sections)
+        print(f"Progress: {int(((i + 1) / total_chapters) * 100)}%")
+    save_handbook_to_chroma(handbook_data)
+
+
+update_handbook_data(
+    "https://www.churchofjesuschrist.org/study/manual/general-handbook?lang=spa")