Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -26,11 +26,7 @@ class Config:
|
|
26 |
REQUEST_TIMEOUT = 10
|
27 |
MAX_DEPTH = 1
|
28 |
SIMILARITY_THRESHOLD = 0.4 # Lowered threshold for testing
|
29 |
-
|
30 |
-
BASE_URLS = [
|
31 |
-
"https://www.sspencer.k12.in.us/", # Replace with actual school website
|
32 |
-
# Add more public URLs here
|
33 |
-
]
|
34 |
|
35 |
class ResourceItem:
|
36 |
def __init__(self, url: str, content: str, type: str):
|
@@ -58,11 +54,25 @@ class SchoolChatbot:
|
|
58 |
logger.error(f"Error setting up models: {e}")
|
59 |
raise
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def crawl_and_index_resources(self):
|
62 |
-
"""Crawl and index resources from the
|
63 |
logger.info("Starting to crawl and index resources...")
|
64 |
-
|
65 |
-
|
|
|
66 |
logger.info("Crawling and indexing completed.")
|
67 |
|
68 |
def crawl_url(self, url: str, depth: int):
|
@@ -218,7 +228,7 @@ Source: {best_resource.url}
|
|
218 |
def is_valid_url(self, url: str) -> bool:
|
219 |
"""Check if a URL is valid and within the base domain"""
|
220 |
parsed_url = urlparse(url)
|
221 |
-
return parsed_url.scheme in ('http', 'https')
|
222 |
|
223 |
def create_gradio_interface(chatbot: SchoolChatbot):
|
224 |
def respond(user_input):
|
|
|
26 |
REQUEST_TIMEOUT = 10
|
27 |
MAX_DEPTH = 1
|
28 |
SIMILARITY_THRESHOLD = 0.4 # Lowered threshold for testing
|
29 |
+
GOOGLE_DOC_URL = "https://docs.google.com/document/d/1p5jt-2ZEa1HLibkNPpPhmMRTLEhQZq38dH9hNTnSHo8/edit?tab=t.0"
|
|
|
|
|
|
|
|
|
30 |
|
31 |
class ResourceItem:
|
32 |
def __init__(self, url: str, content: str, type: str):
|
|
|
54 |
logger.error(f"Error setting up models: {e}")
|
55 |
raise
|
56 |
|
57 |
+
def fetch_google_doc_content(self):
    """Fetch the configured Google Document and extract hyperlink URLs.

    Returns:
        list[str]: every ``href`` value found in the fetched HTML, or an
        empty list if the request or the parsing fails.
    """
    try:
        # Bound the request with the shared timeout so an unresponsive
        # endpoint cannot hang the crawl (requests has no default timeout).
        response = requests.get(
            Config.GOOGLE_DOC_URL, timeout=Config.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        # NOTE(review): GOOGLE_DOC_URL points at the interactive /edit page,
        # which is largely JavaScript-rendered; a static fetch may not expose
        # the document's links. Consider the export endpoint
        # (.../export?format=html) -- confirm against the actual document.
        soup = BeautifulSoup(response.content, 'html.parser')
        urls = [a['href'] for a in soup.find_all('a', href=True)]
        logger.info(f"Extracted {len(urls)} URLs from Google Document")
        return urls
    except Exception as e:
        # Best-effort: a failed fetch degrades to "no URLs" rather than
        # aborting the whole indexing run.
        logger.error(f"Error fetching Google Document content: {e}")
        return []
|
69 |
+
|
70 |
def crawl_and_index_resources(self):
    """Pull seed links from the Google Document and crawl each one."""
    logger.info("Starting to crawl and index resources...")
    # Every link extracted from the document becomes a crawl seed at depth 0.
    for seed_url in self.fetch_google_doc_content():
        self.crawl_url(seed_url, depth=0)
    logger.info("Crawling and indexing completed.")
|
77 |
|
78 |
def crawl_url(self, url: str, depth: int):
|
|
|
228 |
def is_valid_url(self, url: str) -> bool:
    """Return True if *url* uses the http or https scheme.

    NOTE(review): the original docstring claimed this also checks that the
    URL is "within the base domain", but only the scheme is validated —
    any http(s) host passes. Add a netloc check if domain scoping is
    actually required.
    """
    parsed_url = urlparse(url)
    return parsed_url.scheme in ('http', 'https')
|
232 |
|
233 |
def create_gradio_interface(chatbot: SchoolChatbot):
|
234 |
def respond(user_input):
|