acecalisto3 committed on
Commit
4b6bb13
·
verified ·
1 Parent(s): 88430cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -9
app.py CHANGED
@@ -26,11 +26,7 @@ class Config:
26
  REQUEST_TIMEOUT = 10
27
  MAX_DEPTH = 1
28
  SIMILARITY_THRESHOLD = 0.4 # Lowered threshold for testing
29
- # Add some example URLs that are publicly accessible
30
- BASE_URLS = [
31
- "https://www.sspencer.k12.in.us/", # Replace with actual school website
32
- # Add more public URLs here
33
- ]
34
 
35
  class ResourceItem:
36
  def __init__(self, url: str, content: str, type: str):
@@ -58,11 +54,25 @@ class SchoolChatbot:
58
  logger.error(f"Error setting up models: {e}")
59
  raise
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def crawl_and_index_resources(self):
62
- """Crawl and index resources from the base URLs."""
63
  logger.info("Starting to crawl and index resources...")
64
- for base_url in Config.BASE_URLS:
65
- self.crawl_url(base_url, depth=0)
 
66
  logger.info("Crawling and indexing completed.")
67
 
68
  def crawl_url(self, url: str, depth: int):
@@ -218,7 +228,7 @@ Source: {best_resource.url}
218
  def is_valid_url(self, url: str) -> bool:
219
  """Check if a URL is valid and within the base domain"""
220
  parsed_url = urlparse(url)
221
- return parsed_url.scheme in ('http', 'https') and any(base_url in url for base_url in Config.BASE_URLS)
222
 
223
  def create_gradio_interface(chatbot: SchoolChatbot):
224
  def respond(user_input):
 
26
  REQUEST_TIMEOUT = 10
27
  MAX_DEPTH = 1
28
  SIMILARITY_THRESHOLD = 0.4 # Lowered threshold for testing
29
+ GOOGLE_DOC_URL = "https://docs.google.com/document/d/1p5jt-2ZEa1HLibkNPpPhmMRTLEhQZq38dH9hNTnSHo8/edit?tab=t.0"
 
 
 
 
30
 
31
  class ResourceItem:
32
  def __init__(self, url: str, content: str, type: str):
 
54
  logger.error(f"Error setting up models: {e}")
55
  raise
56
 
57
+ def fetch_google_doc_content(self):
58
+ """Fetch content from the Google Document and extract URLs"""
59
+ try:
60
+ response = requests.get(Config.GOOGLE_DOC_URL)
61
+ response.raise_for_status()
62
+ soup = BeautifulSoup(response.content, 'html.parser')
63
+ urls = [a['href'] for a in soup.find_all('a', href=True)]
64
+ logger.info(f"Extracted {len(urls)} URLs from Google Document")
65
+ return urls
66
+ except Exception as e:
67
+ logger.error(f"Error fetching Google Document content: {e}")
68
+ return []
69
+
70
  def crawl_and_index_resources(self):
71
+ """Crawl and index resources from the extracted URLs"""
72
  logger.info("Starting to crawl and index resources...")
73
+ urls = self.fetch_google_doc_content()
74
+ for url in urls:
75
+ self.crawl_url(url, depth=0)
76
  logger.info("Crawling and indexing completed.")
77
 
78
  def crawl_url(self, url: str, depth: int):
 
228
  def is_valid_url(self, url: str) -> bool:
229
  """Check if a URL is valid and within the base domain"""
230
  parsed_url = urlparse(url)
231
+ return parsed_url.scheme in ('http', 'https')
232
 
233
  def create_gradio_interface(chatbot: SchoolChatbot):
234
  def respond(user_input):