acecalisto3 committed on
Commit
4b6bb13
·
verified ·
1 Parent(s): 88430cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -9
app.py CHANGED
@@ -26,11 +26,7 @@ class Config:
26
  REQUEST_TIMEOUT = 10
27
  MAX_DEPTH = 1
28
  SIMILARITY_THRESHOLD = 0.4 # Lowered threshold for testing
29
- # Add some example URLs that are publicly accessible
30
- BASE_URLS = [
31
- "https://www.sspencer.k12.in.us/", # Replace with actual school website
32
- # Add more public URLs here
33
- ]
34
 
35
  class ResourceItem:
36
  def __init__(self, url: str, content: str, type: str):
@@ -58,11 +54,25 @@ class SchoolChatbot:
58
  logger.error(f"Error setting up models: {e}")
59
  raise
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def crawl_and_index_resources(self):
62
- """Crawl and index resources from the base URLs."""
63
  logger.info("Starting to crawl and index resources...")
64
- for base_url in Config.BASE_URLS:
65
- self.crawl_url(base_url, depth=0)
 
66
  logger.info("Crawling and indexing completed.")
67
 
68
  def crawl_url(self, url: str, depth: int):
@@ -218,7 +228,7 @@ Source: {best_resource.url}
218
  def is_valid_url(self, url: str) -> bool:
219
  """Check if a URL is valid and within the base domain"""
220
  parsed_url = urlparse(url)
221
- return parsed_url.scheme in ('http', 'https') and any(base_url in url for base_url in Config.BASE_URLS)
222
 
223
  def create_gradio_interface(chatbot: SchoolChatbot):
224
  def respond(user_input):
 
26
  REQUEST_TIMEOUT = 10
27
  MAX_DEPTH = 1
28
  SIMILARITY_THRESHOLD = 0.4 # Lowered threshold for testing
29
+ GOOGLE_DOC_URL = "https://docs.google.com/document/d/1p5jt-2ZEa1HLibkNPpPhmMRTLEhQZq38dH9hNTnSHo8/edit?tab=t.0"
 
 
 
 
30
 
31
  class ResourceItem:
32
  def __init__(self, url: str, content: str, type: str):
 
54
  logger.error(f"Error setting up models: {e}")
55
  raise
56
 
57
+ def fetch_google_doc_content(self):
58
+ """Fetch content from the Google Document and extract URLs"""
59
+ try:
60
+ response = requests.get(Config.GOOGLE_DOC_URL)
61
+ response.raise_for_status()
62
+ soup = BeautifulSoup(response.content, 'html.parser')
63
+ urls = [a['href'] for a in soup.find_all('a', href=True)]
64
+ logger.info(f"Extracted {len(urls)} URLs from Google Document")
65
+ return urls
66
+ except Exception as e:
67
+ logger.error(f"Error fetching Google Document content: {e}")
68
+ return []
69
+
70
  def crawl_and_index_resources(self):
71
+ """Crawl and index resources from the extracted URLs"""
72
  logger.info("Starting to crawl and index resources...")
73
+ urls = self.fetch_google_doc_content()
74
+ for url in urls:
75
+ self.crawl_url(url, depth=0)
76
  logger.info("Crawling and indexing completed.")
77
 
78
  def crawl_url(self, url: str, depth: int):
 
228
  def is_valid_url(self, url: str) -> bool:
229
  """Check if a URL is valid and within the base domain"""
230
  parsed_url = urlparse(url)
231
+ return parsed_url.scheme in ('http', 'https')
232
 
233
  def create_gradio_interface(chatbot: SchoolChatbot):
234
  def respond(user_input):