Spaces:

acecalisto3
/

CEEMEESEEK

Runtime error

App Files Community

acecalisto3 committed on Oct 1, 2024

Commit

b9f24c9

verified ·

1 Parent(s): ca646d2

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -76

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-```python
 import os
 import time
 import hashlib
@@ -30,17 +29,14 @@ HISTORY = []
 CURRENT_TASK = None
 STOP_THREADS = False
-# Define the list of URLs to monitor (you can add more URLs here)
-URLS_TO_MONITOR = ["https://twitter.com/wlcscrdp", "https://www.facebook.com/aurorareddevils/", "https://www.facebook.com/brightpanthers/", "https://www.facebook.com/carrollcountychamberin/", "https://www.facebook.com/Culver.Cavs.MHS", "https://www.facebook.com/culver.elementary.school", "https://www.facebook.com/CulverCommunitySchools", "https://www.facebook.com/DillsboroBulldogs/", "https://www.facebook.com/ECMSTROJANS", "https://www.facebook.com/enjoywhitecountyIN/", "https://www.facebook.com/farmersvilleelementary", "https://www.facebook.com/groups/SDMSparents", "https://www.facebook.com/jghsart/", "https://www.facebook.com/jgmusicdept", "https://www.facebook.com/John-Glenn-Education-Foundation-208326199636364/", "https://www.facebook.com/John-Glenn-High-School-1102148953201006/", "https://www.facebook.com/John-Glenn-Theatre-Company-383638295064502/", "https://www.facebook.com/JohnGlennFalconsAthletics", "https://www.facebook.com/KIRPC-Head-Start-1485812354989001", "https://www.facebook.com/KIRPC1", "https://www.facebook.com/LHNEeagles", "https://www.facebook.com/LuceElementarySchool/", "https://www.facebook.com/marrselementary", "https://www.facebook.com/messhiners/", "https://www.facebook.com/monticellocitypool", "https://www.facebook.com/monticelloinwastewater/", "https://www.facebook.com/MooresHillBobcats/", "https://www.facebook.com/msdmv", "https://www.facebook.com/msdnorthposey", "https://www.facebook.com/MUTPL/", "https://www.facebook.com/MVJHS/", "https://www.facebook.com/mvshs", "https://www.facebook.com/njspjrsrhighschool?mibextid=b06tZ0", "https://www.facebook.com/NorthElementaryStars/", "https://www.facebook.com/NorthLibertyElementary/", "https://www.facebook.com/northposey/", "https://www.facebook.com/northposeyhs/", "https://www.facebook.com/NPJuniorHigh", "https://www.facebook.com/Prairie-Heights-Elementary-659322230934707/", "https://www.facebook.com/Prairie-Heights-High-School-2027713067459043/", 
"https://www.facebook.com/PrairieHeightsPanthers/", "https://www.facebook.com/profile.php?id=100057030237096", "https://www.facebook.com/profile.php?id=100057451179651", "https://www.facebook.com/profile.php?id=100063463513451", "https://www.facebook.com/profile.php?id=100063612319256", "https://www.facebook.com/profile.php?id=100064532596422", "https://www.facebook.com/profile.php?id=100067180226810", "https://www.facebook.com/profile.php?id=61563484312348", "https://www.facebook.com/PTOSWES/", "https://www.facebook.com/RandolphSouthern/", "https://www.facebook.com/RochesterMiddleSchool", "https://www.facebook.com/RochesterZebraNewTechHigh", "https://www.facebook.com/rockportelementarysouthspencer/", "https://www.facebook.com/satellitesathletics/", "https://www.facebook.com/seymourcommunityschools/", "https://www.facebook.com/SeymourHighSchool/", "https://www.facebook.com/SouthDearbornHighSchool/", "https://www.facebook.com/southarbornschools/", "https://www.facebook.com/SouthDearbornSquires/", "https://www.facebook.com/southspencerhighschool", "https://www.facebook.com/southspencermiddleschool/", "https://www.facebook.com/SouthSpencerSchools", "https://www.facebook.com/SouthTerracePanthers/", "https://www.facebook.com/sunmantigers/", "https://www.facebook.com/SWShelbySpartan/", "https://www.facebook.com/TallTimbersMarina", "https://www.facebook.com/WabashValleyESC/", "https://www.facebook.com/Walkerton-Elementary-School-283088605088622/", "https://www.facebook.com/westcentralcte/", "https://www.facebook.com/westelementary", "https://www.facebook.com/wlcscrdp", "https://www.instagram.com/mutpl/", "https://www.instagram.com/northposeyhsathletics", "https://www.instagram.com/rchsprincipalcook/", "https://www.instagram.com/southdearbornhighschool/", "https://www.instagram.com/southdearbornschools/", "https://www.instagram.com/westcentralcte/", "https://www.tiktok.com/@mutplteen"]
-# Function to monitor URLs for changes
-def monitor_urls(storage_location, urls, scrape_interval, content_type):
-    global HISTORY, STOP_THREADS
     previous_hashes = {url: "" for url in urls}  # Use a dictionary for better organization
     try:
         with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
-            while not STOP_THREADS:
                 for url in urls:
                     try:
                         driver.get(url)
@@ -49,14 +45,9 @@ def monitor_urls(storage_location, urls, scrape_interval, content_type):
                             current_content = driver.page_source
                         elif content_type == "media":
                             current_content = driver.find_elements(By.TAG_NAME, "img")
-                        elif content_type == "both":
-                            current_content = driver.page_source + str(driver.find_elements(By.TAG_NAME, "img"))
                         else:
                             current_content = driver.page_source
-                        # Calculate hash based on selected content type
                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
                         if current_hash != previous_hashes[url]:
                             previous_hashes[url] = current_hash
                             date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -65,17 +56,15 @@ def monitor_urls(storage_location, urls, scrape_interval, content_type):
                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
                                 logging.info(f"Change detected at {url} on {date_time_str}")
                     except (NoSuchElementException, Exception) as e:
                         logging.error(f"Error accessing {url}: {e}")
                 time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
     except Exception as e:
         logging.error(f"Error starting ChromeDriver: {e}")
-# Function to start scraping
-def start_scraping(storage_location, urls, scrape_interval, content_type):
-    global CURRENT_TASK, HISTORY, STOP_THREADS
-    STOP_THREADS = False  # Reset stop flag
     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
     HISTORY.append(f"Task started: {CURRENT_TASK}")
@@ -95,8 +84,6 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
                     initial_content = driver.page_source
                 elif content_type == "media":
                     initial_content = driver.find_elements(By.TAG_NAME, "img")
-                elif content_type == "both":
-                    initial_content = driver.page_source + str(driver.find_elements(By.TAG_NAME, "img"))
                 else:
                     initial_content = driver.page_source
                 initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
@@ -106,64 +93,17 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
         except (NoSuchElementException, Exception) as e:
             HISTORY.append(f"Error accessing {url}: {e}")
-    # Start a separate thread for monitoring
-    thread = threading.Thread(target=monitor_urls, args=(storage_location, urls, scrape_interval, content_type))
-    thread.start()
     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
-# Function to display CSV content
-def display_csv(url):
-    hostname = urlparse(url).hostname
-    folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
-    csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
-    if os.path.exists(csv_path):
-        with open(csv_path, "r") as file:
-            return file.read()
-    else:
-        return "No data available."
-# Define the chat response function using the Mistral model
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    model = AutoModelForSeq2SeqLM.from_pretrained_model("mistralai/Mixtral-8x7B-Instruct-v0.1")
-    tokenizer = AutoTokenizer.from_pretrained_model("mistralai/Mixtral-8x7B-Instruct-v0.1")
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-    response = pipe(f"User: {message}\nHistory: {history}\nSystem: {system_message}", max_length=max_tokens, temperature=temperature, top_p=top_p)[0]
-    return response
-# Function to generate RSS feed for a given URL
-def generate_rss_feed(url):
-    hostname = urlparse(url).hostname
-    folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
-    csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
-    if os.path.exists(csv_path):
-        with open(csv_path, "r") as file:
-            reader = csv.DictReader(file)
-            feed = feedparser.parse(f"rss.xml")  # Create a new feed object
-            feed.feed.title = f"Changes for {hostname}"
-            feed.feed.link = url
-            feed.feed.description = "Recent changes detected on the website."
-            feed.entries = []
-            for row in reader:
-                feed.entries.append({
-                    "title": f"Change detected at {row['url']}",
-                    "link": row['url'],
-                    "description": f"Content changed on {row['date']} at {row['time']}",
-                    "published": datetime.datetime.strptime(f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S").isoformat(),
-                })
-            return feed.entries
-    else:
-        return "No data available."
-# Function to handle user input and generate response
-def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type):
-    response = respond(message, history, system_message, max_tokens, temperature, top_p)
-    history.append((message, response))
-    return history, response
-# Gradio Interface
-import gradio as gr
 def create_interface():
     with gr.Blocks() as demo:
         with gr.Row():
@@ -185,8 +125,9 @@ def create_interface():
                 chat_history = gr.Chatbot(label="Chat History")
                 response_box = gr.Textbox(label="Response")
-        start_button.click(start_scraping, inputs=[storage_location, urls, scrape_interval, content_type], outputs=csv_output)
-        stop_button.click(lambda: (STOP_THREADS, "Scraping stopped."), outputs=[stop_button, csv_output])
         message.submit(chat_interface, inputs=[message, chat_history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type], outputs=[chat_history, response_box])
         # Add a button to display the CSV content for a selected URL

 import os
 import time
 import hashlib
 CURRENT_TASK = None
 STOP_THREADS = False
+# Define a function to monitor URLs for changes
+def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_scraping_flag):
+    global HISTORY
     previous_hashes = {url: "" for url in urls}  # Use a dictionary for better organization
     try:
         with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
+            while not stop_scraping_flag[0]:
                 for url in urls:
                     try:
                         driver.get(url)
                             current_content = driver.page_source
                         elif content_type == "media":
                             current_content = driver.find_elements(By.TAG_NAME, "img")
                         else:
                             current_content = driver.page_source
                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
                         if current_hash != previous_hashes[url]:
                             previous_hashes[url] = current_hash
                             date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
                                 logging.info(f"Change detected at {url} on {date_time_str}")
                     except (NoSuchElementException, Exception) as e:
                         logging.error(f"Error accessing {url}: {e}")
                 time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
     except Exception as e:
         logging.error(f"Error starting ChromeDriver: {e}")
+# Define a function to start scraping
+def start_scraping(storage_location, urls, scrape_interval, content_type, stop_scraping_flag):
+    global CURRENT_TASK, HISTORY
     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
     HISTORY.append(f"Task started: {CURRENT_TASK}")
                     initial_content = driver.page_source
                 elif content_type == "media":
                     initial_content = driver.find_elements(By.TAG_NAME, "img")
                 else:
                     initial_content = driver.page_source
                 initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
         except (NoSuchElementException, Exception) as e:
             HISTORY.append(f"Error accessing {url}: {e}")
+    # Start a new thread for monitoring URLs
+    threading.Thread(target=monitor_urls, args=(storage_location, urls, scrape_interval, content_type, stop_scraping_flag)).start()
     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
+# Define a function to stop scraping
+def stop_scraping(stop_scraping_flag):
+    stop_scraping_flag[0] = True
+    return "Scraping stopped."
+# Define the Gradio interface
 def create_interface():
     with gr.Blocks() as demo:
         with gr.Row():
                 chat_history = gr.Chatbot(label="Chat History")
                 response_box = gr.Textbox(label="Response")
+        stop_scraping_flag = [False]
+        start_button.click(start_scraping, inputs=[storage_location, urls, scrape_interval, content_type, stop_scraping_flag], outputs=csv_output)
+        stop_button.click(stop_scraping, inputs=[stop_scraping_flag], outputs=[csv_output])
         message.submit(chat_interface, inputs=[message, chat_history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type], outputs=[chat_history, response_box])
         # Add a button to display the CSV content for a selected URL