acecalisto3 committed · commit d1685e2 · 1 parent: b320b58
Update agent.py
agent.py CHANGED
@@ -11,8 +11,7 @@ from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from selenium.
-from selenium.webdriver.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from transformers import pipeline
 import feedparser
@@ -26,18 +25,10 @@ PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since
 HISTORY = []
 CURRENT_TASK = None

-# Define the list of URLs to monitor (you can add more URLs here)
-URLS_TO_MONITOR = [
-    "https://www.example1.com/",
-    "https://www.example2.com/",
-    "https://www.example3.com/",
-    # Add as many URLs as needed
-]
-
 # Function to monitor URLs for changes
-def monitor_urls(storage_location, urls, scrape_interval, content_type):
+def monitor_urls(storage_location, urls, scrape_interval, content_type, selector=None):
     global HISTORY
-    previous_hashes = {url: "" for url in urls}
+    previous_hashes = {url: "" for url in urls}

     try:
         with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
@@ -45,15 +36,26 @@ def monitor_urls(storage_location, urls, scrape_interval, content_type):
             for url in urls:
                 try:
                     driver.get(url)
-
+                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))  # Wait for basic page load
+                    time.sleep(2)  # Additional wait for dynamic content
+
                     if content_type == "text":
                         current_content = driver.page_source
                     elif content_type == "media":
-
+                        if selector:
+                            try:
+                                elements = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
+                                current_content = [element.get_attribute('src') for element in elements]
+                            except TimeoutException:
+                                logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
+                                current_content = []
+                        else:
+                            current_content = driver.find_elements(By.TAG_NAME, "img")
                     else:
                         current_content = driver.page_source
+
                     current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-                    if current_hash!= previous_hashes[url]:
+                    if current_hash != previous_hashes[url]:
                         previous_hashes[url] = current_hash
                         date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                         HISTORY.append(f"Change detected at {url} on {date_time_str}")
@@ -61,14 +63,14 @@ def monitor_urls(storage_location, urls, scrape_interval, content_type):
                             csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
                             csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
                         logging.info(f"Change detected at {url} on {date_time_str}")
-                except (NoSuchElementException, Exception) as e:
+                except (NoSuchElementException, StaleElementReferenceException, Exception) as e:
                     logging.error(f"Error accessing {url}: {e}")
                 time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
     except Exception as e:
         logging.error(f"Error starting ChromeDriver: {e}")

 # Function to start scraping
-def start_scraping(storage_location, urls, scrape_interval, content_type):
+def start_scraping(storage_location, urls, scrape_interval, content_type, selector=None):
     global CURRENT_TASK, HISTORY

     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
@@ -84,29 +86,40 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
         try:
             with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
                 driver.get(url)
-
+                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))  # Wait for basic page load
+                time.sleep(2)  # Additional wait for dynamic content
+
                 if content_type == "text":
                     initial_content = driver.page_source
                 elif content_type == "media":
-
+                    if selector:
+                        try:
+                            elements = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
+                            initial_content = [element.get_attribute('src') for element in elements]
+                        except TimeoutException:
+                            logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
+                            initial_content = []
+                    else:
+                        initial_content = driver.find_elements(By.TAG_NAME, "img")
                 else:
                     initial_content = driver.page_source
+
                 initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
                 HISTORY.append(f"Initial observation at {url}: {initial_hash}")
                 with open(os.path.join(folder_path, f"{hostname}_initial_observation.txt"), "w") as file:
                     file.write(f"Initial observation at {url}: {initial_hash}")
-        except (NoSuchElementException, Exception) as e:
+        except (NoSuchElementException, StaleElementReferenceException, Exception) as e:
             HISTORY.append(f"Error accessing {url}: {e}")

     # Monitor the URLs
-    monitor_urls(storage_location, urls, scrape_interval, content_type)
+    monitor_urls(storage_location, urls, scrape_interval, content_type, selector)

     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."

 # Function to display CSV content
-def display_csv(url):
+def display_csv(storage_location, url):
     hostname = urlparse(url).hostname
-    folder_path = os.path.join(
+    folder_path = os.path.join(storage_location, hostname)
     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
     if os.path.exists(csv_path):
         with open(csv_path, "r") as file:
@@ -123,9 +136,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     return response

 # Function to generate RSS feed for a given URL
-def generate_rss_feed(url):
+def generate_rss_feed(storage_location, url):
     hostname = urlparse(url).hostname
-    folder_path = os.path.join(
+    folder_path = os.path.join(storage_location, hostname)
     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
     if os.path.exists(csv_path):
         with open(csv_path, "r") as file:
@@ -147,47 +160,7 @@ def generate_rss_feed(url):
     return "No data available."

 # Function to handle user input and generate response
-def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type):
+def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type, selector):
     response = respond(message, history, system_message, max_tokens, temperature, top_p)
     history.append((message, response))
-    return history, response
+    return history, response
-
-if __name__ == "__main__":
-    # Define the Gradio interface
-    import gradio as gr
-
-    def create_interface():
-        with gr.Blocks() as demo:
-            with gr.Row():
-                with gr.Column():
-                    message = gr.Textbox(label="Message")
-                    system_message = gr.Textbox(value="You are a helpful assistant.", label="System message")
-                    max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
-                    temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
-                    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-                    storage_location = gr.Textbox(value="scraped_data", label="Storage Location")
-                    urls = gr.Textbox(label="URLs (comma separated)")
-                    scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
-                    content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
-                    start_button = gr.Button("Start Scraping")
-                    csv_output = gr.Textbox(label="CSV Output", interactive=False)
-
-                with gr.Column():
-                    chat_history = gr.Chatbot(label="Chat History")
-                    response_box = gr.Textbox(label="Response")
-
-            start_button.click(start_scraping, inputs=[storage_location, urls, scrape_interval, content_type], outputs=csv_output)
-            message.submit(chat_interface, inputs=[message, chat_history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type], outputs=[chat_history, response_box])
-
-            # Add a button to display the RSS feed for a selected URL
-            with gr.Row():
-                selected_url = gr.Textbox(label="Select URL for RSS Feed")
-                rss_button = gr.Button("Generate RSS Feed")
-                rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
-
-            rss_button.click(generate_rss_feed, inputs=[selected_url], outputs=rss_output)
-
-        return demo
-
-    demo = create_interface()
-    demo.launch()
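A caveat on the new "media" branch: when no selector is given, current_content is a list of Selenium WebElement objects, and str() of that list embeds session- and element-specific ids, so the MD5 hash can differ between runs even when the page itself is unchanged. A minimal sketch of a session-independent alternative, assuming a live WebDriver; media_fingerprint is an illustrative name, not part of this file:

from selenium.webdriver.common.by import By

def media_fingerprint(driver, selector=None):
    # Collect src attributes rather than the WebElement objects themselves,
    # so the resulting string is stable across browser sessions.
    if selector:
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
    else:
        elements = driver.find_elements(By.TAG_NAME, "img")
    # Sort so that reordering alone does not register as a content change.
    return "|".join(sorted(e.get_attribute("src") or "" for e in elements))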
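The change-detection core that both monitor_urls and start_scraping rely on is a per-URL hash comparison. A self-contained sketch of that idea, using only the standard library; content_changed is an illustrative name, not part of this file:

import hashlib

def content_changed(url, content, previous_hashes):
    # Hash the snapshot and compare with the last hash recorded for this URL.
    current_hash = hashlib.md5(str(content).encode("utf-8")).hexdigest()
    if current_hash != previous_hashes.get(url, ""):
        previous_hashes[url] = current_hash
        return True
    return False

hashes = {}
print(content_changed("https://www.example.com/", "<html>v1</html>", hashes))  # True
print(content_changed("https://www.example.com/", "<html>v1</html>", hashes))  # False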
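With the if __name__ == "__main__" Gradio block removed, the module no longer launches anything on its own; callers now have to pass storage_location (and optionally the new selector argument) explicitly. A hypothetical driver script under those assumptions — the import path, URL, and selector are placeholders:

from agent import start_scraping, display_csv, generate_rss_feed

storage_location = "scraped_data"
urls = ["https://www.culvers.com/"]

# selector is the optional argument added in this commit; a CSS selector
# narrows "media" monitoring to matching elements' src attributes.
# Note that start_scraping blocks for at least one scrape interval as written.
start_scraping(storage_location, urls, scrape_interval=5,
               content_type="media", selector="img")

# display_csv and generate_rss_feed now take storage_location explicitly,
# since folder_path is built from it rather than left dangling.
print(display_csv(storage_location, urls[0]))
print(generate_rss_feed(storage_location, urls[0]))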