import csv
import datetime
import hashlib
import logging
import os
import time

import gradio as gr
from huggingface_hub import InferenceClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Prompt templates
PREFIX = "Task started at {date_time_str}. Purpose: {purpose}"
TASK_PROMPT = "Current task: {task}. History:\n{history}"

# Timestamp captured at import time (used when formatting PREFIX)
date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# High-level description of what this agent does
purpose = """
You go to Culvers sites, you continuously seek changes on them since your last observation.
Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data.
"""

# Shared mutable state: a log of observed events and the task currently running
history = []
current_task = None

# Default CSV output path; ensure its directory exists at import time
default_file_path = "user/app/scraped_data/culver/culvers_changes.csv"
os.makedirs(os.path.dirname(default_file_path), exist_ok=True)


def monitor_urls(storage_location, url1, url2, scrape_interval, content_type):
    """Poll two URLs forever, logging content changes to a CSV file.

    Args:
        storage_location: Path of the CSV file to write change records to.
            NOTE(review): opened in "w" mode, so a restart truncates any
            previously logged changes — confirm this is intended.
        url1, url2: The two URLs to watch.
        scrape_interval: Minutes to sleep between polling rounds.
        content_type: "text" hashes the full page source; "media" hashes the
            page's <img> elements; any other value (e.g. "both") falls back
            to the full page source.

    Blocks until interrupted (KeyboardInterrupt). Appends human-readable
    change messages to the module-level `history` list as a side effect.
    """
    global history
    urls = [url1, url2]
    previous_hashes = ["", ""]

    # Ensure the directory exists
    os.makedirs(os.path.dirname(storage_location), exist_ok=True)

    with open(storage_location, "w", newline='') as csvfile:
        csv_toolkit = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
        csv_toolkit.writeheader()

        options = Options()
        # BUG FIX: the `Options.headless` attribute was removed in recent
        # Selenium releases — pass the flag explicitly instead.
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        # `with` closes the driver on exit, so no explicit quit() is needed.
        with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
            try:
                while True:
                    for i, url in enumerate(urls):
                        try:
                            driver.get(url)
                            time.sleep(2)  # Wait for the page to load
                            if content_type == "text":
                                current_content = driver.page_source
                            elif content_type == "media":
                                # BUG FIX: find_elements_by_tag_name() was
                                # removed in Selenium 4; use the By API.
                                current_content = driver.find_elements(By.TAG_NAME, "img")
                            else:
                                current_content = driver.page_source
                            current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
                            if current_hash != previous_hashes[i]:
                                previous_hashes[i] = current_hash
                                date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                                history.append(f"Change detected at {url} on {date_time_str}")
                                csv_toolkit.writerow({
                                    "date": date_time_str.split()[0],
                                    "time": date_time_str.split()[1],
                                    "url": url,
                                    "change": "Content changed",
                                })
                                logging.info(f"Change detected at {url} on {date_time_str}")
                        except Exception as e:
                            logging.error(f"Error accessing {url}: {e}")
                    time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
            except KeyboardInterrupt:
                logging.info("Monitoring stopped by user.")


def handle_input(storage_location, url1, url2, scrape_interval, content_type):
    """Record the monitoring task, run it, and report the task history.

    NOTE(review): monitor_urls() loops forever, so this only returns after a
    KeyboardInterrupt — consider running it on a background thread if the
    return value is meant to reach the UI.
    """
    global current_task, history
    current_task = f"Monitoring URLs: {url1}, {url2}"
    history.append(f"Task started: {current_task}")
    monitor_urls(storage_location, url1, url2, scrape_interval, content_type)
    return TASK_PROMPT.format(task=current_task, history="\n".join(history))


# Hosted inference endpoint used for the chat side of the app
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    storage_location=default_file_path,
    url1=None,
    url2=None,
    scrape_interval=None,
    content_type=None,
):
    """Stream a chat completion for `message`, yielding the growing reply.

    The trailing monitoring parameters mirror the extra Gradio inputs
    declared on the ChatInterface. BUG FIX: the interface passes NINE
    additional inputs positionally, but the original signature accepted
    only four — every chat submit raised TypeError. They are accepted
    (with safe defaults) and intentionally unused by the chat itself.
    """
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    response = ""
    # BUG FIX: the loop variable previously shadowed the `message` parameter,
    # and the final stream chunk's delta.content can be None, which made
    # `response += token` raise TypeError.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content or ""
        response += token
        yield response
# Create the Gradio chat interface. The first four extra inputs configure
# the chatbot; the remaining five expose the site-monitoring settings.
_additional_inputs = [
    gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    gr.Textbox(value=default_file_path, label="Storage Location"),
    gr.Textbox(value="https://www.culver.k12.in.us/", label="URL 1"),
    gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2"),
    gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)"),
    gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type"),
]

demo = gr.ChatInterface(
    respond,
    additional_inputs=_additional_inputs,
    title="Culvers Site Monitor and Chatbot",
    description="Monitor changes on Culvers' websites and log them into a CSV file. Also, chat with a friendly chatbot.",
)

if __name__ == "__main__":
    demo.launch()