# Captured from a Hugging Face Spaces page.
# Space status at capture time: Runtime error
import csv
import datetime
import hashlib
import logging
import os
import threading
import time
from pathlib import Path

import gradio as gr
from huggingface_hub import InferenceClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Configure logging: every record goes both to "monitoring.log" and to the
# console. The file handler is opened as UTF-8 so that logged URLs or error
# text containing non-ASCII characters do not depend on the platform default.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("monitoring.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
# Prompt templates (filled with str.format by whoever consumes them).
PREFIX = "Task started at {date_time_str}. Purpose: {purpose}"
TASK_PROMPT = "Current task: {task}. History:\n{history}"

# Free-text description of what the monitor is for.
purpose = """
You monitor Culvers sites continuously, seeking changes since your last observation.
Any new changes are logged and dumped into a CSV, stored in your log folder at user/app/scraped_data.
"""

# Mutable module state shared between the UI callbacks and the worker thread.
history = []  # human-readable event lines for the current monitoring run
current_task = None  # description of the running task, or None when idle
monitoring_thread = None  # threading.Thread running monitor_urls, or None
stop_event = threading.Event()  # set to ask the worker thread to exit

# Default CSV location, built with pathlib for cross-platform compatibility.
default_file_path = Path("user/app/scraped_data/culver/culvers_changes.csv")

# Ensure the directory for the default CSV exists at import time.
default_file_path.parent.mkdir(parents=True, exist_ok=True)
def _snapshot_content(driver, content_type):
    """Return the string whose hash represents the current state of the loaded page."""
    if content_type in ("media", "both"):
        # get_attribute("src") may return None for <img> tags without a src;
        # those are skipped so the join cannot raise TypeError.
        sources = [
            src
            for img in driver.find_elements(By.TAG_NAME, "img")
            if (src := img.get_attribute("src"))
        ]
        media = "".join(sources)
        if content_type == "media":
            return media
        return driver.page_source + media
    # "text" and any unrecognised content type fall back to the page source.
    return driver.page_source


def _append_change_row(storage_path, fieldnames, url, detected_at):
    """Append one "Content changed" row for *url* to the CSV at *storage_path*."""
    with storage_path.open("a", newline="", encoding="utf-8") as csvfile:
        csv.DictWriter(csvfile, fieldnames=fieldnames).writerow({
            "date": detected_at.strftime("%Y-%m-%d"),
            "time": detected_at.strftime("%H:%M:%S"),
            "url": url,
            "change": "Content changed",
        })


def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_event):
    """
    Monitor the given URLs for changes and log them into a CSV file.

    Runs in a separate thread until ``stop_event`` is set. Each cycle loads
    every URL in a headless Chrome session, hashes the selected content and,
    when the hash differs from the previous one for that URL, appends a row
    to the CSV, appends a line to the module-level ``history`` list and logs
    it. The first cycle always records a change because the initial stored
    hash is the empty string.

    Args:
        storage_location: Path of the CSV file. The header row is written
            only if the file does not exist yet; missing parent directories
            are created.
        urls: Sequence of URLs to poll.
        scrape_interval: Minutes to wait between polling cycles.
        content_type: "text" (page source), "media" (concatenated <img> src
            values) or "both"; any other value is treated like "text".
        stop_event: ``threading.Event`` used to request shutdown.

    Errors for a single URL are logged and polling continues. Any other
    error, including CSV initialisation or driver start-up failures, is
    logged and ends the thread; the driver is always quit if it was created.
    """
    global history
    fieldnames = ["date", "time", "url", "change"]
    previous_hashes = [""] * len(urls)
    storage_path = Path(storage_location)
    driver = None
    try:
        # Initialize CSV file: write header if file doesn't exist.
        storage_path.parent.mkdir(parents=True, exist_ok=True)
        if not storage_path.exists():
            with storage_path.open("w", newline="", encoding="utf-8") as csvfile:
                csv.DictWriter(csvfile, fieldnames=fieldnames).writeheader()

        options = Options()
        # The `headless` attribute setter is deprecated/removed in Selenium 4;
        # the command-line switch is the supported way to run headless.
        options.add_argument("--headless=new")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )

        # The UI slider may deliver a float; normalise to whole seconds.
        wait_seconds = max(1, int(float(scrape_interval) * 60))

        while not stop_event.is_set():
            for i, url in enumerate(urls):
                if stop_event.is_set():
                    break
                try:
                    driver.get(url)
                    time.sleep(2)  # Wait for the page to load
                    current_content = _snapshot_content(driver, content_type)
                    current_hash = hashlib.md5(current_content.encode("utf-8")).hexdigest()
                    if current_hash != previous_hashes[i]:
                        previous_hashes[i] = current_hash
                        date_time = datetime.datetime.now()
                        date_time_str = date_time.strftime("%Y-%m-%d %H:%M:%S")
                        history_entry = f"Change detected at {url} on {date_time_str}"
                        history.append(history_entry)
                        _append_change_row(storage_path, fieldnames, url, date_time)
                        logging.info(history_entry)
                except Exception as e:
                    logging.error(f"Error accessing {url}: {e}")
            # Event.wait returns as soon as the event is set, so shutdown is
            # immediate instead of polling once per second.
            stop_event.wait(timeout=wait_seconds)
    except Exception as e:
        logging.error(f"Unexpected error in monitoring thread: {e}")
    finally:
        if driver is not None:
            driver.quit()
        logging.info("Monitoring thread has been stopped.")
def start_monitoring(storage_location, url1, url2, scrape_interval, content_type):
    """
    Start the background monitoring thread unless one is already running.

    Resets the module-level ``history``, records the task description,
    clears ``stop_event`` and launches ``monitor_urls`` in a daemon thread.
    Blank URLs are dropped; if no URL or no storage location is given,
    nothing is started and the current history is returned unchanged.

    Args:
        storage_location: Path of the CSV file to log changes to.
        url1: First URL to monitor.
        url2: Second URL to monitor.
        scrape_interval: Minutes between polling cycles.
        content_type: Content selector passed through to ``monitor_urls``.

    Returns:
        A ``(status_message, history)`` tuple, where ``history`` is the
        module-level list of event lines.
    """
    global current_task, monitoring_thread, history
    if monitoring_thread and monitoring_thread.is_alive():
        return "Monitoring is already running.", history

    # Validate up front: a blank URL or path would otherwise only surface as
    # repeated errors in the worker thread while the UI reports success.
    urls = [u.strip() for u in (url1, url2) if u and u.strip()]
    if not urls:
        return "No URLs provided to monitor.", history
    if not storage_location or not str(storage_location).strip():
        return "No storage location provided.", history

    history = []
    current_task = f"Monitoring URLs: {', '.join(urls)}"
    history.append(f"Task started: {current_task}")
    logging.info(current_task)

    stop_event.clear()
    monitoring_thread = threading.Thread(
        target=monitor_urls,
        args=(storage_location, urls, scrape_interval, content_type, stop_event),
        daemon=True,
    )
    monitoring_thread.start()
    return "Monitoring started.", history
def stop_monitoring():
    """
    Ask the monitoring thread to stop and wait for it to finish.

    Returns:
        A ``(status_message, history)`` tuple. When no thread is alive the
        state is left untouched and a "not running" message is returned.
    """
    global current_task
    if not (monitoring_thread and monitoring_thread.is_alive()):
        return "No monitoring task is currently running.", history

    stop_event.set()
    # Blocks until the worker has left its loop and quit the browser.
    monitoring_thread.join()
    history.append("Monitoring stopped by user.")
    logging.info("Monitoring stopped by user.")
    current_task = None
    return "Monitoring stopped.", history
# Shared Hugging Face Inference client used by the chat response function.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """
    Stream a chat completion for ``message`` given the prior conversation.

    Args:
        message: The new user message.
        history: Prior ``(user, assistant)`` turns; empty parts are skipped
            and ``None`` is treated as no history.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The accumulated response text after each streamed chunk. If the
        request fails, the error is logged and a fixed error message is
        yielded instead.
    """
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history or []:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    response = ""
    try:
        for chunk in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            # The delta may be dict-like or a plain object depending on the
            # huggingface_hub version, and its content may be None (for
            # example on a final chunk); both cases must not break "+=".
            if isinstance(delta, dict):
                token = delta.get("content")
            else:
                token = getattr(delta, "content", None)
            response += token or ""
            yield response
    except Exception as e:
        logging.error(f"Error in chatbot response: {e}")
        yield "An error occurred while generating the response."
# Create Gradio interface: a "Monitor" tab that drives the background
# scraper and a "Chatbot" tab that streams replies from respond().
with gr.Blocks() as demo:
    gr.Markdown("# Culvers Site Monitor and Chatbot")
    gr.Markdown(
        "Monitor changes on Culvers' websites and log them into a CSV file. "
        "Also, chat with a friendly chatbot."
    )

    with gr.Tab("Monitor"):
        with gr.Row():
            storage_location = gr.Textbox(
                value=str(default_file_path),
                label="Storage Location",
                placeholder="Path to CSV file where changes will be logged",
            )
        with gr.Row():
            url1 = gr.Textbox(
                value="https://www.culver.k12.in.us/",
                label="URL 1",
                placeholder="First URL to monitor",
            )
            url2 = gr.Textbox(
                value="https://www.facebook.com/CulverCommunitySchools",
                label="URL 2",
                placeholder="Second URL to monitor",
            )
        with gr.Row():
            scrape_interval = gr.Slider(
                minimum=1,
                maximum=60,
                value=5,
                step=1,
                label="Scrape Interval (minutes)",
            )
            content_type = gr.Radio(
                choices=["text", "media", "both"],
                value="text",
                label="Content Type",
            )
        with gr.Row():
            start_button = gr.Button("Start Monitoring")
            stop_button = gr.Button("Stop Monitoring")
            refresh_button = gr.Button("Refresh History")
        with gr.Row():
            monitoring_status = gr.Textbox(
                value="No active monitoring.",
                label="Monitoring Status",
                interactive=False,
            )
        with gr.Row():
            monitoring_history = gr.Textbox(
                value="",
                label="Monitoring History",
                lines=10,
                interactive=False,
            )

    with gr.Tab("Chatbot"):
        # NOTE(review): the handlers below exchange history as (user, bot)
        # pairs; confirm this matches the Chatbot format of the installed
        # Gradio version.
        chatbot = gr.Chatbot(label="Chat with the Assistant")
        with gr.Row():
            system_message = gr.Textbox(
                value="You are a friendly Chatbot.",
                label="System Message",
                visible=False,
            )
        with gr.Row():
            user_input = gr.Textbox(
                label="You:",
                placeholder="Type your message here...",
            )
            submit_button = gr.Button("Send")
        # Parameters
        max_tokens = gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens",
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature",
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        )

    # Define interactions

    def update_monitoring_history(message, history_text):
        """Return ``history_text`` with ``message`` appended as a new line."""
        return history_text + message + "\n"

    def display_history(status, hist):
        """Turn a (status, list-of-lines) result into two textbox strings."""
        return status, "\n".join(hist)

    def refresh_monitoring(status, hist):
        """Return the status unchanged and the history joined into one string."""
        return status, "\n".join(hist)

    def _on_start(storage, first_url, second_url, interval, kind):
        # start_monitoring returns the history as a list; the Textbox needs
        # a newline-joined string.
        return display_history(
            *start_monitoring(storage, first_url, second_url, interval, kind)
        )

    def _on_stop():
        return display_history(*stop_monitoring())

    def _on_refresh(status):
        # `history` is looked up at call time so the list of the current
        # monitoring run is shown, including changes found by the worker.
        return refresh_monitoring(status, history)

    def _chat(msg, hist, system_prompt, max_t, temp, tp):
        """Stream the assistant reply into the chat and clear the input box."""
        turns = list(hist or [])
        if not msg or not msg.strip():
            # Nothing to send: leave the conversation and the input as-is.
            yield turns, msg
            return
        prior = list(turns)
        turns.append((msg, ""))
        yield turns, ""
        for partial in respond(msg, prior, system_prompt, max_t, temp, tp):
            turns[-1] = (msg, partial)
            yield turns, ""

    start_button.click(
        fn=_on_start,
        inputs=[storage_location, url1, url2, scrape_interval, content_type],
        outputs=[monitoring_status, monitoring_history],
        queue=False,
    )
    stop_button.click(
        fn=_on_stop,
        inputs=None,
        outputs=[monitoring_status, monitoring_history],
        queue=False,
    )
    refresh_button.click(
        fn=_on_refresh,
        inputs=[monitoring_status],
        outputs=[monitoring_status, monitoring_history],
        queue=False,
    )

    chat_inputs = [user_input, chatbot, system_message, max_tokens, temperature, top_p]
    chat_outputs = [chatbot, user_input]
    # The same generator handler serves both Enter in the textbox and the
    # Send button, so the two paths cannot drift apart.
    user_input.submit(_chat, inputs=chat_inputs, outputs=chat_outputs)
    submit_button.click(_chat, inputs=chat_inputs, outputs=chat_outputs)

if __name__ == "__main__":
    # Generator handlers stream through the queue; enabling it explicitly
    # keeps streaming working where the queue is not on by default.
    demo.queue().launch()