# Page header captured along with this file -- Spaces: Runtime error
import csv
import datetime
import hashlib
import logging
import os
import threading
import time

import gradio as gr
from huggingface_hub import InferenceClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Prompt templates, filled in with str.format().
PREFIX = "Task started at {date_time_str}. Purpose: {purpose}"
TASK_PROMPT = "Current task: {task}. History:\n{history}"

# Timestamp taken once, when this module is first executed.
date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Free-text description of the monitor's job (same name as the {purpose}
# placeholder in PREFIX).
purpose = """
You go to Culvers sites, you continuously seek changes on them since your last observation.
Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data.
"""

# Running log of task/change messages; appended to by the monitoring code.
history = []

# Description of the task currently running, or None when nothing has started.
current_task = None

# Default CSV output path (relative to the current working directory).
default_file_path = "user/app/scraped_data/culver/culvers_changes.csv"

# Create the default output directory up front so the CSV can be opened later.
os.makedirs(os.path.dirname(default_file_path), exist_ok=True)
# Helper: fingerprint whatever page the driver currently has loaded
def _page_fingerprint(driver, content_type):
    """Return an MD5 hex digest describing the page currently loaded in ``driver``.

    For ``content_type == "media"`` the digest covers the ``src`` attribute of
    every ``<img>`` element; for any other value (including ``"text"`` and
    ``"both"``) it covers the full page source.
    """
    if content_type == "media":
        # Hash the image URLs rather than str(list_of_elements): a WebElement's
        # repr carries driver session/element ids rather than page content, so
        # hashing it would flag a change on nearly every poll.
        images = driver.find_elements(By.TAG_NAME, "img")
        content = "\n".join(img.get_attribute("src") or "" for img in images)
    else:
        content = driver.page_source
    # MD5 is used purely as a cheap change detector, not for security.
    return hashlib.md5(content.encode("utf-8")).hexdigest()


# Function to monitor URLs for changes
def monitor_urls(storage_location, url1, url2, scrape_interval, content_type):
    """Poll two URLs forever and record every detected content change.

    Each detected change is appended to the module-level ``history`` list,
    written as a row to the CSV file at ``storage_location`` and logged.
    The CSV file is truncated and given a fresh header on every call.
    Because the stored hashes start out empty, the first successful visit to
    each URL is always recorded as a change.

    Args:
        storage_location: Path of the CSV file to write.
        url1: First URL to watch.
        url2: Second URL to watch.
        scrape_interval: Minutes to wait between polling rounds.
        content_type: ``"media"`` compares image sources; any other value
            compares the full page source.

    The loop only ends on ``KeyboardInterrupt`` (or an error outside the
    per-URL handling); the browser is closed when the ``with`` block exits.
    """
    urls = [url1, url2]
    previous_hashes = [""] * len(urls)

    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when there is one.
    target_dir = os.path.dirname(storage_location)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    options = Options()
    # The ``headless`` property setter is gone from recent Selenium 4 releases;
    # passing the Chrome flag directly works across versions.
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    with open(storage_location, "w", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
        writer.writeheader()
        csvfile.flush()

        service = Service(ChromeDriverManager().install())
        # The driver's context manager quits the browser on exit, so no
        # explicit driver.quit() is needed.
        with webdriver.Chrome(service=service, options=options) as driver:
            try:
                while True:
                    for i, url in enumerate(urls):
                        try:
                            driver.get(url)
                            time.sleep(2)  # Wait for the page to load
                            current_hash = _page_fingerprint(driver, content_type)
                            if current_hash == previous_hashes[i]:
                                continue
                            previous_hashes[i] = current_hash

                            now = datetime.datetime.now()
                            stamp = now.strftime("%Y-%m-%d %H:%M:%S")
                            note = f"Change detected at {url} on {stamp}"
                            history.append(note)
                            writer.writerow({
                                "date": now.strftime("%Y-%m-%d"),
                                "time": now.strftime("%H:%M:%S"),
                                "url": url,
                                "change": "Content changed",
                            })
                            # The loop never ends on its own; flush so rows are
                            # visible on disk while monitoring is still running.
                            csvfile.flush()
                            logging.info(note)
                        except Exception as e:
                            # Best effort: one failing URL must not stop the monitor.
                            logging.error(f"Error accessing {url}: {e}")
                    time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
            except KeyboardInterrupt:
                logging.info("Monitoring stopped by user.")
# Define main function to handle user input
def handle_input(storage_location, url1, url2, scrape_interval, content_type):
    """Start monitoring the two URLs and return the formatted task prompt.

    Records the new task in the module-level ``current_task`` and ``history``,
    launches ``monitor_urls`` on a background thread, and returns
    ``TASK_PROMPT`` filled in with the task and the history so far.

    Args:
        storage_location: Path of the CSV file the monitor writes to.
        url1: First URL to watch.
        url2: Second URL to watch.
        scrape_interval: Minutes between polling rounds.
        content_type: Content selector forwarded to ``monitor_urls``.

    Returns:
        The task prompt string.

    Note:
        Every call starts another monitor thread, and each one truncates the
        CSV file it is given; avoid calling this twice with the same
        ``storage_location``.
    """
    global current_task
    current_task = f"Monitoring URLs: {url1}, {url2}"
    history.append(f"Task started: {current_task}")

    # monitor_urls() loops until interrupted, so calling it inline would keep
    # this function from ever reaching its return statement. A daemon thread
    # lets the caller get the prompt back immediately and does not block
    # interpreter shutdown (the trade-off: the thread is killed abruptly on exit).
    worker = threading.Thread(
        target=monitor_urls,
        args=(storage_location, url1, url2, scrape_interval, content_type),
        name="url-monitor",
        daemon=True,
    )
    worker.start()

    return TASK_PROMPT.format(task=current_task, history="\n".join(history))
# Inference client for the chat model; respond() streams completions from it.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    storage_location=None,
    url1=None,
    url2=None,
    scrape_interval=None,
    content_type=None,
):
    """Stream a chat completion for ``message``, yielding the reply as it grows.

    Args:
        message: The newest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs; empty
            entries in a pair are skipped.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.
        storage_location, url1, url2, scrape_interval, content_type:
            Values of the monitoring widgets that the chat UI passes along
            with the sampling controls. They are accepted so the call does
            not fail with a ``TypeError``; the chat reply does not use them.

    Yields:
        The accumulated response text after each streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        user_text, assistant_text = turn[0], turn[1]
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})

    response = ""
    # ``chunk`` (not ``message``) so the loop does not shadow the parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        # A chunk can carry no text (content is None); skip the concatenation
        # instead of failing on ``str += None``.
        if token:
            response += token
        yield response
# Create Gradio interface
# Gradio passes every additional input to `respond` positionally, in the order
# listed here, right after (message, history) -- so `respond` has to accept
# all nine of them.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        # Chat / sampling controls
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
        # Site-monitoring controls
        gr.Textbox(value=default_file_path, label="Storage Location"),
        gr.Textbox(value="https://www.culver.k12.in.us/", label="URL 1"),
        gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2"),
        gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)"),
        gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type"),
    ],
    title="Culvers Site Monitor and Chatbot",
    description="Monitor changes on Culvers' websites and log them into a CSV file. Also, chat with a friendly chatbot."
)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()