Spaces:
Runtime error
Runtime error
File size: 13,525 Bytes
62cc7a7 885ce0d 2288f21 d7e93db 8540cd7 2288f21 8540cd7 2288f21 d7e93db 8540cd7 d7e93db 2288f21 d7e93db 8540cd7 2288f21 d7e93db 8540cd7 2288f21 d7e93db 2288f21 8540cd7 2288f21 8540cd7 2288f21 8540cd7 2288f21 62cc7a7 2288f21 d7e93db 8540cd7 d7e93db 2288f21 d7e93db 8540cd7 2288f21 d7e93db 2288f21 8540cd7 d7e93db 8540cd7 d7e93db 2288f21 8540cd7 2288f21 62cc7a7 d7e93db 2288f21 d7e93db 62cc7a7 2288f21 8540cd7 885ce0d 2288f21 885ce0d d7e93db 62cc7a7 2288f21 8540cd7 2288f21 d7e93db 2288f21 885ce0d 2288f21 62cc7a7 2288f21 885ce0d 2288f21 885ce0d 2288f21 d7e93db 2288f21 8540cd7 2288f21 8540cd7 2288f21 8540cd7 d7e93db 2288f21 d7e93db 2288f21 885ce0d 2288f21 8540cd7 d7e93db 2288f21 8540cd7 2288f21 8540cd7 d7e93db 8540cd7 d7e93db 2288f21 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 |
import datetime
import os
import csv
import time
import hashlib
import threading
from pathlib import Path
import logging
from typing import List, Tuple
import gradio as gr
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
WebDriverException,
NoSuchElementException,
TimeoutException,
StaleElementReferenceException,
)
from webdriver_manager.chrome import ChromeDriverManager
from huggingface_hub import InferenceClient
import mysql.connector
import feedparser # For parsing RSS feeds
import sqlite3 # For simple local storage if needed
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s')
# Configuration (replace with your actual values or environment variables)
DB_HOST = os.environ.get("DB_HOST", "your_host")
DB_USER = os.environ.get("DB_USER", "your_user")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "your_password")
DB_NAME = os.environ.get("DB_NAME", "your_database")
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY") # Add API key; None if unset, so HF calls run unauthenticated
DEFAULT_MONITORING_INTERVAL = 300 # 5 minutes in seconds
MAX_MONITORING_INTERVAL = 600 # 10 minutes in seconds
CHANGE_FREQUENCY_THRESHOLD = 3 # Number of changes to trigger faster monitoring
# Global variables shared between the Gradio handlers and the worker thread
monitoring_thread = None  # background threading.Thread running monitor_urls, or None
stop_event = threading.Event()  # set to request the worker thread to exit
db_connection = None  # lazily-created MySQL connection cached by get_db_connection()
current_task = None  # human-readable description of the active monitoring task
history = []  # append-only log of events shown in the "History" textbox
url_monitoring_intervals = {} # Store monitoring intervals (seconds) for each URL
change_counts = {} # Track change frequency for each URL
# Function to establish a database connection
def get_db_connection():
    """Return a cached MySQL connection, (re)connecting when needed.

    Returns:
        The live ``mysql.connector`` connection, or ``None`` when connecting
        fails (the error is logged rather than raised so callers can degrade
        gracefully).
    """
    global db_connection
    # Reuse the cached connection while it is still alive.
    if db_connection is not None and db_connection.is_connected():
        return db_connection
    try:
        db_connection = mysql.connector.connect(
            host=DB_HOST,
            user=DB_USER,
            password=DB_PASSWORD,
            database=DB_NAME,
        )
        return db_connection
    except mysql.connector.Error as e:
        # Narrowed from bare Exception: only connector errors are expected
        # here; programming bugs should not be silently swallowed.
        logging.error(f"Error connecting to database: {e}")
        return None
# Function to create the articles table if it doesn't exist
def create_articles_table():
    """Create the ``articles`` table on the configured MySQL server if absent.

    Silently does nothing when no database connection can be established
    (get_db_connection already logged the failure).
    """
    conn = get_db_connection()
    if not conn:
        return
    cursor = conn.cursor()
    try:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS articles (
                id INT AUTO_INCREMENT PRIMARY KEY,
                url VARCHAR(255) NOT NULL,
                title VARCHAR(255),
                content TEXT,
                hash VARCHAR(32),
                timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.commit()
    finally:
        # Fix: the original leaked the cursor if execute()/commit() raised.
        cursor.close()
# Initialize the articles table at import time so later inserts can assume it exists
create_articles_table()
# Function to monitor URLs for changes
def monitor_urls(target_urls: List[str], storage_location: str, feed_rss: bool, stop_event: threading.Event):
    """Poll each URL in a loop and record content changes until stop_event is set.

    Runs in a background thread. For every URL it hashes the page body; on a
    hash change it appends to the global ``history``, optionally writes a CSV
    row (``storage_location``) and/or a DB row (``feed_rss``), and adapts the
    per-URL polling interval: frequent changers are polled every 60s, quiet
    pages back off up to MAX_MONITORING_INTERVAL.

    Args:
        target_urls: URLs to watch.
        storage_location: CSV path for change records, or falsy to skip.
        feed_rss: when True, persist changes to the database for the RSS feed.
        stop_event: cooperative shutdown signal.
    """
    global history, url_monitoring_intervals, change_counts
    previous_hashes = {url: "" for url in target_urls}
    options = Options()
    # Fix: `options.headless = True` is deprecated/removed in Selenium 4.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_experimental_option("excludeSwitches", ["enable-logging"])  # Suppress unnecessary logs
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.implicitly_wait(10)  # Implicit wait for elements
    # Fix: `interval` was only assigned inside the for-loop, so an empty
    # target_urls list raised NameError at the sleep below.
    interval = DEFAULT_MONITORING_INTERVAL
    try:
        while not stop_event.is_set():
            for url in target_urls:
                try:
                    # Dynamic monitoring interval
                    interval = url_monitoring_intervals.get(url, DEFAULT_MONITORING_INTERVAL)
                    driver.get(url)
                    time.sleep(2)  # Allow page to load
                    # Check for changes
                    try:
                        current_content = driver.find_element(By.TAG_NAME, "body").get_attribute("innerHTML")
                        current_hash = hashlib.md5(current_content.encode('utf-8')).hexdigest()
                    except (NoSuchElementException, TimeoutException, StaleElementReferenceException) as e:
                        logging.warning(f"Error getting content for {url}: {e}")
                        continue
                    if current_hash != previous_hashes[url]:
                        previous_hashes[url] = current_hash
                        timestamp = datetime.datetime.now()
                        # Fix: the <title> element is not rendered, so its
                        # .text is empty in Selenium; driver.title is reliable.
                        title = driver.title or "No Title"
                        history.append(f"Change detected at {url} on {timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
                        if storage_location:
                            save_to_storage(storage_location, url, title, current_content, timestamp)
                        if feed_rss:
                            save_to_database(url, title, current_content, current_hash)
                        # Adjust monitoring interval based on change frequency
                        change_counts[url] = change_counts.get(url, 0) + 1
                        if change_counts[url] >= CHANGE_FREQUENCY_THRESHOLD:
                            url_monitoring_intervals[url] = 60  # Check more frequently after multiple changes
                        else:
                            url_monitoring_intervals[url] = min(interval + 60, MAX_MONITORING_INTERVAL)  # Gradually increase interval
                    else:
                        # No change: reset the streak and back off.
                        change_counts[url] = 0
                        url_monitoring_intervals[url] = min(interval + 60, MAX_MONITORING_INTERVAL)
                except WebDriverException as e:
                    logging.error(f"Error accessing {url}: {e}")
                if stop_event.is_set():
                    break  # Exit inner loop if stop event is set
            # Fix: wait on the event instead of time.sleep(interval) so a stop
            # request interrupts the pause immediately instead of after up to
            # MAX_MONITORING_INTERVAL seconds.
            if not stop_event.is_set():
                stop_event.wait(interval)
    except Exception as e:
        logging.error(f"Unexpected error in monitoring thread: {e}")
    finally:
        driver.quit()
        logging.info("Monitoring thread has been stopped.")
# Function to save data to local storage (CSV)
def save_to_storage(storage_location: str, url: str, title: str, content: str, timestamp: datetime.datetime):
    """Append one change record to the CSV file at ``storage_location``.

    The row layout is [timestamp, url, title, content]; write failures are
    logged and swallowed so the monitoring loop keeps running.
    """
    row = [timestamp.strftime("%Y-%m-%d %H:%M:%S"), url, title, content]
    try:
        with open(storage_location, "a", newline='', encoding='utf-8') as handle:
            csv.writer(handle).writerow(row)
    except Exception as e:
        logging.error(f"Error saving to storage: {e}")
# Function to save data to the database
def save_to_database(url: str, title: str, content: str, hash: str):
    """Insert one detected change into the ``articles`` table.

    Does nothing when no DB connection is available; insert errors are
    logged, not raised. NOTE: the parameter name ``hash`` shadows the
    builtin but is kept for backward compatibility with keyword callers.
    """
    conn = get_db_connection()
    if not conn:
        return
    cursor = conn.cursor()
    try:
        cursor.execute(
            "INSERT INTO articles (url, title, content, hash) VALUES (%s, %s, %s, %s)",
            (url, title, content, hash),
        )
        conn.commit()
    except Exception as e:
        logging.error(f"Error saving to database: {e}")
    finally:
        cursor.close()
# Function to generate RSS feed from the database
def generate_rss_feed():
    """Build an RSS 2.0 XML document from stored articles, newest first.

    Returns:
        The feed XML as a string, or ``None`` when the database is
        unreachable or generation fails.

    Bug fix: the original called ``feedparser.FeedGenerator()``, which does
    not exist — feedparser only *parses* feeds — so every call raised
    AttributeError and the function always returned None. The feed is now
    built with the stdlib ElementTree, which also escapes content safely.
    """
    import xml.etree.ElementTree as ET  # stdlib; local import keeps module-level deps unchanged

    conn = get_db_connection()
    if not conn:
        return None
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT * FROM articles ORDER BY timestamp DESC")
        articles = cursor.fetchall()
        rss = ET.Element("rss", version="2.0")
        channel = ET.SubElement(rss, "channel")
        ET.SubElement(channel, "title").text = 'Website Changes Feed'
        ET.SubElement(channel, "link").text = 'http://yourwebsite.com/feed'  # Replace with your actual feed URL
        ET.SubElement(channel, "description").text = 'Feed of changes detected on monitored websites.'
        # Row layout per create_articles_table: (id, url, title, content, hash, timestamp)
        for article in articles:
            item = ET.SubElement(channel, "item")
            ET.SubElement(item, "title").text = article[2]
            ET.SubElement(item, "link").text = article[1]
            ET.SubElement(item, "description").text = article[3]
            ET.SubElement(item, "pubDate").text = str(article[5])
        return ET.tostring(rss, encoding="unicode")
    except Exception as e:
        logging.error(f"Error generating RSS feed: {e}")
        return None
    finally:
        cursor.close()
# Function to start monitoring
def start_monitoring(target_urls: List[str], storage_location: str, feed_rss: bool):
    """Spawn the background monitoring thread; no-op if one is already alive.

    Returns a (status message, history) tuple for the Gradio outputs.
    """
    global monitoring_thread, stop_event, current_task, history, change_counts
    already_running = monitoring_thread is not None and monitoring_thread.is_alive()
    if already_running:
        return "Monitoring is already running.", history
    stop_event.clear()
    current_task = f"Monitoring URLs: {', '.join(target_urls)}"
    history.append(f"Task started: {current_task}")
    # Fresh change counters for this run.
    change_counts = dict.fromkeys(target_urls, 0)
    # Daemon thread: it must not keep the process alive on exit.
    worker = threading.Thread(
        target=monitor_urls,
        args=(target_urls, storage_location, feed_rss, stop_event),
        daemon=True,
    )
    monitoring_thread = worker
    worker.start()
    return "Monitoring started.", history
# Function to stop monitoring
def stop_monitoring():
    """Signal the monitoring thread to stop and wait (bounded) for it to exit.

    Returns a (status message, history) tuple for the Gradio outputs.
    """
    global monitoring_thread, stop_event, current_task, history
    if monitoring_thread and monitoring_thread.is_alive():
        stop_event.set()
        # Fix: the original join() had no timeout, so the UI handler could
        # block for up to MAX_MONITORING_INTERVAL seconds while the worker
        # slept between polls.
        monitoring_thread.join(timeout=15)
        if monitoring_thread.is_alive():
            logging.warning("Monitoring thread did not stop within 15s; as a daemon it will exit with the process.")
        current_task = None
        history.append("Monitoring stopped by user.")
        return "Monitoring stopped.", history
    else:
        return "No monitoring task is currently running.", history
# Function to handle chatbot responses
def chatbot_response(message: str, history: List[Tuple[str, str]]):
    """Send ``message`` to the hosted Mixtral model and append the exchange.

    Returns (history, history) because the Gradio click handler wires the
    same chat list to both outputs. On failure the error placeholder is
    appended instead of a model reply.

    Bug fix: ``InferenceClient`` has no ``inference()`` method, so the
    original always raised AttributeError and answered with the error
    placeholder; ``text_generation()`` is the correct text-completion call.
    """
    try:
        client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)
        response = client.text_generation(message)
        history.append((message, response))
        return history, history
    except Exception as e:
        logging.error(f"Error getting chatbot response: {e}")
        history.append((message, "Error: Could not get a response from the chatbot."))
        return history, history
# --- Gradio Interface ---
# Three tabs: monitoring configuration, the generated RSS feed, and a chatbot.
with gr.Blocks() as demo:
    gr.Markdown("# Website Monitor and Chatbot")
    # Configuration Tab
    with gr.Tab("Configuration"):
        with gr.Row():
            target_urls = gr.Textbox(
                label="Target URLs (comma-separated)",
                placeholder="https://example.com, https://another-site.com"
            )
        with gr.Row():
            storage_location = gr.Textbox(
                label="Storage Location (CSV file path)",
                placeholder="/path/to/your/file.csv",
                visible=False  # You can enable this if you want CSV storage
            )
        with gr.Row():
            feed_rss_checkbox = gr.Checkbox(label="Enable RSS Feed")
        with gr.Row():
            start_button = gr.Button("Start Monitoring")
            stop_button = gr.Button("Stop Monitoring")
        with gr.Row():
            status_text = gr.Textbox(label="Status", interactive=False)
        with gr.Row():
            history_text = gr.Textbox(
                label="History", lines=10, interactive=False
            )
    # User-End View Tab
    with gr.Tab("User-End View"):
        with gr.Row():
            feed_content = gr.JSON(label="RSS Feed Content")
    # Chatbot Tab
    with gr.Tab("Chatbot"):
        chatbot_interface = gr.Chatbot()
        with gr.Row():
            message_input = gr.Textbox(placeholder="Type your message here...")
            send_button = gr.Button("Send")
    # --- Event Handlers ---
    # Start monitoring button click
    def on_start_click(target_urls_str: str, storage_loc: str, feed_enabled: bool):
        """Validate the URL list, reset per-URL intervals, and start the worker."""
        global history, url_monitoring_intervals
        try:
            # Fix: the original comprehension read
            # `[url.strip() for url.strip() in ...]`, which is a SyntaxError
            # (the loop target cannot be a call expression).
            target_urls = [url.strip() for url in target_urls_str.split(",")]
            if not all(target_urls):
                return "Please enter valid URLs.", history
            # Reset monitoring intervals when starting
            url_monitoring_intervals = {url: DEFAULT_MONITORING_INTERVAL for url in target_urls}
            status, history = start_monitoring(target_urls, storage_loc if storage_loc else None, feed_enabled)
            return status, history
        except Exception as e:
            return f"Error starting monitoring: {e}", history
    start_button.click(
        on_start_click,
        inputs=[target_urls, storage_location, feed_rss_checkbox],
        outputs=[status_text, history_text]
    )
    # Stop monitoring button click
    stop_button.click(
        stop_monitoring,
        outputs=[status_text, history_text]
    )
    # Send message to chatbot button click
    send_button.click(
        chatbot_response,
        inputs=[message_input, chatbot_interface],
        outputs=[chatbot_interface, chatbot_interface]
    )
    # Update RSS feed content periodically
    def update_feed_content():
        """Refresh the RSS feed view from the database."""
        return generate_rss_feed()
    demo.load(update_feed_content, outputs=feed_content, every=5)  # Update every 5 seconds
# Script entry point: launch the Gradio app when run directly.
if __name__ == "__main__":
    demo.launch()