import asyncio
import csv
import datetime
import hashlib
import logging
import os
from typing import List, Optional, Tuple

import aiohttp
import gradio as gr
import validators
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from huggingface_hub import AsyncInferenceClient
from sqlalchemy import Column, DateTime, Integer, String, Text, create_engine
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import declarative_base, sessionmaker

# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration
DB_URL = os.getenv('DB_URL', 'sqlite:///monitoring.db')
HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')
DEFAULT_MONITORING_INTERVAL = 300   # seconds between checks for a newly added URL
MAX_MONITORING_INTERVAL = 600       # upper bound when a page rarely changes
CHANGE_FREQUENCY_THRESHOLD = 3      # changes required before polling speeds up

# Database setup
Base = declarative_base()


class Article(Base):
    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    url = Column(String(255), nullable=False)
    title = Column(String(255))
    content = Column(Text)
    hash = Column(String(32))
    timestamp = Column(DateTime, default=datetime.datetime.utcnow)


engine = create_engine(DB_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

# Global state
monitoring_tasks = {}            # url -> asyncio.Task
url_monitoring_intervals = {}    # url -> current polling interval (seconds)
change_counts = {}               # url -> consecutive change count
history = []                     # human-readable event log


# Utility functions
def sanitize_url(url: str) -> Optional[str]:
    """Return the URL if it is valid, otherwise None."""
    return url if validators.url(url) else None


async def fetch_url_content(url: str, session: aiohttp.ClientSession) -> Tuple[str, str]:
    """Fetch a page and return its title and raw HTML."""
    async with session.get(url) as response:
        response.raise_for_status()
        content = await response.text()
    soup = BeautifulSoup(content, 'html.parser')
    title = soup.title.string if soup.title else "No Title"
    return title, content


def calculate_hash(content: str) -> str:
    return hashlib.md5(content.encode('utf-8')).hexdigest()


async def save_to_database(url: str, title: str, content: str, content_hash: str):
    session = Session()
    try:
        article = Article(url=url, title=title, content=content, hash=content_hash)
        session.add(article)
        session.commit()
    except SQLAlchemyError as e:
        logger.error(f"Database error: {e}")
        session.rollback()
    finally:
        session.close()


def save_to_csv(storage_location: str, url: str, title: str, content: str,
                timestamp: datetime.datetime):
    try:
        with open(storage_location, "a", newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow([timestamp.strftime("%Y-%m-%d %H:%M:%S"), url, title, content])
    except Exception as e:
        logger.error(f"Error saving to CSV: {e}")


async def monitor_url(url: str, interval: int, storage_location: Optional[str], feed_rss: bool):
    """Poll a URL, record detected changes, and adapt the polling interval to change frequency."""
    previous_hash = ""
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                title, content = await fetch_url_content(url, session)
                current_hash = calculate_hash(content)

                if current_hash != previous_hash:
                    previous_hash = current_hash
                    timestamp = datetime.datetime.now()

                    if feed_rss:
                        await save_to_database(url, title, content, current_hash)
                    if storage_location:
                        save_to_csv(storage_location, url, title, content, timestamp)

                    history.append(f"Change detected at {url} on {timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
                    logger.info(f"Change detected at {url}")

                    # Frequent changes -> poll more often (never below 60 seconds).
                    change_counts[url] = change_counts.get(url, 0) + 1
                    if change_counts[url] >= CHANGE_FREQUENCY_THRESHOLD:
                        interval = max(60, interval // 2)
                else:
                    # No change -> back off, up to the configured maximum.
                    change_counts[url] = 0
                    interval = min(interval * 2, MAX_MONITORING_INTERVAL)

                url_monitoring_intervals[url] = interval
            except Exception as e:
                logger.error(f"Error monitoring {url}: {e}")
{e}") history.append(f"Error monitoring {url}: {e}") await asyncio.sleep(interval) async def start_monitoring(urls: List[str], storage_location: str, feed_rss: bool): for url in urls: if url not in monitoring_tasks: sanitized_url = sanitize_url(url) if sanitized_url: task = asyncio.create_task(monitor_url(sanitized_url, DEFAULT_MONITORING_INTERVAL, storage_location, feed_rss)) monitoring_tasks[sanitized_url] = task else: logger.warning(f"Invalid URL: {url}") history.append(f"Invalid URL: {url}") def stop_monitoring(url: str): if url in monitoring_tasks: monitoring_tasks[url].cancel() del monitoring_tasks[url] def generate_rss_feed(): session = Session() try: articles = session.query(Article).order_by(Article.timestamp.desc()).limit(20).all() feed = feedparser.FeedParserDict() feed['title'] = 'Website Changes Feed' feed['link'] = 'http://yourwebsite.com/feed' feed['description'] = 'Feed of changes detected on monitored websites.' feed['entries'] = [ {'title': article.title, 'link': article.url, 'description': article.content, 'published': article.timestamp} for article in articles ] return feedparser.FeedGenerator().feed_from_dictionary(feed).writeString('utf-8') except SQLAlchemyError as e: logger.error(f"Database error: {e}") return None finally: session.close() async def chatbot_response(message: str, history: List[Tuple[str, str]]): try: client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY) response = await client.inference(message) history.append((message, response)) return history, history except Exception as e: logger.error(f"Chatbot error: {e}") history.append((message, "Error: Could not get a response from the chatbot.")) return history, history # Gradio interface with gr.Blocks() as demo: gr.Markdown("# Website Monitor and Chatbot") with gr.Tab("Configuration"): target_urls = gr.Textbox(label="Target URLs (comma-separated)", placeholder="https://example.com, https://another-site.com") storage_location = gr.Textbox(label="Storage Location (CSV file path)", placeholder="/path/to/your/file.csv") feed_rss_checkbox = gr.Checkbox(label="Enable RSS Feed") start_button = gr.Button("Start Monitoring") stop_button = gr.Button("Stop Monitoring") status_text = gr.Textbox(label="Status", interactive=False) history_text = gr.Textbox(label="History", lines=10, interactive=False) with gr.Tab("User-End View"): feed_content = gr.JSON(label="RSS Feed Content") with gr.Tab("Chatbot"): chatbot_interface = gr.Chatbot() message_input = gr.Textbox(placeholder="Type your message here...") send_button = gr.Button("Send") async def on_start_click(target_urls_str: str, storage_loc: str, feed_enabled: bool): urls = [url.strip() for url in target_urls_str.split(",")] await start_monitoring(urls, storage_loc if storage_loc else None, feed_enabled) return "Monitoring started for valid URLs." async def on_stop_click(): for url in list(monitoring_tasks.keys()): stop_monitoring(url) return "Monitoring stopped for all URLs." start_button.click(on_start_click, inputs=[target_urls, storage_location, feed_rss_checkbox], outputs=[status_text]) stop_button.click(on_stop_click, outputs=[status_text]) send_button.click(chatbot_response, inputs=[message_input, chatbot_interface], outputs=[chatbot_interface, chatbot_interface]) async def update_feed_content(): return generate_rss_feed() feed_updater = gr.Timer(update_feed_content, every=5) if __name__ == "__main__": demo.launch()