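"""Website Monitor and Chatbot.

A Gradio app that polls a set of URLs for content changes, stores detected
changes in a database and/or a CSV file, exposes recent changes as an RSS
feed, and includes a simple chatbot backed by the Hugging Face Inference API.
"""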
import asyncio
import csv
import datetime
import hashlib
import logging
import os
from typing import List, Optional, Tuple

import aiohttp
import gradio as gr
import validators
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from feedgen.feed import FeedGenerator
from huggingface_hub import AsyncInferenceClient
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import declarative_base, sessionmaker
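# Third-party dependencies assumed by this script (e.g. for requirements.txt):
# aiohttp, gradio, sqlalchemy, beautifulsoup4, python-dotenv, validators,
# huggingface_hub, and feedgen. feedgen is assumed for feed *generation* below,
# since the feedparser package can only parse existing feeds.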
# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Configuration
DB_URL = os.getenv('DB_URL', 'sqlite:///monitoring.db')
HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')
DEFAULT_MONITORING_INTERVAL = 300  # seconds between polls when a URL is first monitored
MAX_MONITORING_INTERVAL = 600      # seconds; upper bound when backing off on unchanged pages
CHANGE_FREQUENCY_THRESHOLD = 3     # detected changes before the polling interval is halved
# Database setup
Base = declarative_base()

class Article(Base):
    __tablename__ = 'articles'
    id = Column(Integer, primary_key=True)
    url = Column(String(255), nullable=False)
    title = Column(String(255))
    content = Column(Text)
    hash = Column(String(32))  # MD5 hex digest of the page content
    timestamp = Column(DateTime, default=datetime.datetime.utcnow)

engine = create_engine(DB_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Global variables
monitoring_tasks = {}          # url -> asyncio.Task
url_monitoring_intervals = {}  # url -> current polling interval in seconds
change_counts = {}             # url -> consecutive detected changes
history = []                   # human-readable event log shown in the UI
# Utility functions
def sanitize_url(url: str) -> Optional[str]:
    # validators.url returns True for a valid URL and a falsy failure object
    # otherwise, so return the original string rather than the validator's result.
    return url if validators.url(url) else None
async def fetch_url_content(url: str, session: aiohttp.ClientSession) -> Tuple[str, str]:
    async with session.get(url) as response:
        response.raise_for_status()  # surface HTTP errors to the monitor loop
        content = await response.text()
        soup = BeautifulSoup(content, 'html.parser')
        title = soup.title.string if soup.title else "No Title"
        return title, content
def calculate_hash(content: str) -> str:
    # MD5 is used only for change detection, not for anything security-sensitive.
    return hashlib.md5(content.encode('utf-8')).hexdigest()
async def save_to_database(url: str, title: str, content: str, content_hash: str):
    # `content_hash` avoids shadowing the built-in `hash`.
    session = Session()
    try:
        article = Article(url=url, title=title, content=content, hash=content_hash)
        session.add(article)
        session.commit()
    except SQLAlchemyError as e:
        logger.error(f"Database error: {e}")
        session.rollback()
    finally:
        session.close()
def save_to_csv(storage_location: str, url: str, title: str, content: str, timestamp: datetime.datetime):
    try:
        with open(storage_location, "a", newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow([timestamp.strftime("%Y-%m-%d %H:%M:%S"), url, title, content])
    except Exception as e:
        logger.error(f"Error saving to CSV: {e}")
async def monitor_url(url: str, interval: int, storage_location: Optional[str], feed_rss: bool):
    previous_hash = ""
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                title, content = await fetch_url_content(url, session)
                current_hash = calculate_hash(content)
                if current_hash != previous_hash:
                    previous_hash = current_hash
                    timestamp = datetime.datetime.now()
                    if feed_rss:
                        await save_to_database(url, title, content, current_hash)
                    if storage_location:
                        save_to_csv(storage_location, url, title, content, timestamp)
                    history.append(f"Change detected at {url} on {timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
                    logger.info(f"Change detected at {url}")
                    change_counts[url] = change_counts.get(url, 0) + 1
                    if change_counts[url] >= CHANGE_FREQUENCY_THRESHOLD:
                        # Frequent changes: poll faster, but never more often than once a minute.
                        interval = max(60, interval // 2)
                else:
                    change_counts[url] = 0
                    # No change: back off, up to the configured maximum interval.
                    interval = min(interval * 2, MAX_MONITORING_INTERVAL)
                url_monitoring_intervals[url] = interval
            except Exception as e:
                logger.error(f"Error monitoring {url}: {e}")
                history.append(f"Error monitoring {url}: {e}")
            await asyncio.sleep(interval)
async def start_monitoring(urls: List[str], storage_location: Optional[str], feed_rss: bool):
    for url in urls:
        if url not in monitoring_tasks:
            sanitized_url = sanitize_url(url)
            if sanitized_url:
                task = asyncio.create_task(monitor_url(sanitized_url, DEFAULT_MONITORING_INTERVAL, storage_location, feed_rss))
                monitoring_tasks[sanitized_url] = task
            else:
                logger.warning(f"Invalid URL: {url}")
                history.append(f"Invalid URL: {url}")
def stop_monitoring(url: str):
    if url in monitoring_tasks:
        monitoring_tasks[url].cancel()
        del monitoring_tasks[url]
def generate_rss_feed():
    session = Session()
    try:
        articles = session.query(Article).order_by(Article.timestamp.desc()).limit(20).all()
        # feedparser has no FeedGenerator and cannot build feeds; the feedgen
        # package (an assumed dependency) is used for generation instead.
        fg = FeedGenerator()
        fg.title('Website Changes Feed')
        fg.link(href='http://yourwebsite.com/feed')
        fg.description('Feed of changes detected on monitored websites.')
        for article in articles:
            entry = fg.add_entry()
            entry.title(article.title or article.url)
            entry.link(href=article.url)
            entry.description(article.content)
            # feedgen requires timezone-aware datetimes; timestamps are stored as UTC.
            entry.pubDate(article.timestamp.replace(tzinfo=datetime.timezone.utc))
        return fg.rss_str(pretty=True).decode('utf-8')
    except SQLAlchemyError as e:
        logger.error(f"Database error: {e}")
        return None
    finally:
        session.close()
async def chatbot_response(message: str, chat_history: List[Tuple[str, str]]):
    # `chat_history` avoids shadowing the module-level `history` event log.
    try:
        # InferenceClient has no `inference` method; AsyncInferenceClient.text_generation
        # is the awaitable text-generation call in huggingface_hub.
        client = AsyncInferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)
        response = await client.text_generation(message, max_new_tokens=512)
        chat_history.append((message, response))
    except Exception as e:
        logger.error(f"Chatbot error: {e}")
        chat_history.append((message, "Error: Could not get a response from the chatbot."))
    return chat_history, ""  # second value clears the message box
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Website Monitor and Chatbot")

    with gr.Tab("Configuration"):
        target_urls = gr.Textbox(label="Target URLs (comma-separated)", placeholder="https://example.com, https://another-site.com")
        storage_location = gr.Textbox(label="Storage Location (CSV file path)", placeholder="/path/to/your/file.csv")
        feed_rss_checkbox = gr.Checkbox(label="Enable RSS Feed")
        start_button = gr.Button("Start Monitoring")
        stop_button = gr.Button("Stop Monitoring")
        status_text = gr.Textbox(label="Status", interactive=False)
        history_text = gr.Textbox(label="History", lines=10, interactive=False)

    with gr.Tab("User-End View"):
        # The generated feed is RSS XML, so a text box fits better than gr.JSON.
        feed_content = gr.Textbox(label="RSS Feed Content", lines=20, interactive=False)

    with gr.Tab("Chatbot"):
        chatbot_interface = gr.Chatbot()
        message_input = gr.Textbox(placeholder="Type your message here...")
        send_button = gr.Button("Send")
    async def on_start_click(target_urls_str: str, storage_loc: str, feed_enabled: bool):
        urls = [url.strip() for url in target_urls_str.split(",") if url.strip()]
        await start_monitoring(urls, storage_loc or None, feed_enabled)
        return "Monitoring started for valid URLs."

    async def on_stop_click():
        for url in list(monitoring_tasks.keys()):
            stop_monitoring(url)
        return "Monitoring stopped for all URLs."

    start_button.click(on_start_click, inputs=[target_urls, storage_location, feed_rss_checkbox], outputs=[status_text])
    stop_button.click(on_stop_click, outputs=[status_text])
    # chatbot_response returns (chat_history, "") so the message box is cleared;
    # listing the same component twice in outputs is not allowed.
    send_button.click(chatbot_response, inputs=[message_input, chatbot_interface], outputs=[chatbot_interface, message_input])

    async def update_feed_content():
        return generate_rss_feed()

    # gr.Timer does not take a callback directly; in recent Gradio releases the
    # interval is set on the Timer and the function is attached via .tick().
    feed_updater = gr.Timer(value=5)
    feed_updater.tick(update_feed_content, outputs=[feed_content])
if __name__ == "__main__":
    demo.launch()