# Page-header residue from the capture this file was extracted from: "Spaces: Runtime error".
import asyncio
import csv
import datetime
import difflib
import hashlib
import logging
import os
import re
import xml.etree.ElementTree as ET
from email.utils import format_datetime
from html.parser import HTMLParser
from pathlib import Path
from typing import List, Tuple

import aiohttp
import feedparser
import gradio as gr
import validators
from huggingface_hub import InferenceClient
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import declarative_base, sessionmaker
# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration
# Token handed to the Hugging Face inference client; None when the variable is unset.
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
# Polling interval (seconds) every newly started monitor begins with.
DEFAULT_MONITORING_INTERVAL = 300
# Upper bound (seconds) the interval may grow to while a page stays unchanged.
MAX_MONITORING_INTERVAL = 600
# Number of consecutive changes after which the polling interval is halved.
CHANGE_FREQUENCY_THRESHOLD = 3

# Global variables
monitoring_tasks = {}  # URL -> asyncio task running monitor_url for that URL
url_monitoring_intervals = {}  # URL -> polling interval currently in effect
change_counts = {}  # URL -> consecutive checks that found a change
history = []  # human-readable log lines (changes, errors, invalid URLs)
engine = None  # Initialize the database engine globally
# Session factory; stays None until create_db_engine() succeeds, so code that
# needs the database can detect "not connected" instead of hitting a NameError.
Session = None

# Database setup
Base = declarative_base()
def _utc_now() -> datetime.datetime:
    """Return the current UTC time as a naive datetime.

    Produces the same kind of value as the deprecated ``datetime.utcnow()``
    without triggering its deprecation warning on Python 3.12+.
    """
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)


class Article(Base):
    """ORM row for one stored snapshot of a monitored page."""

    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    url = Column(String(255), nullable=False)
    title = Column(String(255))
    content = Column(Text)
    # 32 characters matches the hex MD5 digest produced by calculate_hash.
    hash = Column(String(32))
    # Filled in automatically (naive UTC) when a row is inserted without one.
    timestamp = Column(DateTime, default=_utc_now)

    def __repr__(self) -> str:
        return (f"Article(id={self.id!r}, url={self.url!r}, "
                f"timestamp={self.timestamp!r})")
# Utility functions | |
def sanitize_url(url: str) -> str:
    """Return ``url`` stripped of surrounding whitespace if it is valid.

    Args:
        url: Candidate URL as typed by the user.

    Returns:
        The cleaned URL, or an empty string when the input is blank, not a
        string, or rejected by ``validators.url``. The result is therefore
        usable both as a truth value and as the URL itself.
    """
    candidate = url.strip() if isinstance(url, str) else ""
    # validators.url returns a falsy failure object for invalid input, so only
    # its truthiness is meaningful; the URL to hand back is the candidate.
    if candidate and validators.url(candidate):
        return candidate
    return ""
class _TitleParser(HTMLParser):
    """Collect the text of the first ``<title>`` element of a document."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.title = None  # set once the first <title> element has been closed
        self._inside_title = False
        self._chunks = []

    def handle_starttag(self, tag, attrs):
        # Only the first title counts; later ones are ignored.
        if tag == "title" and self.title is None:
            self._inside_title = True

    def handle_endtag(self, tag):
        if tag == "title" and self._inside_title:
            self._inside_title = False
            self.title = "".join(self._chunks)

    def handle_data(self, data):
        if self._inside_title:
            self._chunks.append(data)


def _extract_title(markup: str) -> str:
    """Return the whitespace-normalised page title, or ``"No Title"``."""
    parser = _TitleParser()
    parser.feed(markup)
    parser.close()
    title = " ".join((parser.title or "").split())
    return title or "No Title"


async def fetch_url_content(url: str,
                            session: "aiohttp.ClientSession") -> Tuple[str, str]:
    """Download ``url`` and return ``(title, content)``.

    Args:
        url: Address to request.
        session: Open HTTP session whose ``get`` is used for the request.

    Returns:
        A tuple of the page title (``"No Title"`` when the document has no
        non-empty ``<title>``) and the full response text.
    """
    async with session.get(url) as response:
        content = await response.text()
    # Title extraction uses the standard-library parser, so no HTML package
    # beyond what the file already imports is required.
    return _extract_title(content), content
def calculate_hash(content: str) -> str:
    """Return the hexadecimal MD5 digest of ``content`` encoded as UTF-8.

    The digest is only a change fingerprint, not a security measure, so MD5 is
    requested with ``usedforsecurity=False`` where the interpreter supports it;
    this keeps the function usable on builds that restrict MD5.
    """
    payload = content.encode('utf-8')
    try:
        digest = hashlib.md5(payload, usedforsecurity=False)
    except TypeError:
        # Interpreters older than 3.9 do not accept the keyword.
        digest = hashlib.md5(payload)
    return digest.hexdigest()
async def save_to_database(url: str, title: str, content: str, hash: str):
    """Insert one page snapshot into the ``articles`` table.

    Args:
        url: Address of the monitored page.
        title: Page title; truncated to the 255 characters the column holds.
        content: Full page text.
        hash: Content digest stored alongside the snapshot.

    Database errors are logged and rolled back rather than raised. When no
    database has been connected yet the snapshot is skipped with a log entry.
    """
    # The session factory only exists after create_db_engine() has succeeded;
    # look it up defensively so an unconnected database is not a NameError.
    session_factory = globals().get("Session")
    if session_factory is None:
        logger.error(
            f"Database error: no database connected; snapshot of {url} not saved"
        )
        return

    session = session_factory()
    try:
        stored_title = title[:255] if title else title
        session.add(
            Article(url=url, title=stored_title, content=content, hash=hash))
        session.commit()
    except SQLAlchemyError as e:
        logger.error(f"Database error: {e}")
        session.rollback()
    finally:
        session.close()
def save_to_csv(storage_location: str, url: str, title: str, content: str,
                timestamp: datetime.datetime):
    """Append one ``[timestamp, url, title, content]`` row to a CSV file.

    Args:
        storage_location: Path of the CSV file; it and any missing parent
            directories are created on demand.
        url: Address of the monitored page.
        title: Page title.
        content: Full page text.
        timestamp: Detection time, written as ``YYYY-MM-DD HH:MM:SS``.

    File-system, CSV and encoding errors are logged, not raised, so a storage
    problem does not interrupt the caller.
    """
    try:
        destination = Path(storage_location)
        destination.parent.mkdir(parents=True, exist_ok=True)
        record = [
            timestamp.strftime("%Y-%m-%d %H:%M:%S"), url, title, content
        ]
        # newline='' lets the csv module control line endings itself.
        with destination.open("a", newline='', encoding="utf-8") as csvfile:
            csv.writer(csvfile).writerow(record)
    except (OSError, csv.Error, ValueError) as e:
        logger.error(f"Error saving to CSV: {e}")
async def monitor_url(url: str, interval: int, storage_location: str,
                      feed_rss: bool):
    """Poll ``url`` forever and record every change of its content.

    Args:
        url: Address to poll.
        interval: Initial number of seconds to sleep between checks.
        storage_location: CSV path for snapshots; falsy disables CSV output.
        feed_rss: When true, snapshots are also written to the database.

    Each check hashes the page and compares it with the previous hash. A
    change is stored, logged and counted; once the count reaches
    ``CHANGE_FREQUENCY_THRESHOLD`` the interval is halved (never below 60).
    An unchanged page resets the count and doubles the interval, capped at
    ``MAX_MONITORING_INTERVAL``. Errors are logged and the loop continues.
    """
    last_digest = ""
    async with aiohttp.ClientSession() as http:
        while True:
            try:
                page_title, page_body = await fetch_url_content(url, http)
                digest = calculate_hash(page_body)
                if digest == last_digest:
                    # Nothing new: forget the streak and back off.
                    change_counts[url] = 0
                    interval = min(interval * 2, MAX_MONITORING_INTERVAL)
                else:
                    last_digest = digest
                    detected_at = datetime.datetime.now()
                    if feed_rss:
                        await save_to_database(url, page_title, page_body,
                                               digest)
                    if storage_location:
                        save_to_csv(storage_location, url, page_title,
                                    page_body, detected_at)
                    stamp = detected_at.strftime('%Y-%m-%d %H:%M:%S')
                    history.append(f"Change detected at {url} on {stamp}")
                    logger.info(f"Change detected at {url}")
                    streak = change_counts.get(url, 0) + 1
                    change_counts[url] = streak
                    if streak >= CHANGE_FREQUENCY_THRESHOLD:
                        # Busy page: poll more often.
                        interval = max(60, interval // 2)
                url_monitoring_intervals[url] = interval
            except Exception as e:
                failure = f"Error monitoring {url}: {e}"
                logger.error(failure)
                history.append(failure)
            await asyncio.sleep(interval)
async def start_monitoring(urls: List[str], storage_location: str,
                           feed_rss: bool):
    """Start one background monitor per valid URL that is not already running.

    Args:
        urls: Candidate URLs.
        storage_location: CSV path forwarded to each monitor (may be None).
        feed_rss: Forwarded to each monitor; enables database storage.

    Invalid entries are logged and added to ``history``. Tasks are registered
    in ``monitoring_tasks`` under the same URL string the monitor uses, so the
    duplicate check and ``stop_monitoring`` agree on the key.
    """
    for url in urls:
        checked = sanitize_url(url)
        if not checked:
            logger.warning(f"Invalid URL: {url}")
            history.append(f"Invalid URL: {url}")
            continue

        # sanitize_url may signal validity with a bare truth value rather than
        # the cleaned URL; fall back to the input text in that case.
        target = checked if isinstance(checked, str) else url.strip()

        running = monitoring_tasks.get(target)
        if running is not None and not running.done():
            continue  # already being monitored

        monitoring_tasks[target] = asyncio.create_task(
            monitor_url(target, DEFAULT_MONITORING_INTERVAL,
                        storage_location, feed_rss))
def stop_monitoring(url: str):
    """Cancel and unregister the monitor for ``url``; unknown URLs are ignored."""
    try:
        task = monitoring_tasks[url]
    except KeyError:
        return
    task.cancel()
    del monitoring_tasks[url]
# Characters that are not allowed in an XML 1.0 document.
_XML_ILLEGAL_CHARS = re.compile(
    r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]")


def _rss_text(value) -> str:
    """Return ``value`` as text with XML-illegal characters removed."""
    return _XML_ILLEGAL_CHARS.sub("", "" if value is None else str(value))


def _rss_pub_date(moment: datetime.datetime) -> str:
    """Format ``moment`` as an RFC 2822 date, treating naive values as UTC."""
    if moment.tzinfo is None:
        moment = moment.replace(tzinfo=datetime.timezone.utc)
    return format_datetime(moment)


def generate_rss_feed():
    """Build an RSS 2.0 document from the 20 most recent stored articles.

    Returns:
        The feed as an XML string, or ``None`` when no database is connected
        or the query fails (the reason is logged).
    """
    # The session factory only exists after create_db_engine() has succeeded.
    session_factory = globals().get("Session")
    if session_factory is None:
        logger.error("Database error: no database connected; cannot build feed")
        return None

    session = session_factory()
    try:
        articles = session.query(Article).order_by(
            Article.timestamp.desc()).limit(20).all()

        rss = ET.Element("rss", version="2.0")
        channel = ET.SubElement(rss, "channel")
        ET.SubElement(channel, "title").text = 'Website Changes Feed'
        ET.SubElement(channel, "link").text = 'http://yourwebsite.com/feed'
        ET.SubElement(channel, "description").text = (
            'Feed of changes detected on monitored websites.')

        for article in articles:
            item = ET.SubElement(channel, "item")
            ET.SubElement(item, "title").text = _rss_text(article.title)
            ET.SubElement(item, "link").text = _rss_text(article.url)
            ET.SubElement(item, "description").text = _rss_text(
                article.content)
            if article.timestamp is not None:
                ET.SubElement(item, "pubDate").text = _rss_pub_date(
                    article.timestamp)

        return ('<?xml version="1.0" encoding="utf-8"?>\n' +
                ET.tostring(rss, encoding="unicode"))
    except SQLAlchemyError as e:
        logger.error(f"Database error: {e}")
        return None
    finally:
        session.close()
async def chatbot_response(message: str, history: List[dict]):
    """Answer ``message`` with the hosted chat model and extend the chat log.

    Args:
        message: Text the user just sent.
        history: Conversation so far as ``{"role": ..., "content": ...}``
            dicts; ``None`` is treated as an empty conversation.

    Returns:
        ``(history, history)`` - the same list twice - after the user message
        and the assistant reply (or an error notice) were appended. A blank
        message leaves the conversation untouched.
    """
    if history is None:
        history = []
    if not message or not message.strip():
        return history, history

    # Send only role/content pairs; stored entries may carry extra UI fields.
    conversation = [{
        "role": turn["role"],
        "content": turn["content"]
    } for turn in history if isinstance(turn, dict)
                    and turn.get("role") in ("system", "user", "assistant")
                    and isinstance(turn.get("content"), str)]
    conversation.append({"role": "user", "content": message})

    try:
        client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1",
                                 token=HUGGINGFACE_API_KEY)
        # The client call blocks, so run it in the default executor instead of
        # awaiting it directly.
        loop = asyncio.get_running_loop()
        completion = await loop.run_in_executor(
            None,
            lambda: client.chat_completion(messages=conversation,
                                           max_tokens=512))
        reply = completion.choices[0].message.content
    except Exception as e:
        logger.error(f"Chatbot error: {e}")
        reply = "Error: Could not get a response from the chatbot."

    history.append({"role": "user", "content": message})  # Add user message
    history.append({"role": "assistant", "content": reply})  # Add reply/error
    return history, history
def create_db_engine(db_url):
    """Connect to ``db_url``, create the tables and publish the session factory.

    Args:
        db_url: SQLAlchemy database URL, e.g. ``sqlite:///monitoring.db``.

    Returns:
        A status message for the UI: success, or ``"Database error: ..."``.

    The module-level ``engine`` and ``Session`` are replaced only after the
    new engine has been created and the tables exist, so a failed attempt
    leaves an earlier working connection in place.
    """
    global engine, Session

    db_url = (db_url or "").strip()
    if not db_url:
        return "Database error: no database URL provided."

    new_engine = None
    try:
        new_engine = create_engine(db_url)
        Base.metadata.create_all(new_engine)
    except (SQLAlchemyError, ImportError) as e:
        # ImportError covers a URL whose database driver is not installed.
        if new_engine is not None:
            new_engine.dispose()
        logger.error(f"Database error: {e}")
        return f"Database error: {e}"

    previous_engine = engine
    engine = new_engine
    Session = sessionmaker(bind=new_engine)
    if previous_engine is not None:
        previous_engine.dispose()  # release the replaced engine's connections
    return "Database connected successfully!"
# Gradio interface | |
with gr.Blocks() as demo:
    gr.Markdown("# Website Monitor and Chatbot")
    with gr.Row():
        with gr.Column():  # Side pane for database configuration
            db_url = gr.Textbox(label="Database URL",
                                placeholder="e.g., sqlite:///monitoring.db")
            db_connect_button = gr.Button("Connect to Database")
            db_status = gr.Textbox(label="Database Status",
                                   interactive=False,
                                   value="Not connected")
            db_connect_button.click(create_db_engine,
                                    inputs=db_url,
                                    outputs=db_status)
        with gr.Column():  # Main pane for monitoring and chatbot
            with gr.Tab("Configuration"):
                target_urls = gr.Textbox(
                    label="Target URLs (comma-separated)",
                    placeholder=
                    "https://example.com, https://another-site.com")
                storage_location = gr.Textbox(
                    label="Storage Location (CSV file path)",
                    placeholder="/path/to/your/file.csv")
                feed_rss_checkbox = gr.Checkbox(label="Enable RSS Feed")
                start_button = gr.Button("Start Monitoring")
                stop_button = gr.Button("Stop Monitoring")
                status_text = gr.Textbox(label="Status", interactive=False)
                history_text = gr.Textbox(label="History",
                                          lines=10,
                                          interactive=False)
            with gr.Tab("User-End View"):
                feed_content = gr.JSON(label="RSS Feed Content")
            with gr.Tab("Chatbot"):
                chatbot_interface = gr.Chatbot(type='messages')
                message_input = gr.Textbox(
                    placeholder="Type your message here...")
                send_button = gr.Button("Send")

    def _history_as_text() -> str:
        """Render the module-level history log for the History box."""
        return "\n".join(history)

    async def on_start_click(target_urls_str: str, storage_loc: str,
                             feed_enabled: bool):
        """Start monitors for the entered URLs; return status and history."""
        # Drop empty entries such as the one produced by a trailing comma.
        urls = [
            entry.strip() for entry in (target_urls_str or "").split(",")
            if entry.strip()
        ]
        if not urls:
            return "No URLs provided.", _history_as_text()
        csv_path = (storage_loc or "").strip() or None
        await start_monitoring(urls, csv_path, feed_enabled)
        return "Monitoring started for valid URLs.", _history_as_text()

    async def on_stop_click():
        """Stop every running monitor; return status and history."""
        # Iterate over a copy because stop_monitoring removes entries.
        for url in list(monitoring_tasks):
            stop_monitoring(url)
        return "Monitoring stopped for all URLs.", _history_as_text()

    async def on_send_click(message: str, chat_history):
        """Forward a chat message and clear the input box afterwards."""
        updated_history, _ = await chatbot_response(message, chat_history
                                                    or [])
        return updated_history, ""

    start_button.click(
        on_start_click,
        inputs=[target_urls, storage_location, feed_rss_checkbox],
        outputs=[status_text, history_text])
    stop_button.click(on_stop_click, outputs=[status_text, history_text])
    # Each output component is listed once: the chat window receives the
    # updated conversation and the message box is reset.
    send_button.click(
        on_send_click,
        inputs=[message_input, chatbot_interface],
        outputs=[chatbot_interface, message_input])
async def update_feed_content():
    """Return the freshly generated RSS feed (``None`` if it is unavailable)."""
    return generate_rss_feed()


# Periodic update loop
async def periodic_update():
    """Regenerate the feed every five minutes until cancelled.

    A failing refresh is logged and the loop keeps running.
    NOTE(review): the refreshed feed is not pushed to any UI component here -
    confirm whether it should update the "RSS Feed Content" view.
    """
    while True:
        await asyncio.sleep(300)  # Wait for 5 minutes
        try:
            await update_feed_content()
        except Exception as e:
            logger.error(f"Periodic feed update failed: {e}")


# Handle of the background refresh task; created on first page load.
_periodic_update_task = None


async def _ensure_periodic_update():
    """Start the periodic refresh once an event loop is actually running."""
    global _periodic_update_task
    if _periodic_update_task is None or _periodic_update_task.done():
        _periodic_update_task = asyncio.create_task(periodic_update())


# Start the periodic update task
# Creating the task at import time fails because no event loop is running yet,
# so it is started from a page-load event handled inside the app's loop.
with demo:
    demo.load(_ensure_periodic_update)
def main() -> None:
    """Launch the Gradio application."""
    demo.launch()


# Only start the web server when the file is executed as a script.
if __name__ == "__main__":
    main()