import os
import asyncio
import csv
import logging
from typing import List, Optional, Tuple
from dotenv import load_dotenv
import aiohttp
import gradio as gr
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.exc import SQLAlchemyError
from bs4 import BeautifulSoup
import hashlib
import datetime
from huggingface_hub import AsyncInferenceClient
import validators
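# The imports above imply roughly the following third-party dependencies; on a
# Hugging Face Space these would normally be pinned in requirements.txt
# (package names are inferred from the imports, not taken from the repository):
#
#   python-dotenv
#   aiohttp
#   gradio
#   sqlalchemy
#   beautifulsoup4
#   huggingface_hub
#   validators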
# Load environment variables
load_dotenv()
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Configuration
DB_URL = os.getenv('DB_URL', 'sqlite:///monitoring.db')
HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')
DEFAULT_MONITORING_INTERVAL = 300  # initial seconds between checks for each URL
MAX_MONITORING_INTERVAL = 600      # upper bound on the adaptive check interval, in seconds
CHANGE_FREQUENCY_THRESHOLD = 3     # detected changes before the interval is halved
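# A minimal example .env for local runs (values are placeholders, not real
# credentials; DB_URL falls back to a local SQLite file when unset):
#
#   DB_URL=sqlite:///monitoring.db
#   HUGGINGFACE_API_KEY=hf_xxxxxxxxxxxxxxxxxxxx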
# Database setup
Base = declarative_base()
class Article(Base):
    __tablename__ = 'articles'
    id = Column(Integer, primary_key=True)
    url = Column(String(255), nullable=False)
    title = Column(String(255))
    content = Column(Text)
    hash = Column(String(32))
    timestamp = Column(DateTime, default=datetime.datetime.utcnow)
engine = create_engine(DB_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Global variables
monitoring_tasks = {}          # url -> asyncio.Task running monitor_url for that URL
url_monitoring_intervals = {}  # url -> current adaptive check interval in seconds
change_counts = {}             # url -> consecutive changes detected since the last quiet check
history = []                   # human-readable log of detected changes and errors
# Utility functions
def sanitize_url(url: str) -> Optional[str]:
    # validators.url() returns True for a valid URL and a falsy ValidationError
    # otherwise, so return the original string rather than the validation result
    return url if validators.url(url) else None
async def fetch_url_content(url: str, session: aiohttp.ClientSession) -> Tuple[str, str]:
    async with session.get(url) as response:
        content = await response.text()
        soup = BeautifulSoup(content, 'html.parser')
        title = soup.title.string if soup.title else "No Title"
        return title, content

def calculate_hash(content: str) -> str:
    return hashlib.md5(content.encode('utf-8')).hexdigest()
async def save_to_database(url: str, title: str, content: str, hash: str):
    session = Session()
    try:
        article = Article(url=url, title=title, content=content, hash=hash)
        session.add(article)
        session.commit()
    except SQLAlchemyError as e:
        logger.error(f"Database error: {e}")
        session.rollback()
    finally:
        session.close()
def save_to_csv(storage_location: str, url: str, title: str, content: str, timestamp: datetime.datetime):
    try:
        with open(storage_location, "a", newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow([timestamp.strftime("%Y-%m-%d %H:%M:%S"), url, title, content])
    except Exception as e:
        logger.error(f"Error saving to CSV: {e}")
async def monitor_url(url: str, interval: int, storage_location: str, feed_rss: bool):
    previous_hash = ""
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                title, content = await fetch_url_content(url, session)
                current_hash = calculate_hash(content)
                if current_hash != previous_hash:
                    previous_hash = current_hash
                    timestamp = datetime.datetime.now()
                    if feed_rss:
                        await save_to_database(url, title, content, current_hash)
                    if storage_location:
                        save_to_csv(storage_location, url, title, content, timestamp)
                    history.append(f"Change detected at {url} on {timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
                    logger.info(f"Change detected at {url}")
                    change_counts[url] = change_counts.get(url, 0) + 1
                    if change_counts[url] >= CHANGE_FREQUENCY_THRESHOLD:
                        # Frequent changes: poll more often, but never below 60 seconds
                        interval = max(60, interval // 2)
                else:
                    # No change: reset the counter and back off, up to the maximum interval
                    change_counts[url] = 0
                    interval = min(interval * 2, MAX_MONITORING_INTERVAL)
                url_monitoring_intervals[url] = interval
            except Exception as e:
                logger.error(f"Error monitoring {url}: {e}")
                history.append(f"Error monitoring {url}: {e}")
            await asyncio.sleep(interval)
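# Worked example of the adaptive interval (using the defaults above): a URL
# starts at 300 s; the first two detected changes leave the interval unchanged,
# the third halves it to 150 s, and further changes keep halving it down to the
# 60 s floor. A check that finds no change resets the counter and doubles the
# interval again, capped at MAX_MONITORING_INTERVAL (600 s).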
async def start_monitoring(urls: List[str], storage_location: str, feed_rss: bool):
    for url in urls:
        if url not in monitoring_tasks:
            sanitized_url = sanitize_url(url)
            if sanitized_url:
                task = asyncio.create_task(monitor_url(sanitized_url, DEFAULT_MONITORING_INTERVAL, storage_location, feed_rss))
                monitoring_tasks[sanitized_url] = task
            else:
                logger.warning(f"Invalid URL: {url}")
                history.append(f"Invalid URL: {url}")
def stop_monitoring(url: str):
    if url in monitoring_tasks:
        monitoring_tasks[url].cancel()
        del monitoring_tasks[url]
def generate_rss_feed():
    session = Session()
    try:
        articles = session.query(Article).order_by(Article.timestamp.desc()).limit(20).all()
        # Build a plain dictionary: the result is rendered by a gr.JSON component,
        # so timestamps are converted to ISO strings to keep it serializable
        return {
            'title': 'Website Changes Feed',
            'link': 'http://yourwebsite.com/feed',
            'description': 'Feed of changes detected on monitored websites.',
            'entries': [
                {
                    'title': article.title,
                    'link': article.url,
                    'description': article.content,
                    'published': article.timestamp.isoformat() if article.timestamp else None,
                }
                for article in articles
            ],
        }
    except SQLAlchemyError as e:
        logger.error(f"Database error: {e}")
        return None
    finally:
        session.close()
async def chatbot_response(message: str, chat_history: List[Tuple[str, str]]):
    chat_history = chat_history or []
    try:
        client = AsyncInferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)
        response = await client.text_generation(message, max_new_tokens=512)
        chat_history.append((message, response))
    except Exception as e:
        logger.error(f"Chatbot error: {e}")
        chat_history.append((message, "Error: Could not get a response from the chatbot."))
    return chat_history
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Website Monitor and Chatbot")

    with gr.Tab("Configuration"):
        target_urls = gr.Textbox(label="Target URLs (comma-separated)", placeholder="https://example.com, https://another-site.com")
        storage_location = gr.Textbox(label="Storage Location (CSV file path)", placeholder="/path/to/your/file.csv")
        feed_rss_checkbox = gr.Checkbox(label="Enable RSS Feed")
        start_button = gr.Button("Start Monitoring")
        stop_button = gr.Button("Stop Monitoring")
        status_text = gr.Textbox(label="Status", interactive=False)
        history_text = gr.Textbox(label="History", lines=10, interactive=False)

    with gr.Tab("User-End View"):
        feed_content = gr.JSON(label="RSS Feed Content")

    with gr.Tab("Chatbot"):
        chatbot_interface = gr.Chatbot()
        message_input = gr.Textbox(placeholder="Type your message here...")
        send_button = gr.Button("Send")
    async def on_start_click(target_urls_str: str, storage_loc: str, feed_enabled: bool):
        urls = [url.strip() for url in target_urls_str.split(",") if url.strip()]
        await start_monitoring(urls, storage_loc if storage_loc else None, feed_enabled)
        return "Monitoring started for valid URLs."

    async def on_stop_click():
        for url in list(monitoring_tasks.keys()):
            stop_monitoring(url)
        return "Monitoring stopped for all URLs."

    start_button.click(on_start_click, inputs=[target_urls, storage_location, feed_rss_checkbox], outputs=[status_text])
    stop_button.click(on_stop_click, outputs=[status_text])
    send_button.click(chatbot_response, inputs=[message_input, chatbot_interface], outputs=[chatbot_interface])

    async def update_feed_content():
        return generate_rss_feed()

    # Refresh the feed view every 5 seconds; gr.Timer with a .tick event assumes
    # Gradio 4.38 or newer (earlier releases used the `every=` listener argument)
    feed_updater = gr.Timer(5)
    feed_updater.tick(update_feed_content, outputs=[feed_content])
if __name__ == "__main__":
    demo.launch()