Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -6,13 +6,19 @@ import hashlib
|
|
6 |
import threading
|
7 |
from pathlib import Path
|
8 |
import logging
|
|
|
9 |
|
10 |
import gradio as gr
|
11 |
from selenium import webdriver
|
12 |
from selenium.webdriver.chrome.service import Service
|
13 |
from selenium.webdriver.chrome.options import Options
|
14 |
from selenium.webdriver.common.by import By
|
15 |
-
from selenium.common.exceptions import
|
|
|
|
|
|
|
|
|
|
|
16 |
from webdriver_manager.chrome import ChromeDriverManager
|
17 |
from huggingface_hub import InferenceClient
|
18 |
import mysql.connector
|
@@ -20,7 +26,7 @@ import feedparser # For parsing RSS feeds
|
|
20 |
import sqlite3 # For simple local storage if needed
|
21 |
|
22 |
# Configure logging
|
23 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
24 |
|
25 |
# Configuration (replace with your actual values or environment variables)
|
26 |
DB_HOST = os.environ.get("DB_HOST", "your_host")
|
@@ -28,6 +34,9 @@ DB_USER = os.environ.get("DB_USER", "your_user")
|
|
28 |
DB_PASSWORD = os.environ.get("DB_PASSWORD", "your_password")
|
29 |
DB_NAME = os.environ.get("DB_NAME", "your_database")
|
30 |
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY") # Add API key
|
|
|
|
|
|
|
31 |
|
32 |
# Global variables
|
33 |
monitoring_thread = None
|
@@ -36,6 +45,7 @@ db_connection = None
|
|
36 |
current_task = None
|
37 |
history = []
|
38 |
url_monitoring_intervals = {} # Store monitoring intervals for each URL
|
|
|
39 |
|
40 |
# Function to establish a database connection
|
41 |
def get_db_connection():
|
@@ -77,28 +87,36 @@ def create_articles_table():
|
|
77 |
create_articles_table()
|
78 |
|
79 |
# Function to monitor URLs for changes
|
80 |
-
def monitor_urls(target_urls, storage_location, feed_rss, stop_event):
|
81 |
-
global history, url_monitoring_intervals
|
82 |
previous_hashes = {url: "" for url in target_urls}
|
83 |
options = Options()
|
84 |
options.headless = True
|
85 |
options.add_argument("--disable-gpu")
|
86 |
options.add_argument("--no-sandbox")
|
87 |
options.add_argument("--disable-dev-shm-usage")
|
|
|
88 |
|
89 |
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
|
|
90 |
|
91 |
try:
|
92 |
while not stop_event.is_set():
|
93 |
for url in target_urls:
|
94 |
try:
|
95 |
# Dynamic monitoring interval
|
96 |
-
interval = url_monitoring_intervals.get(url,
|
97 |
|
98 |
driver.get(url)
|
99 |
time.sleep(2) # Allow page to load
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
if current_hash != previous_hashes[url]:
|
104 |
previous_hashes[url] = current_hash
|
@@ -118,12 +136,17 @@ def monitor_urls(target_urls, storage_location, feed_rss, stop_event):
|
|
118 |
if feed_rss:
|
119 |
save_to_database(url, title, current_content, current_hash)
|
120 |
|
121 |
-
# Adjust monitoring interval based on change frequency
|
122 |
-
|
|
|
|
|
|
|
|
|
123 |
|
124 |
else:
|
125 |
-
# Increase interval if no changes detected
|
126 |
-
|
|
|
127 |
|
128 |
except WebDriverException as e:
|
129 |
logging.error(f"Error accessing {url}: {e}")
|
@@ -132,7 +155,7 @@ def monitor_urls(target_urls, storage_location, feed_rss, stop_event):
|
|
132 |
break # Exit inner loop if stop event is set
|
133 |
|
134 |
if not stop_event.is_set():
|
135 |
-
time.sleep(interval)
|
136 |
|
137 |
except Exception as e:
|
138 |
logging.error(f"Unexpected error in monitoring thread: {e}")
|
@@ -141,7 +164,7 @@ def monitor_urls(target_urls, storage_location, feed_rss, stop_event):
|
|
141 |
logging.info("Monitoring thread has been stopped.")
|
142 |
|
143 |
# Function to save data to local storage (CSV)
|
144 |
-
def save_to_storage(storage_location, url, title, content, timestamp):
|
145 |
try:
|
146 |
with open(storage_location, "a", newline='', encoding='utf-8') as csvfile:
|
147 |
csv_writer = csv.writer(csvfile)
|
@@ -150,7 +173,7 @@ def save_to_storage(storage_location, url, title, content, timestamp):
|
|
150 |
logging.error(f"Error saving to storage: {e}")
|
151 |
|
152 |
# Function to save data to the database
|
153 |
-
def save_to_database(url, title, content, hash):
|
154 |
conn = get_db_connection()
|
155 |
if conn:
|
156 |
cursor = conn.cursor()
|
@@ -195,14 +218,15 @@ def generate_rss_feed():
|
|
195 |
return None
|
196 |
|
197 |
# Function to start monitoring
|
198 |
-
def start_monitoring(target_urls, storage_location, feed_rss):
|
199 |
-
global monitoring_thread, stop_event, current_task, history
|
200 |
if monitoring_thread and monitoring_thread.is_alive():
|
201 |
return "Monitoring is already running.", history
|
202 |
|
203 |
stop_event.clear()
|
204 |
current_task = f"Monitoring URLs: {', '.join(target_urls)}"
|
205 |
history.append(f"Task started: {current_task}")
|
|
|
206 |
monitoring_thread = threading.Thread(
|
207 |
target=monitor_urls,
|
208 |
args=(target_urls, storage_location, feed_rss, stop_event),
|
@@ -224,7 +248,7 @@ def stop_monitoring():
|
|
224 |
return "No monitoring task is currently running.", history
|
225 |
|
226 |
# Function to handle chatbot responses
|
227 |
-
def chatbot_response(message, history):
|
228 |
try:
|
229 |
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)
|
230 |
response = client.inference(message)
|
@@ -279,15 +303,15 @@ with gr.Blocks() as demo:
|
|
279 |
# --- Event Handlers ---
|
280 |
|
281 |
# Start monitoring button click
|
282 |
-
def on_start_click(target_urls_str, storage_loc, feed_enabled):
|
283 |
global history, url_monitoring_intervals
|
284 |
try:
|
285 |
-
target_urls = [url.strip() for url in target_urls_str.split(",")]
|
286 |
if not all(target_urls):
|
287 |
return "Please enter valid URLs.", history
|
288 |
-
|
289 |
# Reset monitoring intervals when starting
|
290 |
-
url_monitoring_intervals = {url:
|
291 |
|
292 |
status, history = start_monitoring(target_urls, storage_loc if storage_loc else None, feed_enabled)
|
293 |
return status, history
|
|
|
6 |
import threading
|
7 |
from pathlib import Path
|
8 |
import logging
|
9 |
+
from typing import List, Tuple
|
10 |
|
11 |
import gradio as gr
|
12 |
from selenium import webdriver
|
13 |
from selenium.webdriver.chrome.service import Service
|
14 |
from selenium.webdriver.chrome.options import Options
|
15 |
from selenium.webdriver.common.by import By
|
16 |
+
from selenium.common.exceptions import (
|
17 |
+
WebDriverException,
|
18 |
+
NoSuchElementException,
|
19 |
+
TimeoutException,
|
20 |
+
StaleElementReferenceException,
|
21 |
+
)
|
22 |
from webdriver_manager.chrome import ChromeDriverManager
|
23 |
from huggingface_hub import InferenceClient
|
24 |
import mysql.connector
|
|
|
26 |
import sqlite3 # For simple local storage if needed
|
27 |
|
28 |
# Configure logging
|
29 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s')
|
30 |
|
31 |
# Configuration (replace with your actual values or environment variables)
|
32 |
DB_HOST = os.environ.get("DB_HOST", "your_host")
|
|
|
34 |
DB_PASSWORD = os.environ.get("DB_PASSWORD", "your_password")
|
35 |
DB_NAME = os.environ.get("DB_NAME", "your_database")
|
36 |
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY") # Add API key
|
37 |
+
DEFAULT_MONITORING_INTERVAL = 300 # 5 minutes in seconds
|
38 |
+
MAX_MONITORING_INTERVAL = 600 # 10 minutes in seconds
|
39 |
+
CHANGE_FREQUENCY_THRESHOLD = 3 # Number of changes to trigger faster monitoring
|
40 |
|
41 |
# Global variables
|
42 |
monitoring_thread = None
|
|
|
45 |
current_task = None
|
46 |
history = []
|
47 |
url_monitoring_intervals = {} # Store monitoring intervals for each URL
|
48 |
+
change_counts = {} # Track change frequency for each URL
|
49 |
|
50 |
# Function to establish a database connection
|
51 |
def get_db_connection():
|
|
|
87 |
create_articles_table()
|
88 |
|
89 |
# Function to monitor URLs for changes
|
90 |
+
def monitor_urls(target_urls: List[str], storage_location: str, feed_rss: bool, stop_event: threading.Event):
|
91 |
+
global history, url_monitoring_intervals, change_counts
|
92 |
previous_hashes = {url: "" for url in target_urls}
|
93 |
options = Options()
|
94 |
options.headless = True
|
95 |
options.add_argument("--disable-gpu")
|
96 |
options.add_argument("--no-sandbox")
|
97 |
options.add_argument("--disable-dev-shm-usage")
|
98 |
+
options.add_experimental_option("excludeSwitches", ["enable-logging"]) # Suppress unnecessary logs
|
99 |
|
100 |
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
101 |
+
driver.implicitly_wait(10) # Implicit wait for elements
|
102 |
|
103 |
try:
|
104 |
while not stop_event.is_set():
|
105 |
for url in target_urls:
|
106 |
try:
|
107 |
# Dynamic monitoring interval
|
108 |
+
interval = url_monitoring_intervals.get(url, DEFAULT_MONITORING_INTERVAL)
|
109 |
|
110 |
driver.get(url)
|
111 |
time.sleep(2) # Allow page to load
|
112 |
+
|
113 |
+
# Check for changes
|
114 |
+
try:
|
115 |
+
current_content = driver.find_element(By.TAG_NAME, "body").get_attribute("innerHTML")
|
116 |
+
current_hash = hashlib.md5(current_content.encode('utf-8')).hexdigest()
|
117 |
+
except (NoSuchElementException, TimeoutException, StaleElementReferenceException) as e:
|
118 |
+
logging.warning(f"Error getting content for {url}: {e}")
|
119 |
+
continue
|
120 |
|
121 |
if current_hash != previous_hashes[url]:
|
122 |
previous_hashes[url] = current_hash
|
|
|
136 |
if feed_rss:
|
137 |
save_to_database(url, title, current_content, current_hash)
|
138 |
|
139 |
+
# Adjust monitoring interval based on change frequency
|
140 |
+
change_counts[url] = change_counts.get(url, 0) + 1
|
141 |
+
if change_counts[url] >= CHANGE_FREQUENCY_THRESHOLD:
|
142 |
+
url_monitoring_intervals[url] = 60 # Check more frequently after multiple changes
|
143 |
+
else:
|
144 |
+
url_monitoring_intervals[url] = min(interval + 60, MAX_MONITORING_INTERVAL) # Gradually increase interval
|
145 |
|
146 |
else:
|
147 |
+
# Increase interval if no changes detected
|
148 |
+
change_counts[url] = 0 # Reset change count if no change
|
149 |
+
url_monitoring_intervals[url] = min(interval + 60, MAX_MONITORING_INTERVAL)
|
150 |
|
151 |
except WebDriverException as e:
|
152 |
logging.error(f"Error accessing {url}: {e}")
|
|
|
155 |
break # Exit inner loop if stop event is set
|
156 |
|
157 |
if not stop_event.is_set():
|
158 |
+
time.sleep(interval)
|
159 |
|
160 |
except Exception as e:
|
161 |
logging.error(f"Unexpected error in monitoring thread: {e}")
|
|
|
164 |
logging.info("Monitoring thread has been stopped.")
|
165 |
|
166 |
# Function to save data to local storage (CSV)
|
167 |
+
def save_to_storage(storage_location: str, url: str, title: str, content: str, timestamp: datetime.datetime):
|
168 |
try:
|
169 |
with open(storage_location, "a", newline='', encoding='utf-8') as csvfile:
|
170 |
csv_writer = csv.writer(csvfile)
|
|
|
173 |
logging.error(f"Error saving to storage: {e}")
|
174 |
|
175 |
# Function to save data to the database
|
176 |
+
def save_to_database(url: str, title: str, content: str, hash: str):
|
177 |
conn = get_db_connection()
|
178 |
if conn:
|
179 |
cursor = conn.cursor()
|
|
|
218 |
return None
|
219 |
|
220 |
# Function to start monitoring
|
221 |
+
def start_monitoring(target_urls: List[str], storage_location: str, feed_rss: bool):
|
222 |
+
global monitoring_thread, stop_event, current_task, history, change_counts
|
223 |
if monitoring_thread and monitoring_thread.is_alive():
|
224 |
return "Monitoring is already running.", history
|
225 |
|
226 |
stop_event.clear()
|
227 |
current_task = f"Monitoring URLs: {', '.join(target_urls)}"
|
228 |
history.append(f"Task started: {current_task}")
|
229 |
+
change_counts = {url: 0 for url in target_urls} # Reset change counts
|
230 |
monitoring_thread = threading.Thread(
|
231 |
target=monitor_urls,
|
232 |
args=(target_urls, storage_location, feed_rss, stop_event),
|
|
|
248 |
return "No monitoring task is currently running.", history
|
249 |
|
250 |
# Function to handle chatbot responses
|
251 |
+
def chatbot_response(message: str, history: List[Tuple[str, str]]):
|
252 |
try:
|
253 |
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)
|
254 |
response = client.inference(message)
|
|
|
303 |
# --- Event Handlers ---
|
304 |
|
305 |
# Start monitoring button click
|
306 |
+
def on_start_click(target_urls_str: str, storage_loc: str, feed_enabled: bool):
|
307 |
global history, url_monitoring_intervals
|
308 |
try:
|
309 |
+
target_urls = [url.strip() for url in target_urls_str.split(",")]
|
310 |
if not all(target_urls):
|
311 |
return "Please enter valid URLs.", history
|
312 |
+
|
313 |
# Reset monitoring intervals when starting
|
314 |
+
url_monitoring_intervals = {url: DEFAULT_MONITORING_INTERVAL for url in target_urls}
|
315 |
|
316 |
status, history = start_monitoring(target_urls, storage_loc if storage_loc else None, feed_enabled)
|
317 |
return status, history
|