acecalisto3 committed
Commit 8540cd7 · verified · 1 Parent(s): 923ad2e

Update app.py

Files changed (1): app.py (+45 -21)
app.py CHANGED
@@ -6,13 +6,19 @@ import hashlib
 import threading
 from pathlib import Path
 import logging
+from typing import List, Tuple
 
 import gradio as gr
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
-from selenium.common.exceptions import WebDriverException, NoSuchElementException
+from selenium.common.exceptions import (
+    WebDriverException,
+    NoSuchElementException,
+    TimeoutException,
+    StaleElementReferenceException,
+)
 from webdriver_manager.chrome import ChromeDriverManager
 from huggingface_hub import InferenceClient
 import mysql.connector
@@ -20,7 +26,7 @@ import feedparser # For parsing RSS feeds
 import sqlite3 # For simple local storage if needed
 
 # Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s')
 
 # Configuration (replace with your actual values or environment variables)
 DB_HOST = os.environ.get("DB_HOST", "your_host")
@@ -28,6 +34,9 @@ DB_USER = os.environ.get("DB_USER", "your_user")
 DB_PASSWORD = os.environ.get("DB_PASSWORD", "your_password")
 DB_NAME = os.environ.get("DB_NAME", "your_database")
 HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY") # Add API key
+DEFAULT_MONITORING_INTERVAL = 300 # 5 minutes in seconds
+MAX_MONITORING_INTERVAL = 600 # 10 minutes in seconds
+CHANGE_FREQUENCY_THRESHOLD = 3 # Number of changes to trigger faster monitoring
 
 # Global variables
 monitoring_thread = None
@@ -36,6 +45,7 @@ db_connection = None
 current_task = None
 history = []
 url_monitoring_intervals = {} # Store monitoring intervals for each URL
+change_counts = {} # Track change frequency for each URL
 
 # Function to establish a database connection
 def get_db_connection():
@@ -77,28 +87,36 @@ def create_articles_table():
 create_articles_table()
 
 # Function to monitor URLs for changes
-def monitor_urls(target_urls, storage_location, feed_rss, stop_event):
-    global history, url_monitoring_intervals
+def monitor_urls(target_urls: List[str], storage_location: str, feed_rss: bool, stop_event: threading.Event):
+    global history, url_monitoring_intervals, change_counts
     previous_hashes = {url: "" for url in target_urls}
     options = Options()
     options.headless = True
     options.add_argument("--disable-gpu")
     options.add_argument("--no-sandbox")
     options.add_argument("--disable-dev-shm-usage")
+    options.add_experimental_option("excludeSwitches", ["enable-logging"]) # Suppress unnecessary logs
 
     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+    driver.implicitly_wait(10) # Implicit wait for elements
 
     try:
         while not stop_event.is_set():
             for url in target_urls:
                 try:
                     # Dynamic monitoring interval
-                    interval = url_monitoring_intervals.get(url, 300) # Default 5 minutes
+                    interval = url_monitoring_intervals.get(url, DEFAULT_MONITORING_INTERVAL)
 
                     driver.get(url)
                     time.sleep(2) # Allow page to load
-                    current_content = driver.page_source
-                    current_hash = hashlib.md5(current_content.encode('utf-8')).hexdigest()
+
+                    # Check for changes
+                    try:
+                        current_content = driver.find_element(By.TAG_NAME, "body").get_attribute("innerHTML")
+                        current_hash = hashlib.md5(current_content.encode('utf-8')).hexdigest()
+                    except (NoSuchElementException, TimeoutException, StaleElementReferenceException) as e:
+                        logging.warning(f"Error getting content for {url}: {e}")
+                        continue
 
                     if current_hash != previous_hashes[url]:
                         previous_hashes[url] = current_hash
@@ -118,12 +136,17 @@ def monitor_urls(target_urls, storage_location, feed_rss, stop_event):
                         if feed_rss:
                             save_to_database(url, title, current_content, current_hash)
 
-                        # Adjust monitoring interval based on change frequency (example)
-                        url_monitoring_intervals[url] = 60 # Check more frequently after a change
+                        # Adjust monitoring interval based on change frequency
+                        change_counts[url] = change_counts.get(url, 0) + 1
+                        if change_counts[url] >= CHANGE_FREQUENCY_THRESHOLD:
+                            url_monitoring_intervals[url] = 60 # Check more frequently after multiple changes
+                        else:
+                            url_monitoring_intervals[url] = min(interval + 60, MAX_MONITORING_INTERVAL) # Gradually increase interval
 
                     else:
-                        # Increase interval if no changes detected (example)
-                        url_monitoring_intervals[url] = min(url_monitoring_intervals[url] + 60, 600) # Max 10 min
+                        # Increase interval if no changes detected
+                        change_counts[url] = 0 # Reset change count if no change
+                        url_monitoring_intervals[url] = min(interval + 60, MAX_MONITORING_INTERVAL)
 
                 except WebDriverException as e:
                     logging.error(f"Error accessing {url}: {e}")
@@ -132,7 +155,7 @@ def monitor_urls(target_urls, storage_location, feed_rss, stop_event):
                     break # Exit inner loop if stop event is set
 
             if not stop_event.is_set():
-                time.sleep(interval)
+                time.sleep(interval)
 
     except Exception as e:
         logging.error(f"Unexpected error in monitoring thread: {e}")
@@ -141,7 +164,7 @@ def monitor_urls(target_urls, storage_location, feed_rss, stop_event):
        logging.info("Monitoring thread has been stopped.")
 
 # Function to save data to local storage (CSV)
-def save_to_storage(storage_location, url, title, content, timestamp):
+def save_to_storage(storage_location: str, url: str, title: str, content: str, timestamp: datetime.datetime):
     try:
         with open(storage_location, "a", newline='', encoding='utf-8') as csvfile:
             csv_writer = csv.writer(csvfile)
@@ -150,7 +173,7 @@ def save_to_storage(storage_location, url, title, content, timestamp):
         logging.error(f"Error saving to storage: {e}")
 
 # Function to save data to the database
-def save_to_database(url, title, content, hash):
+def save_to_database(url: str, title: str, content: str, hash: str):
     conn = get_db_connection()
     if conn:
         cursor = conn.cursor()
@@ -195,14 +218,15 @@ def generate_rss_feed():
    return None
 
 # Function to start monitoring
-def start_monitoring(target_urls, storage_location, feed_rss):
-    global monitoring_thread, stop_event, current_task, history
+def start_monitoring(target_urls: List[str], storage_location: str, feed_rss: bool):
+    global monitoring_thread, stop_event, current_task, history, change_counts
     if monitoring_thread and monitoring_thread.is_alive():
         return "Monitoring is already running.", history
 
     stop_event.clear()
     current_task = f"Monitoring URLs: {', '.join(target_urls)}"
     history.append(f"Task started: {current_task}")
+    change_counts = {url: 0 for url in target_urls} # Reset change counts
     monitoring_thread = threading.Thread(
         target=monitor_urls,
         args=(target_urls, storage_location, feed_rss, stop_event),
@@ -224,7 +248,7 @@ def stop_monitoring():
    return "No monitoring task is currently running.", history
 
 # Function to handle chatbot responses
-def chatbot_response(message, history):
+def chatbot_response(message: str, history: List[Tuple[str, str]]):
    try:
        client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)
        response = client.inference(message)
@@ -279,15 +303,15 @@ with gr.Blocks() as demo:
     # --- Event Handlers ---
 
     # Start monitoring button click
-    def on_start_click(target_urls_str, storage_loc, feed_enabled):
+    def on_start_click(target_urls_str: str, storage_loc: str, feed_enabled: bool):
         global history, url_monitoring_intervals
         try:
-            target_urls = [url.strip() for url in target_urls_str.split(",")]
+            target_urls = [url.strip() for url in target_urls_str.split(",")]
             if not all(target_urls):
                 return "Please enter valid URLs.", history
-
+
             # Reset monitoring intervals when starting
-            url_monitoring_intervals = {url: 300 for url in target_urls}
+            url_monitoring_intervals = {url: DEFAULT_MONITORING_INTERVAL for url in target_urls}
 
             status, history = start_monitoring(target_urls, storage_loc if storage_loc else None, feed_enabled)
             return status, history
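
Note on the content-hashing change in the monitor_urls hunk: hashing driver.page_source compares the full serialized document (head metadata, scripts, timestamps), while hashing the body element's innerHTML narrows the comparison to page content. A standalone sketch of that idea, with invented sample strings standing in for live Selenium calls:

import hashlib

def content_hash(fragment):
    # Same hashing step as app.py, applied to whatever fragment is chosen.
    return hashlib.md5(fragment.encode('utf-8')).hexdigest()

# Invented samples: only head metadata differs between the two page versions.
page_v1 = "<head><meta ts='1'></head><body><p>Hello</p></body>"
page_v2 = "<head><meta ts='2'></head><body><p>Hello</p></body>"
body_v1 = "<p>Hello</p>"
body_v2 = "<p>Hello</p>"

assert content_hash(page_v1) != content_hash(page_v2)  # full page registers a spurious change
assert content_hash(body_v1) == content_hash(body_v2)  # body-only comparison stays stable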
 
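
The adaptive-interval policy these hunks introduce can be isolated for testing. A minimal sketch under the commit's constants; the next_interval helper is hypothetical and not part of app.py:

# Sketch of the adaptive-interval policy; only the constants mirror app.py.
DEFAULT_MONITORING_INTERVAL = 300  # 5 minutes in seconds
MAX_MONITORING_INTERVAL = 600      # 10 minutes in seconds
CHANGE_FREQUENCY_THRESHOLD = 3     # changes needed before fast polling

def next_interval(interval, change_count, changed):
    """Return (new_interval, new_change_count) after one monitoring pass."""
    if changed:
        change_count += 1
        if change_count >= CHANGE_FREQUENCY_THRESHOLD:
            return 60, change_count  # hot page: poll every minute
        return min(interval + 60, MAX_MONITORING_INTERVAL), change_count
    return min(interval + 60, MAX_MONITORING_INTERVAL), 0  # quiet page: back off, reset count

# Example: the third consecutive change drops the interval to 60 seconds.
interval, count = DEFAULT_MONITORING_INTERVAL, 0
for _ in range(3):
    interval, count = next_interval(interval, count, changed=True)
assert (interval, count) == (60, 3)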
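
Shutdown in this code relies on the standard threading.Event pattern visible in the hunks (stop_event.is_set() in the monitoring loop, stop_event.clear() in start_monitoring). A self-contained sketch of that pattern, independent of app.py:

import threading
import time

stop_event = threading.Event()

def worker(stop_event):
    # Loop until another thread sets the event, as monitor_urls does.
    while not stop_event.is_set():
        time.sleep(0.1)  # one monitoring pass would run here

thread = threading.Thread(target=worker, args=(stop_event,), daemon=True)
thread.start()
time.sleep(0.3)
stop_event.set()   # the same signal stop_monitoring() sends
thread.join()
print("worker stopped cleanly")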