Spaces:
Runtime error
Runtime error
acecalisto3
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -53,6 +53,7 @@ STOP_THREADS = False # Flag to stop scraping threads
|
|
53 |
def get_db_connection():
|
54 |
"""
|
55 |
Establishes and returns a MySQL database connection using environment variables.
|
|
|
56 |
"""
|
57 |
try:
|
58 |
connection = mysql.connector.connect(
|
@@ -66,11 +67,11 @@ def get_db_connection():
|
|
66 |
return connection
|
67 |
except mysql.connector.Error as err:
|
68 |
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
|
69 |
-
logging.
|
70 |
elif err.errno == errorcode.ER_BAD_DB_ERROR:
|
71 |
-
logging.
|
72 |
else:
|
73 |
-
logging.error
|
74 |
return None
|
75 |
|
76 |
# Initialize Database
|
@@ -80,7 +81,7 @@ def initialize_database():
|
|
80 |
"""
|
81 |
connection = get_db_connection()
|
82 |
if connection is None:
|
83 |
-
logging.
|
84 |
return
|
85 |
|
86 |
cursor = connection.cursor()
|
@@ -124,7 +125,7 @@ def monitor_urls(
|
|
124 |
selector: str = None,
|
125 |
):
|
126 |
"""
|
127 |
-
Monitors the specified URLs for changes and logs any detected changes to the database.
|
128 |
"""
|
129 |
global HISTORY, STOP_THREADS
|
130 |
previous_hashes = {url: "" for url in urls}
|
@@ -171,21 +172,28 @@ def monitor_urls(
|
|
171 |
date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
172 |
HISTORY.append(f"Change detected at {url} on {date_time_str}")
|
173 |
|
174 |
-
#
|
175 |
connection = get_db_connection()
|
176 |
if connection:
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
else:
|
188 |
-
|
|
|
189 |
|
190 |
except (
|
191 |
NoSuchElementException,
|
@@ -199,6 +207,33 @@ def monitor_urls(
|
|
199 |
driver.quit()
|
200 |
logging.info("ChromeDriver session ended.")
|
201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
# Function to create WebDriver
|
203 |
def create_driver(options: Options) -> webdriver.Chrome:
|
204 |
"""
|
@@ -296,21 +331,28 @@ def start_scraping(
|
|
296 |
if initial_hash:
|
297 |
HISTORY.append(f"Initial observation at {url}: {initial_hash}")
|
298 |
|
299 |
-
#
|
300 |
connection = get_db_connection()
|
301 |
if connection:
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
312 |
else:
|
313 |
-
|
|
|
314 |
|
315 |
except Exception as e:
|
316 |
HISTORY.append(f"Error accessing {url}: {e}")
|
@@ -339,91 +381,157 @@ def stop_scraping() -> str:
|
|
339 |
logging.info("Scraping stop signal sent.")
|
340 |
return "Scraping has been stopped."
|
341 |
|
342 |
-
# Function to display CSV content from MySQL
|
343 |
def display_csv(storage_location: str, url: str) -> str:
|
344 |
"""
|
345 |
-
Fetches and returns the scraped data for a given URL from the MySQL database.
|
346 |
"""
|
347 |
try:
|
348 |
connection = get_db_connection()
|
349 |
-
if
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
|
357 |
-
|
358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
|
360 |
-
df = pd.DataFrame(results)
|
361 |
-
cursor.close()
|
362 |
-
connection.close()
|
363 |
-
return df.to_string(index=False)
|
364 |
except Exception as e:
|
365 |
logging.error(f"Error fetching data for {url}: {e}")
|
366 |
return f"Error fetching data for {url}: {e}"
|
367 |
|
368 |
-
# Function to generate RSS feed from MySQL data
|
369 |
def generate_rss_feed(storage_location: str, url: str) -> str:
|
370 |
"""
|
371 |
-
Generates an RSS feed for the latest changes detected on a given URL from the MySQL database.
|
372 |
"""
|
373 |
try:
|
374 |
connection = get_db_connection()
|
375 |
-
|
376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
|
|
|
|
|
|
382 |
|
383 |
-
|
384 |
-
|
|
|
385 |
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
|
390 |
-
|
391 |
-
|
392 |
-
title.text = f"RSS Feed for {urlparse(url).hostname}"
|
393 |
|
394 |
-
|
395 |
-
|
396 |
|
397 |
-
|
398 |
-
|
|
|
399 |
|
400 |
-
|
401 |
-
|
402 |
-
item = ET.SubElement(channel, "item")
|
403 |
|
404 |
-
|
405 |
-
|
406 |
|
407 |
-
|
408 |
-
|
409 |
|
410 |
-
|
411 |
-
|
|
|
|
|
412 |
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
|
|
417 |
|
418 |
-
# Generate the XML string
|
419 |
-
rss_feed = ET.tostring(rss, encoding="utf-8", method="xml")
|
420 |
-
return rss_feed.decode("utf-8")
|
421 |
except Exception as e:
|
422 |
logging.error(f"Error generating RSS feed for {url}: {e}")
|
423 |
return f"Error generating RSS feed for {url}: {e}"
|
424 |
-
finally:
|
425 |
-
cursor.close()
|
426 |
-
connection.close()
|
427 |
|
428 |
# Function to load the Mistral model
|
429 |
def load_model():
|
@@ -500,20 +608,62 @@ def filter_data(column: str, words: list) -> str:
|
|
500 |
Saves the filtered data to a new CSV file.
|
501 |
"""
|
502 |
try:
|
503 |
-
|
504 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
505 |
return "No CSV files found to filter."
|
506 |
|
|
|
|
|
507 |
df = pd.read_csv(latest_csv)
|
508 |
-
|
509 |
-
|
510 |
-
|
|
|
|
|
511 |
|
512 |
if filtered_df.empty:
|
513 |
return f"No records found with words {words} in column '{column}'."
|
514 |
|
515 |
# Save the filtered data to a new CSV
|
516 |
-
filtered_csv = latest_csv.replace(".csv", "
|
517 |
filtered_df.to_csv(filtered_csv, index=False)
|
518 |
logging.info(f"Data filtered on column '{column}' for words {words}.")
|
519 |
return f"Data filtered and saved to {filtered_csv}."
|
@@ -527,16 +677,56 @@ def sort_data(column: str, order: str) -> str:
|
|
527 |
Saves the sorted data to a new CSV file.
|
528 |
"""
|
529 |
try:
|
530 |
-
|
531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
532 |
return "No CSV files found to sort."
|
533 |
|
|
|
|
|
534 |
df = pd.read_csv(latest_csv)
|
|
|
|
|
|
|
|
|
535 |
ascending = True if order.lower() == "ascending" else False
|
536 |
sorted_df = df.sort_values(by=column, ascending=ascending)
|
537 |
|
538 |
# Save the sorted data to a new CSV
|
539 |
-
sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}.csv")
|
540 |
sorted_df.to_csv(sorted_csv, index=False)
|
541 |
logging.info(f"Data sorted on column '{column}' in {order} order.")
|
542 |
return f"Data sorted and saved to {sorted_csv}."
|
@@ -549,12 +739,40 @@ def export_csv(filename: str) -> str:
|
|
549 |
Exports the latest scraped data to a specified CSV filename.
|
550 |
"""
|
551 |
try:
|
552 |
-
|
553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
554 |
return "No CSV files found to export."
|
555 |
|
556 |
-
|
|
|
557 |
df = pd.read_csv(latest_csv)
|
|
|
558 |
df.to_csv(export_path, index=False)
|
559 |
logging.info(f"Data exported to {export_path}.")
|
560 |
return f"Data exported to {export_path}."
|
@@ -564,50 +782,68 @@ def export_csv(filename: str) -> str:
|
|
564 |
|
565 |
def log_action(action: str) -> str:
|
566 |
"""
|
567 |
-
Logs a custom action message to the MySQL database.
|
568 |
"""
|
569 |
try:
|
570 |
connection = get_db_connection()
|
571 |
-
if
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
|
584 |
-
|
585 |
-
|
586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
587 |
except Exception as e:
|
588 |
logging.error(f"Error logging action: {e}")
|
589 |
return f"Error logging action: {e}"
|
590 |
|
|
|
591 |
def get_latest_csv() -> str:
|
592 |
"""
|
593 |
Retrieves the latest CSV file from the storage directory based on modification time.
|
594 |
"""
|
595 |
try:
|
596 |
-
|
597 |
-
if
|
|
|
598 |
return None
|
599 |
|
600 |
-
latest_csv =
|
601 |
-
latest_time = 0
|
602 |
-
for dir_name in storage_dirs:
|
603 |
-
dir_path = os.path.join(DEFAULT_FILE_PATH, dir_name)
|
604 |
-
csv_files = [f for f in os.listdir(dir_path) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
|
605 |
-
for csv_file in csv_files:
|
606 |
-
csv_path = os.path.join(dir_path, csv_file)
|
607 |
-
mod_time = os.path.getmtime(csv_path)
|
608 |
-
if mod_time > latest_time:
|
609 |
-
latest_time = mod_time
|
610 |
-
latest_csv = csv_path
|
611 |
return latest_csv
|
612 |
except Exception as e:
|
613 |
logging.error(f"Error retrieving latest CSV: {e}")
|
@@ -658,30 +894,6 @@ def respond(
|
|
658 |
logging.error(f"Error generating response: {e}")
|
659 |
return "Error generating response."
|
660 |
|
661 |
-
# Function to load the Mistral model
|
662 |
-
def load_model():
    """
    Loads the Mixtral instruct model and its tokenizer and returns a
    text-generation pipeline.

    Returns:
        The transformers pipeline on success, or None if loading fails
        (the error is logged rather than raised).
    """
    model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Mixtral is a decoder-only (causal) model, so it cannot be loaded
        # through a seq2seq auto-class. Handing the checkpoint name to
        # pipeline() lets it pick the model class matching "text-generation".
        pipe = pipeline(
            "text-generation",
            model=model_name,
            tokenizer=tokenizer,
            device=0 if torch.cuda.is_available() else -1,
        )
        logging.info("Mistral model loaded successfully.")
        return pipe
    except Exception as e:
        logging.error(f"Error loading Mistral model: {e}")
        return None
|
681 |
-
|
682 |
-
# Load the model once at the start
|
683 |
-
chat_pipeline = load_model()
|
684 |
-
|
685 |
# Define the Gradio interface
|
686 |
def create_interface() -> gr.Blocks:
|
687 |
"""
|
|
|
53 |
def get_db_connection():
|
54 |
"""
|
55 |
Establishes and returns a MySQL database connection using environment variables.
|
56 |
+
Returns None if connection fails.
|
57 |
"""
|
58 |
try:
|
59 |
connection = mysql.connector.connect(
|
|
|
67 |
return connection
|
68 |
except mysql.connector.Error as err:
|
69 |
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
|
70 |
+
logging.warning("Invalid database credentials. Falling back to CSV storage.")
|
71 |
elif err.errno == errorcode.ER_BAD_DB_ERROR:
|
72 |
+
logging.warning("Database does not exist. Falling back to CSV storage.")
|
73 |
else:
|
74 |
+
logging.warning(f"MySQL connection error: {err}. Falling back to CSV storage.")
|
75 |
return None
|
76 |
|
77 |
# Initialize Database
|
|
|
81 |
"""
|
82 |
connection = get_db_connection()
|
83 |
if connection is None:
|
84 |
+
logging.info("Database initialization skipped. Using CSV storage.")
|
85 |
return
|
86 |
|
87 |
cursor = connection.cursor()
|
|
|
125 |
selector: str = None,
|
126 |
):
|
127 |
"""
|
128 |
+
Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
|
129 |
"""
|
130 |
global HISTORY, STOP_THREADS
|
131 |
previous_hashes = {url: "" for url in urls}
|
|
|
172 |
date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
173 |
HISTORY.append(f"Change detected at {url} on {date_time_str}")
|
174 |
|
175 |
+
# Attempt to log to database
|
176 |
connection = get_db_connection()
|
177 |
if connection:
|
178 |
+
try:
|
179 |
+
cursor = connection.cursor()
|
180 |
+
insert_query = """
|
181 |
+
INSERT INTO scraped_data (url, content_hash, change_detected)
|
182 |
+
VALUES (%s, %s, %s)
|
183 |
+
"""
|
184 |
+
cursor.execute(insert_query, (url, current_hash, date_time_str))
|
185 |
+
connection.commit()
|
186 |
+
logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
|
187 |
+
except mysql.connector.Error as err:
|
188 |
+
logging.error(f"Error inserting data into database: {err}")
|
189 |
+
# Fallback to CSV
|
190 |
+
log_to_csv(storage_location, url, current_hash, date_time_str)
|
191 |
+
finally:
|
192 |
+
cursor.close()
|
193 |
+
connection.close()
|
194 |
else:
|
195 |
+
# Fallback to CSV
|
196 |
+
log_to_csv(storage_location, url, current_hash, date_time_str)
|
197 |
|
198 |
except (
|
199 |
NoSuchElementException,
|
|
|
207 |
driver.quit()
|
208 |
logging.info("ChromeDriver session ended.")
|
209 |
|
210 |
+
def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
    """
    Logs the change to a CSV file in the storage_location.

    The file is named "<hostname>_changes.csv", where the hostname is taken
    from ``url``. A header row is written when the file is new or empty.

    Args:
        storage_location: Directory that holds the CSV file; created if missing.
        url: URL on which the change was observed.
        content_hash: Hash of the observed content.
        change_detected: Timestamp string, normally "YYYY-MM-DD HH:MM:SS".
            A value without a time part is still logged, with an empty time.

    Errors are logged and swallowed so a failed write never interrupts the caller.
    """
    try:
        os.makedirs(storage_location, exist_ok=True)
        csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
        # An existing but empty file still needs its header row.
        needs_header = (
            not os.path.isfile(csv_file_path) or os.path.getsize(csv_file_path) == 0
        )

        # Split once and tolerate a missing component instead of raising
        # IndexError, which would silently drop the row.
        stamp_parts = change_detected.split()
        date_part = stamp_parts[0] if stamp_parts else ""
        time_part = stamp_parts[1] if len(stamp_parts) > 1 else ""

        with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
            fieldnames = ["date", "time", "url", "content_hash", "change"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if needs_header:
                writer.writeheader()
            writer.writerow(
                {
                    "date": date_part,
                    "time": time_part,
                    "url": url,
                    "content_hash": content_hash,
                    "change": "Content changed",
                }
            )
        logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
    except Exception as e:
        logging.error(f"Error logging data to CSV: {e}")
|
236 |
+
|
237 |
# Function to create WebDriver
|
238 |
def create_driver(options: Options) -> webdriver.Chrome:
|
239 |
"""
|
|
|
331 |
if initial_hash:
|
332 |
HISTORY.append(f"Initial observation at {url}: {initial_hash}")
|
333 |
|
334 |
+
# Attempt to log to database
|
335 |
connection = get_db_connection()
|
336 |
if connection:
|
337 |
+
try:
|
338 |
+
cursor = connection.cursor()
|
339 |
+
insert_query = """
|
340 |
+
INSERT INTO scraped_data (url, content_hash, change_detected)
|
341 |
+
VALUES (%s, %s, %s)
|
342 |
+
"""
|
343 |
+
cursor.execute(insert_query, (url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
|
344 |
+
connection.commit()
|
345 |
+
logging.info(f"Initial observation logged for {url} in database.")
|
346 |
+
except mysql.connector.Error as err:
|
347 |
+
logging.error(f"Error inserting initial observation into database: {err}")
|
348 |
+
# Fallback to CSV
|
349 |
+
log_to_csv(storage_location, url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
350 |
+
finally:
|
351 |
+
cursor.close()
|
352 |
+
connection.close()
|
353 |
else:
|
354 |
+
# Fallback to CSV
|
355 |
+
log_to_csv(storage_location, url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
356 |
|
357 |
except Exception as e:
|
358 |
HISTORY.append(f"Error accessing {url}: {e}")
|
|
|
381 |
logging.info("Scraping stop signal sent.")
|
382 |
return "Scraping has been stopped."
|
383 |
|
384 |
+
# Function to display CSV content from MySQL or CSV
|
385 |
def display_csv(storage_location: str, url: str) -> str:
    """
    Fetches and returns the scraped data for a given URL from the MySQL database or CSV.

    The database is tried first. If no connection is available, or the query
    raises a MySQL error, the data is read from "<hostname>_changes.csv"
    inside ``storage_location``.

    Args:
        storage_location: Directory holding the fallback CSV files.
        url: URL whose recorded rows should be displayed.

    Returns:
        The rows rendered as a plain-text table, a "no data" message, or an
        error message if anything unexpected fails.
    """
    try:
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor(dictionary=True)
                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
                cursor.execute(query, (url,))
                results = cursor.fetchall()

                if not results:
                    return "No data available for the selected URL."

                return pd.DataFrame(results).to_string(index=False)
            except mysql.connector.Error as err:
                logging.error(f"Error fetching data from database: {err}")
                # Fall through to the CSV fallback below.
            finally:
                # Release the cursor and connection on every exit path,
                # including the early "no results" return and DB errors.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Fetching data from CSV.")

        # Fallback to CSV
        hostname = urlparse(url).hostname
        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
        if os.path.exists(csv_path):
            return pd.read_csv(csv_path).to_string(index=False)
        return "No data available."
    except Exception as e:
        logging.error(f"Error fetching data for {url}: {e}")
        return f"Error fetching data for {url}: {e}"
|
423 |
|
424 |
+
# Function to generate RSS feed from MySQL or CSV data
|
425 |
def _build_rss_xml(url: str, entries) -> str:
    """
    Renders an RSS 2.0 document for ``url`` from ``entries``.

    Each entry is a tuple ``(item_url, summary, timestamp)`` where
    ``timestamp`` is a "YYYY-MM-DD HH:MM:SS" string.
    """
    rss = ET.Element("rss", version="2.0")
    channel = ET.SubElement(rss, "channel")

    ET.SubElement(channel, "title").text = f"RSS Feed for {urlparse(url).hostname}"
    ET.SubElement(channel, "link").text = url
    ET.SubElement(channel, "description").text = "Recent changes detected on the website."

    for item_url, summary, timestamp in entries:
        item = ET.SubElement(channel, "item")
        ET.SubElement(item, "title").text = f"Change detected at {item_url}"
        ET.SubElement(item, "link").text = item_url
        ET.SubElement(item, "description").text = summary
        # NOTE(review): the stored timestamps are naive but are emitted with
        # a literal +0000 offset; confirm they are actually recorded in UTC.
        ET.SubElement(item, "pubDate").text = datetime.datetime.strptime(
            timestamp, "%Y-%m-%d %H:%M:%S"
        ).strftime("%a, %d %b %Y %H:%M:%S +0000")

    return ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")


def generate_rss_feed(storage_location: str, url: str) -> str:
    """
    Generates an RSS feed for the latest changes detected on a given URL from the MySQL database or CSV.

    The database is tried first (10 most recent rows). If no connection is
    available, or the query raises a MySQL error, the last 10 rows of
    "<hostname>_changes.csv" inside ``storage_location`` are used instead.

    Args:
        storage_location: Directory holding the fallback CSV files.
        url: URL whose changes should be included in the feed.

    Returns:
        The RSS XML as a string, a "no data" message, or an error message if
        anything unexpected fails.
    """
    try:
        connection = get_db_connection()

        if connection:
            cursor = None
            try:
                cursor = connection.cursor(dictionary=True)
                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
                cursor.execute(query, (url,))
                results = cursor.fetchall()

                if not results:
                    return "No changes detected to include in RSS feed."

                entries = [
                    (
                        row["url"],
                        f"Content changed on {row['change_detected']}",
                        str(row["change_detected"]),
                    )
                    for row in results
                ]
                return _build_rss_xml(url, entries)
            except mysql.connector.Error as err:
                logging.error(f"Error fetching data from database: {err}")
                # Fall through to the CSV fallback below.
            finally:
                # Release the cursor and connection on every exit path,
                # including the early "no results" return and DB errors.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Generating RSS feed from CSV.")

        # Fallback to CSV
        hostname = urlparse(url).hostname
        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
        if not os.path.exists(csv_path):
            return "No data available."

        df = pd.read_csv(csv_path).tail(10)
        if df.empty:
            return "No changes detected to include in RSS feed."

        entries = [
            (
                row["url"],
                f"Content changed on {row['date']} at {row['time']}",
                f"{row['date']} {row['time']}",
            )
            for _, row in df.iterrows()
        ]
        return _build_rss_xml(url, entries)
    except Exception as e:
        logging.error(f"Error generating RSS feed for {url}: {e}")
        return f"Error generating RSS feed for {url}: {e}"
|
|
|
|
|
|
|
535 |
|
536 |
# Function to load the Mistral model
|
537 |
def load_model():
|
|
|
608 |
Saves the filtered data to a new CSV file.
|
609 |
"""
|
610 |
try:
|
611 |
+
storage_location = DEFAULT_FILE_PATH
|
612 |
+
url = "" # Placeholder since filtering isn't URL-specific here
|
613 |
+
|
614 |
+
connection = get_db_connection()
|
615 |
+
if connection:
|
616 |
+
try:
|
617 |
+
cursor = connection.cursor(dictionary=True)
|
618 |
+
# Fetch all data
|
619 |
+
query = "SELECT * FROM scraped_data"
|
620 |
+
cursor.execute(query)
|
621 |
+
results = cursor.fetchall()
|
622 |
+
|
623 |
+
if not results:
|
624 |
+
return "No data available to filter."
|
625 |
+
|
626 |
+
df = pd.DataFrame(results)
|
627 |
+
# Create a regex pattern to match any of the words
|
628 |
+
pattern = '|'.join(words)
|
629 |
+
if column not in df.columns:
|
630 |
+
return f"Column '{column}' does not exist in the data."
|
631 |
+
|
632 |
+
filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
|
633 |
+
|
634 |
+
if filtered_df.empty:
|
635 |
+
return f"No records found with words {words} in column '{column}'."
|
636 |
+
|
637 |
+
# Save the filtered data to a new CSV
|
638 |
+
filtered_csv = os.path.join(storage_location, f"filtered_data_{int(time.time())}.csv")
|
639 |
+
filtered_df.to_csv(filtered_csv, index=False)
|
640 |
+
logging.info(f"Data filtered on column '{column}' for words {words}.")
|
641 |
+
return f"Data filtered and saved to {filtered_csv}."
|
642 |
+
except mysql.connector.Error as err:
|
643 |
+
logging.error(f"Error fetching data from database: {err}")
|
644 |
+
# Fallback to CSV
|
645 |
+
else:
|
646 |
+
logging.info("No database connection. Filtering data from CSV.")
|
647 |
+
|
648 |
+
# Fallback to CSV
|
649 |
+
csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
|
650 |
+
if not csv_files:
|
651 |
return "No CSV files found to filter."
|
652 |
|
653 |
+
# Assume the latest CSV is the target
|
654 |
+
latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
|
655 |
df = pd.read_csv(latest_csv)
|
656 |
+
|
657 |
+
if column not in df.columns:
|
658 |
+
return f"Column '{column}' does not exist in the data."
|
659 |
+
|
660 |
+
filtered_df = df[df[column].astype(str).str.contains('|'.join(words), case=False, na=False)]
|
661 |
|
662 |
if filtered_df.empty:
|
663 |
return f"No records found with words {words} in column '{column}'."
|
664 |
|
665 |
# Save the filtered data to a new CSV
|
666 |
+
filtered_csv = latest_csv.replace(".csv", f"_filtered_{int(time.time())}.csv")
|
667 |
filtered_df.to_csv(filtered_csv, index=False)
|
668 |
logging.info(f"Data filtered on column '{column}' for words {words}.")
|
669 |
return f"Data filtered and saved to {filtered_csv}."
|
|
|
677 |
Saves the sorted data to a new CSV file.
|
678 |
"""
|
679 |
try:
|
680 |
+
storage_location = DEFAULT_FILE_PATH
|
681 |
+
url = "" # Placeholder since sorting isn't URL-specific here
|
682 |
+
|
683 |
+
connection = get_db_connection()
|
684 |
+
if connection:
|
685 |
+
try:
|
686 |
+
cursor = connection.cursor(dictionary=True)
|
687 |
+
# Fetch all data
|
688 |
+
query = "SELECT * FROM scraped_data"
|
689 |
+
cursor.execute(query)
|
690 |
+
results = cursor.fetchall()
|
691 |
+
|
692 |
+
if not results:
|
693 |
+
return "No data available to sort."
|
694 |
+
|
695 |
+
df = pd.DataFrame(results)
|
696 |
+
if column not in df.columns:
|
697 |
+
return f"Column '{column}' does not exist in the data."
|
698 |
+
|
699 |
+
ascending = True if order.lower() == "ascending" else False
|
700 |
+
sorted_df = df.sort_values(by=column, ascending=ascending)
|
701 |
+
|
702 |
+
# Save the sorted data to a new CSV
|
703 |
+
sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{int(time.time())}.csv")
|
704 |
+
sorted_df.to_csv(sorted_csv, index=False)
|
705 |
+
logging.info(f"Data sorted on column '{column}' in {order} order.")
|
706 |
+
return f"Data sorted and saved to {sorted_csv}."
|
707 |
+
except mysql.connector.Error as err:
|
708 |
+
logging.error(f"Error fetching data from database: {err}")
|
709 |
+
# Fallback to CSV
|
710 |
+
else:
|
711 |
+
logging.info("No database connection. Sorting data from CSV.")
|
712 |
+
|
713 |
+
# Fallback to CSV
|
714 |
+
csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
|
715 |
+
if not csv_files:
|
716 |
return "No CSV files found to sort."
|
717 |
|
718 |
+
# Assume the latest CSV is the target
|
719 |
+
latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
|
720 |
df = pd.read_csv(latest_csv)
|
721 |
+
|
722 |
+
if column not in df.columns:
|
723 |
+
return f"Column '{column}' does not exist in the data."
|
724 |
+
|
725 |
ascending = True if order.lower() == "ascending" else False
|
726 |
sorted_df = df.sort_values(by=column, ascending=ascending)
|
727 |
|
728 |
# Save the sorted data to a new CSV
|
729 |
+
sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{int(time.time())}.csv")
|
730 |
sorted_df.to_csv(sorted_csv, index=False)
|
731 |
logging.info(f"Data sorted on column '{column}' in {order} order.")
|
732 |
return f"Data sorted and saved to {sorted_csv}."
|
|
|
739 |
Exports the latest scraped data to a specified CSV filename.
|
740 |
"""
|
741 |
try:
|
742 |
+
storage_location = DEFAULT_FILE_PATH
|
743 |
+
|
744 |
+
connection = get_db_connection()
|
745 |
+
if connection:
|
746 |
+
try:
|
747 |
+
cursor = connection.cursor(dictionary=True)
|
748 |
+
# Fetch all data
|
749 |
+
query = "SELECT * FROM scraped_data"
|
750 |
+
cursor.execute(query)
|
751 |
+
results = cursor.fetchall()
|
752 |
+
|
753 |
+
if not results:
|
754 |
+
return "No data available to export."
|
755 |
+
|
756 |
+
df = pd.DataFrame(results)
|
757 |
+
export_path = os.path.join(storage_location, filename)
|
758 |
+
df.to_csv(export_path, index=False)
|
759 |
+
logging.info(f"Data exported to {export_path}.")
|
760 |
+
return f"Data exported to {export_path}."
|
761 |
+
except mysql.connector.Error as err:
|
762 |
+
logging.error(f"Error exporting data from database: {err}")
|
763 |
+
# Fallback to CSV
|
764 |
+
else:
|
765 |
+
logging.info("No database connection. Exporting data from CSV.")
|
766 |
+
|
767 |
+
# Fallback to CSV
|
768 |
+
csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
|
769 |
+
if not csv_files:
|
770 |
return "No CSV files found to export."
|
771 |
|
772 |
+
# Assume the latest CSV is the target
|
773 |
+
latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
|
774 |
df = pd.read_csv(latest_csv)
|
775 |
+
export_path = os.path.join(storage_location, filename)
|
776 |
df.to_csv(export_path, index=False)
|
777 |
logging.info(f"Data exported to {export_path}.")
|
778 |
return f"Data exported to {export_path}."
|
|
|
782 |
|
783 |
def log_action(action: str) -> str:
    """
    Logs a custom action message to the MySQL database or CSV.

    The database is tried first. If no connection is available, or the insert
    raises a MySQL error, the action is appended to "action_logs.csv" under
    DEFAULT_FILE_PATH together with the current timestamp.

    Args:
        action: Free-form description of the action to record.

    Returns:
        A confirmation message on success, or an error message on failure.
    """
    try:
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor()
                insert_query = """
                    INSERT INTO action_logs (action)
                    VALUES (%s)
                """
                cursor.execute(insert_query, (action,))
                connection.commit()
                logging.info(f"Action logged in database: {action}")
                return f"Action logged: {action}"
            except mysql.connector.Error as err:
                logging.error(f"Error logging action to database: {err}")
                # Fall through to the CSV fallback below.
            finally:
                # Release the cursor and connection even when the insert
                # or commit fails.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Logging action to CSV.")

        # Fallback to CSV
        storage_location = DEFAULT_FILE_PATH
        try:
            os.makedirs(storage_location, exist_ok=True)
            csv_file_path = os.path.join(storage_location, "action_logs.csv")
            file_exists = os.path.isfile(csv_file_path)

            with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
                fieldnames = ["timestamp", "action"]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                if not file_exists:
                    writer.writeheader()
                writer.writerow(
                    {
                        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "action": action,
                    }
                )
            logging.info(f"Action logged to CSV: {action}")
            return f"Action logged: {action}"
        except Exception as e:
            logging.error(f"Error logging action to CSV: {e}")
            return f"Error logging action: {e}"
    except Exception as e:
        logging.error(f"Error logging action: {e}")
        return f"Error logging action: {e}"
|
834 |
|
835 |
+
# Function to get the latest CSV file based on modification time
|
836 |
def get_latest_csv() -> str:
|
837 |
"""
|
838 |
Retrieves the latest CSV file from the storage directory based on modification time.
|
839 |
"""
|
840 |
try:
|
841 |
+
storage_location = DEFAULT_FILE_PATH
|
842 |
+
csv_files = [f for f in os.listdir(storage_location) if f.endswith(".csv")]
|
843 |
+
if not csv_files:
|
844 |
return None
|
845 |
|
846 |
+
latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
847 |
return latest_csv
|
848 |
except Exception as e:
|
849 |
logging.error(f"Error retrieving latest CSV: {e}")
|
|
|
894 |
logging.error(f"Error generating response: {e}")
|
895 |
return "Error generating response."
|
896 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
897 |
# Define the Gradio interface
|
898 |
def create_interface() -> gr.Blocks:
|
899 |
"""
|