acecalisto3 committed on
Commit
ac4d529
·
1 Parent(s): 7321c1c
Files changed (1)
  1. app.py +1239 -3
app.py CHANGED
@@ -1,4 +1,41 @@
1
- # ... (your existing imports and code before model loading) ...
2
 
3
  # Dictionary to store model loading functions
4
  model_loaders = {
@@ -12,8 +49,1146 @@ model_option = st.selectbox("Select a Model", list(model_loaders.keys()))
12
  # Load the selected model
13
  model = model_loaders[model_option]()
14
 
15
- # ... (rest of your existing code) ...
16
17
 
18
  def load_model(model_name: str):
19
  """
@@ -40,4 +1215,65 @@ def load_model(model_name: str):
40
  logging.error(f"Error loading {model_name} model: {e}")
41
  return None
42
 
43
- # ... (rest of your existing code) ...
1
+ import os
2
+ import time
3
+ import hashlib
4
+ import logging
5
+ import streamlit as st
6
+ import datetime
7
+ import csv
8
+ import threading
9
+ import re
10
+ import unittest
11
+ from urllib.parse import urlparse
12
+ import spaces
13
+
14
+ import pandas as pd
15
+ from selenium import webdriver
16
+ from selenium.webdriver.chrome.service import Service
17
+ from selenium.webdriver.chrome.options import Options
18
+ from selenium.webdriver.common.by import By
19
+ from selenium.webdriver.support.ui import WebDriverWait
20
+ from selenium.webdriver.support import expected_conditions as EC
21
+ from selenium.common.exceptions import (
+ TimeoutException,
+ NoSuchElementException,
+ StaleElementReferenceException,
+ )
27
+ from webdriver_manager.chrome import ChromeDriverManager
28
+
29
+ from transformers import AutoTokenizer, AutoModelForCausalLM, OpenLlamaForCausalLM, pipeline
+ from huggingface_hub import login
+ from dotenv import load_dotenv
30
+ import gradio as gr
31
+ import xml.etree.ElementTree as ET
32
+ import torch
33
+ import mysql.connector
34
+ from mysql.connector import errorcode, pooling
35
+ import nltk
36
+ import importlib
37
+
38
+ st.title("CEEMEESEEK with Model Selection")
39
 
40
  # Dictionary to store model loading functions
41
  model_loaders = {
 
49
  # Load the selected model
50
  model = model_loaders[model_option]()
51
 
52
+ # Load environment variables from .env file before reading them
+ load_dotenv()
+
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+ if not HUGGINGFACE_TOKEN:
+ raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
+
+ login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)
62
+
63
+ # Configure logging
64
+ logging.basicConfig(
65
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
66
+ )
67
+
68
+ # Define constants
69
+ DEFAULT_FILE_PATH = "scraped_data"
70
+ PURPOSE = (
71
+ "You go to Culvers sites, you continuously seek changes on them since your last observation. "
72
+ "Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
73
+ )
74
+
75
+ # Global variables for task management
76
+ HISTORY = []
77
+ CURRENT_TASK = None
78
+ STOP_THREADS = False # Flag to stop scraping threads
79
+
80
+ # Database Pooling Configuration
81
+ DB_POOL_NAME = "mypool"
82
+ DB_POOL_SIZE = 5 # Adjust based on expected load
83
+
84
+ try:
85
+ dbconfig = {
86
+ "host": os.getenv("DB_HOST"),
87
+ "user": os.getenv("DB_USER"),
88
+ "password": os.getenv("DB_PASSWORD"),
89
+ "database": os.getenv("DB_NAME"),
90
+ }
91
+ connection_pool = mysql.connector.pooling.MySQLConnectionPool(
92
+ pool_name=DB_POOL_NAME,
93
+ pool_size=DB_POOL_SIZE,
94
+ pool_reset_session=True,
95
+ **dbconfig
96
+ )
97
+ logging.info("Database connection pool created successfully.")
98
+ except mysql.connector.Error as err:
99
+ logging.warning(f"Database connection pool creation failed: {err}")
100
+ connection_pool = None # Will use CSV as fallback
101
+
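+ # Configuration read from the environment (via os.getenv, loaded from .env by
+ # python-dotenv): HUGGINGFACE_TOKEN plus DB_HOST, DB_USER, DB_PASSWORD and DB_NAME.
+ # Example .env values (illustrative only): DB_HOST=localhost, DB_NAME=scraper.
+ # If the MySQL settings are missing or the pool cannot be created, connection_pool
+ # stays None and all change logging falls back to CSV files (see log_to_csv below).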
102
+ # Function to get a database connection from the pool
103
+ def get_db_connection():
104
+ """
105
+ Retrieves a connection from the pool. Returns None if pool is not available.
106
+ """
107
+ if connection_pool:
108
+ try:
109
+ connection = connection_pool.get_connection()
110
+ if connection.is_connected():
111
+ return connection
112
+ except mysql.connector.Error as err:
113
+ logging.error(f"Error getting connection from pool: {err}")
114
+ return None
115
+
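+ # Note: callers are expected to close the connection they receive (see the
+ # try/finally blocks below); a None return means "fall back to CSV storage".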
116
+ # Initialize Database: Create tables and indexes
117
+ def initialize_database():
118
+ """
119
+ Initializes the database by creating necessary tables and indexes if they do not exist.
120
+ """
121
+ connection = get_db_connection()
122
+ if connection is None:
123
+ logging.info("Database initialization skipped. Using CSV storage.")
124
+ return
125
+
126
+ cursor = connection.cursor()
127
+ try:
128
+ # Create table for scraped data
129
+ create_scraped_data_table = """
130
+ CREATE TABLE IF NOT EXISTS scraped_data (
131
+ id INT AUTO_INCREMENT PRIMARY KEY,
132
+ url VARCHAR(255) NOT NULL,
133
+ content_hash VARCHAR(64) NOT NULL,
134
+ change_detected DATETIME NOT NULL
135
+ )
136
+ """
137
+ cursor.execute(create_scraped_data_table)
138
+ logging.info("Table 'scraped_data' is ready.")
139
+
140
+ # Create indexes for performance
141
+ create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
142
+ create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
143
+ cursor.execute(create_index_url)
144
+ cursor.execute(create_index_change)
145
+ logging.info("Indexes on 'url' and 'change_detected' columns created.")
146
+
147
+ # Create table for action logs
148
+ create_action_logs_table = """
149
+ CREATE TABLE IF NOT EXISTS action_logs (
150
+ id INT AUTO_INCREMENT PRIMARY KEY,
151
+ action VARCHAR(255) NOT NULL,
152
+ timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
153
+ )
154
+ """
155
+ cursor.execute(create_action_logs_table)
156
+ logging.info("Table 'action_logs' is ready.")
157
+
158
+ except mysql.connector.Error as err:
159
+ logging.error(f"Error initializing database: {err}")
160
+ finally:
161
+ cursor.close()
162
+ connection.close()
163
+ logging.info("Database initialization complete.")
164
+
165
+ # Function to create WebDriver
166
+ def create_driver(options: Options) -> webdriver.Chrome:
167
+ """
168
+ Initializes and returns a Selenium Chrome WebDriver instance.
169
+ """
170
+ try:
171
+ driver = webdriver.Chrome(
172
+ service=Service(ChromeDriverManager().install()), options=options
173
+ )
174
+ logging.info("ChromeDriver initialized successfully.")
175
+ return driver
176
+ except Exception as exception:
177
+ logging.error(f"Error initializing ChromeDriver: {exception}")
178
+ return None
179
+
180
+ # Function to log changes to CSV
181
+ def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
182
+ """
183
+ Logs the change to a CSV file in the storage_location.
184
+ """
185
+ try:
186
+ os.makedirs(storage_location, exist_ok=True)
187
+ csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
188
+ file_exists = os.path.isfile(csv_file_path)
189
+
190
+ with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
191
+ fieldnames = ["date", "time", "url", "content_hash", "change"]
192
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
193
+ if not file_exists:
194
+ writer.writeheader()
195
+ writer.writerow(
196
+ {
197
+ "date": change_detected.split()[0],
198
+ "time": change_detected.split()[1],
199
+ "url": url,
200
+ "content_hash": content_hash,
201
+ "change": "Content changed",
202
+ }
203
+ )
204
+ logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
205
+ except Exception as e:
206
+ logging.error(f"Error logging data to CSV: {e}")
207
+
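+ # Illustrative example of a row written by log_to_csv (values are made up):
+ #   date,time,url,content_hash,change
+ #   2024-01-01,12:00:00,https://www.culvers.com/,9e107d9d372bb682...,Content changed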
208
+ # Function to get initial observation
209
+ def get_initial_observation(
210
+ driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
211
+ ) -> str:
212
+ """
213
+ Retrieves the initial content from the URL and returns its MD5 hash.
214
+ """
215
+ try:
216
+ driver.get(url)
217
+ WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
218
+ time.sleep(2) # Additional wait for dynamic content
219
+
220
+ if content_type == "text":
221
+ initial_content = driver.page_source
222
+ elif content_type == "media":
223
+ if selector:
224
+ try:
225
+ elements = WebDriverWait(driver, 5).until(
226
+ EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
227
+ )
228
+ initial_content = [element.get_attribute("src") for element in elements]
229
+ except TimeoutException:
230
+ logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
231
+ initial_content = []
232
+ else:
233
+ elements = driver.find_elements(By.TAG_NAME, "img")
234
+ initial_content = [element.get_attribute("src") for element in elements]
235
+ else:
236
+ initial_content = driver.page_source
237
+
238
+ initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
239
+ logging.info(f"Initial hash for {url}: {initial_hash}")
240
+ return initial_hash
241
+ except Exception as exception:
242
+ logging.error(f"Error accessing {url}: {exception}")
243
+ return None
244
+
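+ # The MD5 digest above serves only as a lightweight change-detection fingerprint
+ # of the page content; it is not used for any security purpose.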
245
+ # Function to monitor URLs for changes
246
+ def monitor_urls(
247
+ storage_location: str,
248
+ urls: list,
249
+ scrape_interval: int,
250
+ content_type: str,
251
+ selector: str = None,
252
+ progress: gr.Progress = None
253
+ ):
254
+ """
255
+ Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
256
+ """
257
+ global HISTORY, STOP_THREADS
258
+ previous_hashes = {url: "" for url in urls}
259
+
260
+ options = Options()
261
+ options.add_argument("--headless")
262
+ options.add_argument("--no-sandbox")
263
+ options.add_argument("--disable-dev-shm-usage")
264
+
265
+ driver = create_driver(options)
266
+ if driver is None:
267
+ logging.error("WebDriver could not be initialized. Exiting monitor.")
268
+ return
269
+
270
+ try:
271
+ while not STOP_THREADS:
272
+ for url in urls:
273
+ if STOP_THREADS:
274
+ break
275
+ try:
276
+ driver.get(url)
277
+ WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
278
+ time.sleep(2) # Additional wait for dynamic content
279
+
280
+ if content_type == "text":
281
+ current_content = driver.page_source
282
+ elif content_type == "media":
283
+ if selector:
284
+ try:
285
+ elements = WebDriverWait(driver, 5).until(
286
+ EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
287
+ )
288
+ current_content = [element.get_attribute("src") for element in elements]
289
+ except TimeoutException:
290
+ logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
291
+ current_content = []
292
+ else:
293
+ elements = driver.find_elements(By.TAG_NAME, "img")
294
+ current_content = [element.get_attribute("src") for element in elements]
295
+ else:
296
+ current_content = driver.page_source
297
+
298
+ current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
299
+ if current_hash != previous_hashes[url]:
300
+ previous_hashes[url] = current_hash
301
+ date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
302
+ HISTORY.append(f"Change detected at {url} on {date_time_str}")
303
+
304
+ # Attempt to log to database
305
+ connection = get_db_connection()
306
+ if connection:
307
+ try:
308
+ cursor = connection.cursor()
309
+ insert_query = """
310
+ INSERT INTO scraped_data (url, content_hash, change_detected)
311
+ VALUES (%s, %s, %s)
312
+ """
313
+ cursor.execute(insert_query, (url, current_hash, date_time_str))
314
+ connection.commit()
315
+ logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
316
+ except mysql.connector.Error as err:
317
+ logging.error(f"Error inserting data into database: {err}")
318
+ # Fallback to CSV
319
+ log_to_csv(storage_location, url, current_hash, date_time_str)
320
+ finally:
321
+ cursor.close()
322
+ connection.close()
323
+ else:
324
+ # Fallback to CSV
325
+ log_to_csv(storage_location, url, current_hash, date_time_str)
326
+
327
+ # Update progress
328
+ if progress:
329
+ progress(1)
330
+ except (
331
+ NoSuchElementException,
332
+ StaleElementReferenceException,
333
+ TimeoutException,
334
+ Exception,
335
+ ) as e:
336
+ logging.error(f"Error accessing {url}: {e}")
337
+ if progress:
338
+ progress(1)
339
+ time.sleep(scrape_interval * 60) # Wait for the next scrape interval
340
+ finally:
341
+ driver.quit()
342
+ logging.info("ChromeDriver session ended.")
343
+
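+ # Caveat: for content_type "text" the hash is taken over the full page_source,
+ # so any dynamic markup (timestamps, session tokens, rotating ads) will be
+ # reported as a change.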
344
+ # Function to start scraping
345
+ def start_scraping(
346
+ storage_location: str,
347
+ urls: str,
348
+ scrape_interval: int,
349
+ content_type: str,
350
+ selector: str = None,
351
+ progress: gr.Progress = None
352
+ ) -> str:
353
+ """
354
+ Starts the scraping process in a separate thread with progress indication.
355
+ """
356
+ global CURRENT_TASK, HISTORY, STOP_THREADS
357
+
358
+ if STOP_THREADS:
359
+ STOP_THREADS = False # Reset the flag if previously stopped
360
+
361
+ url_list = [url.strip() for url in urls.split(",") if url.strip()]
362
+ CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
363
+ HISTORY.append(f"Task started: {CURRENT_TASK}")
364
+ logging.info(f"Task started: {CURRENT_TASK}")
365
+
366
+ # Initialize database tables
367
+ initialize_database()
368
 
369
+ # Log initial observations
370
+ def log_initial_observations():
371
+ options = Options()
372
+ options.add_argument("--headless")
373
+ options.add_argument("--no-sandbox")
374
+ options.add_argument("--disable-dev-shm-usage")
375
+
376
+ driver = create_driver(options)
377
+ if driver is None:
378
+ return
379
+
380
+ for url in url_list:
381
+ if STOP_THREADS:
382
+ break
383
+ try:
384
+ initial_hash = get_initial_observation(driver, url, content_type, selector)
385
+ if initial_hash:
386
+ date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
387
+ HISTORY.append(f"Initial observation at {url}: {initial_hash}")
388
+
389
+ # Attempt to log to database
390
+ connection = get_db_connection()
391
+ if connection:
392
+ try:
393
+ cursor = connection.cursor()
394
+ insert_query = """
395
+ INSERT INTO scraped_data (url, content_hash, change_detected)
396
+ VALUES (%s, %s, %s)
397
+ """
398
+ cursor.execute(insert_query, (url, initial_hash, date_time_str))
399
+ connection.commit()
400
+ logging.info(f"Initial observation logged for {url} in database.")
401
+ except mysql.connector.Error as err:
402
+ logging.error(f"Error inserting initial observation into database: {err}")
403
+ # Fallback to CSV
404
+ log_to_csv(storage_location, url, initial_hash, date_time_str)
405
+ finally:
406
+ cursor.close()
407
+ connection.close()
408
+ else:
409
+ # Fallback to CSV
410
+ log_to_csv(storage_location, url, initial_hash, date_time_str)
411
+ except Exception as e:
412
+ HISTORY.append(f"Error accessing {url}: {e}")
413
+ logging.error(f"Error accessing {url}: {e}")
414
+ driver.quit()
415
+
416
+ # Start logging initial observations
417
+ initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
418
+ initial_thread.start()
419
+
420
+ # Start the monitoring thread with progress
421
+ monitor_thread = threading.Thread(
422
+ target=monitor_urls,
423
+ args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
424
+ daemon=True,
425
+ )
426
+ monitor_thread.start()
427
+ logging.info("Started scraping thread.")
428
+ return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."
429
+
430
+ # Function to stop scraping
431
+ def stop_scraping() -> str:
432
+ """
433
+ Stops all ongoing scraping threads.
434
+ """
435
+ global STOP_THREADS
436
+ STOP_THREADS = True
437
+ HISTORY.append("Scraping stopped by user.")
438
+ logging.info("Scraping stop signal sent.")
439
+ return "Scraping has been stopped."
440
+
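+ # Stopping is cooperative: monitor_urls() checks STOP_THREADS between URLs and
+ # between passes, so the current pass (including the sleep interval) finishes
+ # before the monitoring thread exits.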
441
+ # Function to display CSV content from MySQL or CSV
442
+ def display_csv(storage_location: str, url: str) -> str:
443
+ """
444
+ Fetches and returns the scraped data for a given URL from the MySQL database or CSV.
445
+ """
446
+ try:
447
+ connection = get_db_connection()
448
+ if connection:
449
+ try:
450
+ cursor = connection.cursor(dictionary=True)
451
+ query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
452
+ cursor.execute(query, (url,))
453
+ results = cursor.fetchall()
454
+
455
+ if not results:
456
+ return "No data available for the selected URL."
457
+
458
+ df = pd.DataFrame(results)
459
+ cursor.close()
460
+ connection.close()
461
+ return df.to_string(index=False)
462
+ except mysql.connector.Error as err:
463
+ logging.error(f"Error fetching data from database: {err}")
464
+ # Fallback to CSV
465
+ else:
466
+ logging.info("No database connection. Fetching data from CSV.")
467
+
468
+ # Fallback to CSV
469
+ hostname = urlparse(url).hostname
470
+ csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
471
+ if os.path.exists(csv_path):
472
+ df = pd.read_csv(csv_path)
473
+ return df.to_string(index=False)
474
+ else:
475
+ return "No data available."
476
+
477
+ except Exception as e:
478
+ logging.error(f"Error fetching data for {url}: {e}")
479
+ return f"Error fetching data for {url}: {e}"
480
+
481
+ # Function to generate RSS feed from MySQL or CSV data
482
+ def generate_rss_feed(storage_location: str, url: str) -> str:
483
+ """
484
+ Generates an RSS feed for the latest changes detected on a given URL from the MySQL database or CSV.
485
+ """
486
+ try:
487
+ connection = get_db_connection()
488
+ rss_feed = ""
489
+
490
+ if connection:
491
+ try:
492
+ cursor = connection.cursor(dictionary=True)
493
+ query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
494
+ cursor.execute(query, (url,))
495
+ results = cursor.fetchall()
496
+
497
+ if not results:
498
+ return "No changes detected to include in RSS feed."
499
+
500
+ # Create the root RSS element
501
+ rss = ET.Element("rss", version="2.0")
502
+ channel = ET.SubElement(rss, "channel")
503
+
504
+ # Add channel elements
505
+ title = ET.SubElement(channel, "title")
506
+ title.text = f"RSS Feed for {urlparse(url).hostname}"
507
+
508
+ link = ET.SubElement(channel, "link")
509
+ link.text = url
510
+
511
+ description = ET.SubElement(channel, "description")
512
+ description.text = "Recent changes detected on the website."
513
+
514
+ # Add items to the feed
515
+ for row in results:
516
+ item = ET.SubElement(channel, "item")
517
+
518
+ item_title = ET.SubElement(item, "title")
519
+ item_title.text = f"Change detected at {row['url']}"
520
+
521
+ item_link = ET.SubElement(item, "link")
522
+ item_link.text = row["url"]
523
+
524
+ item_description = ET.SubElement(item, "description")
525
+ item_description.text = f"Content changed on {row['change_detected']}"
526
+
527
+ pub_date = ET.SubElement(item, "pubDate")
528
+ pub_date.text = datetime.datetime.strptime(
529
+ str(row['change_detected']), "%Y-%m-%d %H:%M:%S"
530
+ ).strftime("%a, %d %b %Y %H:%M:%S +0000")
531
+
532
+ # Generate the XML string
533
+ rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
534
+ cursor.close()
535
+ connection.close()
536
+ return rss_feed
537
+ except mysql.connector.Error as err:
538
+ logging.error(f"Error fetching data from database: {err}")
539
+ # Fallback to CSV
540
+ else:
541
+ logging.info("No database connection. Generating RSS feed from CSV.")
542
+
543
+ # Fallback to CSV
544
+ hostname = urlparse(url).hostname
545
+ csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
546
+ if os.path.exists(csv_path):
547
+ df = pd.read_csv(csv_path).tail(10)
548
+ if df.empty:
549
+ return "No changes detected to include in RSS feed."
550
+
551
+ # Create the root RSS element
552
+ rss = ET.Element("rss", version="2.0")
553
+ channel = ET.SubElement(rss, "channel")
554
+
555
+ # Add channel elements
556
+ title = ET.SubElement(channel, "title")
557
+ title.text = f"RSS Feed for {hostname}"
558
+
559
+ link = ET.SubElement(channel, "link")
560
+ link.text = url
561
+
562
+ description = ET.SubElement(channel, "description")
563
+ description.text = "Recent changes detected on the website."
564
+
565
+ # Add items to the feed
566
+ for _, row in df.iterrows():
567
+ item = ET.SubElement(channel, "item")
568
+
569
+ item_title = ET.SubElement(item, "title")
570
+ item_title.text = f"Change detected at {row['url']}"
571
+
572
+ item_link = ET.SubElement(item, "link")
573
+ item_link.text = row["url"]
574
+
575
+ item_description = ET.SubElement(item, "description")
576
+ item_description.text = f"Content changed on {row['date']} at {row['time']}"
577
+
578
+ pub_date = ET.SubElement(item, "pubDate")
579
+ pub_date.text = datetime.datetime.strptime(
580
+ f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S"
581
+ ).strftime("%a, %d %b %Y %H:%M:%S +0000")
582
+
583
+ # Generate the XML string
584
+ rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
585
+ return rss_feed
586
+ else:
587
+ return "No data available."
588
+
589
+ except Exception as e:
590
+ logging.error(f"Error generating RSS feed for {url}: {e}")
591
+ return f"Error generating RSS feed for {url}: {e}"
592
+
593
+ # Function to parse user commands with regular expressions
+ def parse_command(message: str) -> tuple:
+ """
+ Parses the user message with regular expressions to identify if it contains a command.
+ Returns the command and its parameters if found, otherwise (None, {}).
+ """
600
+ command = None
601
+ params = {}
602
+
603
+ # Define command patterns
604
+ if "filter" in message.lower():
605
+ # Example: "Filter apples, oranges in column Description"
606
+ match = re.search(r"filter\s+([\w\s,]+)\s+in\s+column\s+(\w+)", message, re.IGNORECASE)
607
+ if match:
608
+ words = [word.strip() for word in match.group(1).split(",")]
609
+ column = match.group(2)
610
+ command = "filter"
611
+ params = {"words": words, "column": column}
612
+
613
+ elif "sort" in message.lower():
614
+ # Example: "Sort Price ascending"
615
+ match = re.search(r"sort\s+(\w+)\s+(ascending|descending)", message, re.IGNORECASE)
616
+ if match:
617
+ column = match.group(1)
618
+ order = match.group(2)
619
+ command = "sort"
620
+ params = {"column": column, "order": order}
621
+
622
+ elif "export to csv as" in message.lower():
623
+ # Example: "Export to CSV as filtered_data.csv"
624
+ match = re.search(r"export\s+to\s+csv\s+as\s+([\w\-]+\.csv)", message, re.IGNORECASE)
625
+ if match:
626
+ filename = match.group(1)
627
+ command = "export"
628
+ params = {"filename": filename}
629
+
630
+ elif "log action" in message.lower():
631
+ # Example: "Log action Filtered data for specific fruits"
632
+ match = re.search(r"log\s+action\s+(.+)", message, re.IGNORECASE)
633
+ if match:
634
+ action = match.group(1)
635
+ command = "log"
636
+ params = {"action": action}
637
+
638
+ return command, params
639
+
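+ # Examples of messages parse_command() recognizes (matching the patterns above):
+ #   "Filter apples, oranges in column Description" -> ("filter", {"words": [...], "column": "Description"})
+ #   "Sort Price ascending"                          -> ("sort", {"column": "Price", "order": "ascending"})
+ #   "Export to CSV as filtered_data.csv"            -> ("export", {"filename": "filtered_data.csv"})
+ #   "Log action Filtered data for specific fruits"  -> ("log", {"action": "Filtered data for specific fruits"})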
640
+ # Function to execute parsed commands
641
+ def execute_command(command: str, params: dict) -> str:
642
+ """
643
+ Executes the corresponding function based on the command and parameters.
644
+ """
645
+ if command == "filter":
646
+ words = params["words"]
647
+ column = params["column"]
648
+ return filter_data(column, words)
649
+ elif command == "sort":
650
+ column = params["column"]
651
+ order = params["order"]
652
+ return sort_data(column, order)
653
+ elif command == "export":
654
+ filename = params["filename"]
655
+ return export_csv(filename)
656
+ elif command == "log":
657
+ action = params["action"]
658
+ return log_action(action)
659
+ else:
660
+ return "Unknown command."
661
+
662
+ # Data Manipulation Functions
663
+ def filter_data(column: str, words: list) -> str:
664
+ """
665
+ Filters the scraped data to include only rows where the specified column contains the given words.
666
+ Saves the filtered data to a new CSV file.
667
+ """
668
+ try:
669
+ storage_location = DEFAULT_FILE_PATH
670
+
671
+ connection = get_db_connection()
672
+ if connection:
673
+ try:
674
+ cursor = connection.cursor(dictionary=True)
675
+ # Fetch all data
676
+ query = "SELECT * FROM scraped_data"
677
+ cursor.execute(query)
678
+ results = cursor.fetchall()
679
+
680
+ if not results:
681
+ return "No data available to filter."
682
+
683
+ df = pd.DataFrame(results)
684
+ # Create a regex pattern to match any of the words
685
+ pattern = '|'.join(words)
686
+ if column not in df.columns:
687
+ return f"Column '{column}' does not exist in the data."
688
+
689
+ filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
690
+
691
+ if filtered_df.empty:
692
+ return f"No records found with words {words} in column '{column}'."
693
+
694
+ # Save the filtered data to a new CSV
695
+ timestamp = int(time.time())
696
+ filtered_csv = os.path.join(storage_location, f"filtered_data_{timestamp}.csv")
697
+ filtered_df.to_csv(filtered_csv, index=False)
698
+ logging.info(f"Data filtered on column '{column}' for words {words}.")
699
+ return f"Data filtered and saved to {filtered_csv}."
700
+ except mysql.connector.Error as err:
701
+ logging.error(f"Error fetching data from database: {err}")
702
+ # Fallback to CSV
703
+ else:
704
+ logging.info("No database connection. Filtering data from CSV.")
705
+
706
+ # Fallback to CSV
707
+ csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
708
+ if not csv_files:
709
+ return "No CSV files found to filter."
710
+
711
+ # Assume the latest CSV is the target
712
+ latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
713
+ df = pd.read_csv(latest_csv)
714
+
715
+ if column not in df.columns:
716
+ return f"Column '{column}' does not exist in the data."
717
+
718
+ filtered_df = df[df[column].astype(str).str.contains('|'.join(words), case=False, na=False)]
719
+
720
+ if filtered_df.empty:
721
+ return f"No records found with words {words} in column '{column}'."
722
+
723
+ # Save the filtered data to a new CSV
724
+ timestamp = int(time.time())
725
+ filtered_csv = latest_csv.replace(".csv", f"_filtered_{timestamp}.csv")
726
+ filtered_df.to_csv(filtered_csv, index=False)
727
+ logging.info(f"Data filtered on column '{column}' for words {words}.")
728
+ return f"Data filtered and saved to {filtered_csv}."
729
+ except Exception as e:
730
+ logging.error(f"Error filtering data: {e}")
731
+ return f"Error filtering data: {e}"
732
+
733
+ def sort_data(column: str, order: str) -> str:
734
+ """
735
+ Sorts the scraped data based on the specified column and order.
736
+ Saves the sorted data to a new CSV file.
737
+ """
738
+ try:
739
+ storage_location = DEFAULT_FILE_PATH
740
+
741
+ connection = get_db_connection()
742
+ if connection:
743
+ try:
744
+ cursor = connection.cursor(dictionary=True)
745
+ # Fetch all data
746
+ query = "SELECT * FROM scraped_data"
747
+ cursor.execute(query)
748
+ results = cursor.fetchall()
749
+
750
+ if not results:
751
+ return "No data available to sort."
752
+
753
+ df = pd.DataFrame(results)
754
+ if column not in df.columns:
755
+ return f"Column '{column}' does not exist in the data."
756
+
757
+ ascending = True if order.lower() == "ascending" else False
758
+ sorted_df = df.sort_values(by=column, ascending=ascending)
759
+
760
+ # Save the sorted data to a new CSV
761
+ timestamp = int(time.time())
762
+ sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{timestamp}.csv")
763
+ sorted_df.to_csv(sorted_csv, index=False)
764
+ logging.info(f"Data sorted on column '{column}' in {order} order.")
765
+ return f"Data sorted and saved to {sorted_csv}."
766
+ except mysql.connector.Error as err:
767
+ logging.error(f"Error fetching data from database: {err}")
768
+ # Fallback to CSV
769
+ else:
770
+ logging.info("No database connection. Sorting data from CSV.")
771
+
772
+ # Fallback to CSV
773
+ csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
774
+ if not csv_files:
775
+ return "No CSV files found to sort."
776
+
777
+ # Assume the latest CSV is the target
778
+ latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
779
+ df = pd.read_csv(latest_csv)
780
+
781
+ if column not in df.columns:
782
+ return f"Column '{column}' does not exist in the data."
783
+
784
+ ascending = True if order.lower() == "ascending" else False
785
+ sorted_df = df.sort_values(by=column, ascending=ascending)
786
+
787
+ # Save the sorted data to a new CSV
788
+ timestamp = int(time.time())
789
+ sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{timestamp}.csv")
790
+ sorted_df.to_csv(sorted_csv, index=False)
791
+ logging.info(f"Data sorted on column '{column}' in {order} order.")
792
+ return f"Data sorted and saved to {sorted_csv}."
793
+ except Exception as e:
794
+ logging.error(f"Error sorting data: {e}")
795
+ return f"Error sorting data: {e}"
796
+
797
+ def export_csv(filename: str) -> str:
798
+ """
799
+ Exports the latest scraped data to a specified CSV filename.
800
+ """
801
+ try:
802
+ storage_location = DEFAULT_FILE_PATH
803
+
804
+ connection = get_db_connection()
805
+ if connection:
806
+ try:
807
+ cursor = connection.cursor(dictionary=True)
808
+ # Fetch all data
809
+ query = "SELECT * FROM scraped_data"
810
+ cursor.execute(query)
811
+ results = cursor.fetchall()
812
+
813
+ if not results:
814
+ return "No data available to export."
815
+
816
+ df = pd.DataFrame(results)
817
+ export_path = os.path.join(storage_location, filename)
818
+ df.to_csv(export_path, index=False)
819
+ logging.info(f"Data exported to {export_path}.")
820
+ return f"Data exported to {export_path}."
821
+ except mysql.connector.Error as err:
822
+ logging.error(f"Error exporting data from database: {err}")
823
+ # Fallback to CSV
824
+ else:
825
+ logging.info("No database connection. Exporting data from CSV.")
826
+
827
+ # Fallback to CSV
828
+ csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
829
+ if not csv_files:
830
+ return "No CSV files found to export."
831
+
832
+ # Assume the latest CSV is the target
833
+ latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
834
+ df = pd.read_csv(latest_csv)
835
+ export_path = os.path.join(storage_location, filename)
836
+ df.to_csv(export_path, index=False)
837
+ logging.info(f"Data exported to {export_path}.")
838
+ return f"Data exported to {export_path}."
839
+ except Exception as e:
840
+ logging.error(f"Error exporting CSV: {e}")
841
+ return f"Error exporting CSV: {e}"
842
+
843
+ def log_action(action: str) -> str:
844
+ """
845
+ Logs a custom action message to the MySQL database or CSV.
846
+ """
847
+ try:
848
+ connection = get_db_connection()
849
+ if connection:
850
+ try:
851
+ cursor = connection.cursor()
852
+ insert_query = """
853
+ INSERT INTO action_logs (action)
854
+ VALUES (%s)
855
+ """
856
+ cursor.execute(insert_query, (action,))
857
+ connection.commit()
858
+ logging.info(f"Action logged in database: {action}")
859
+ cursor.close()
860
+ connection.close()
861
+ return f"Action logged: {action}"
862
+ except mysql.connector.Error as err:
863
+ logging.error(f"Error logging action to database: {err}")
864
+ # Fallback to CSV
865
+ else:
866
+ logging.info("No database connection. Logging action to CSV.")
867
+
868
+ # Fallback to CSV
869
+ storage_location = DEFAULT_FILE_PATH
870
+ try:
871
+ os.makedirs(storage_location, exist_ok=True)
872
+ csv_file_path = os.path.join(storage_location, "action_logs.csv")
873
+ file_exists = os.path.isfile(csv_file_path)
874
+
875
+ with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
876
+ fieldnames = ["timestamp", "action"]
877
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
878
+ if not file_exists:
879
+ writer.writeheader()
880
+ writer.writerow(
881
+ {
882
+ "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
883
+ "action": action,
884
+ }
885
+ )
886
+ logging.info(f"Action logged to CSV: {action}")
887
+ return f"Action logged: {action}"
888
+ except Exception as e:
889
+ logging.error(f"Error logging action to CSV: {e}")
890
+ return f"Error logging action: {e}"
891
+ except Exception as e:
892
+ logging.error(f"Error logging action: {e}")
893
+ return f"Error logging action: {e}"
894
+
895
+ # Function to get the latest CSV file based on modification time
896
+ def get_latest_csv() -> str:
897
+ """
898
+ Retrieves the latest CSV file from the storage directory based on modification time.
899
+ """
900
+ try:
901
+ storage_location = "/home/users/app/scraped_data"
902
+ csv_files = [f for f in os.listdir(storage_location) if f.endswith(".csv")]
903
+ if not csv_files:
904
+ return None
905
+
906
+ latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
907
+ return latest_csv
908
+ except Exception as e:
909
+ logging.error(f"Error retrieving latest CSV: {e}")
910
+ return None
911
+
912
+ def respond(
913
+ message: str,
914
+ history: list,
915
+ system_message: str,
916
+ max_tokens: int,
917
+ temperature: float,
918
+ top_p: float,
919
+ ) -> str:
920
+ """
921
+ Generates a response using OpenLlamaForCausalLM.
922
+ """
923
+ try:
924
+ # Check if the message contains a command
925
+ command, params = parse_command(message)
926
+ if command:
927
+ # Execute the corresponding function
928
+ response = execute_command(command, params)
929
+ else:
930
+ # Generate a regular response using OpenLlama
931
+ prompt = (
932
+ f"System: {system_message}\n"
933
+ f"History: {history}\n"
934
+ f"User: {message}\n"
935
+ f"Assistant:"
936
+ )
937
+ response = openllama_pipeline(
938
+ prompt,
939
+ max_length=max_tokens,
940
+ temperature=temperature,
941
+ top_p=top_p,
942
+ )[0]["generated_text"]
943
+
944
+
945
+ # Extract the assistant's reply
946
+ response = response.split("Assistant:")[-1].strip()
947
+ return response
948
+ except Exception as e:
949
+ logging.error(f"Error generating response: {e}")
950
+ return "Error generating response."
951
+
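+ # respond() relies on the module-level `openllama_pipeline` set up by
+ # load_openllama_pipeline() further down; load the pipeline once at startup
+ # before using the chat interface.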
952
+ # Define the Gradio interface
953
+ def create_interface() -> gr.Blocks:
954
+ """
955
+ Defines and returns the Gradio interface for the application.
956
+ """
957
+ with gr.Blocks() as demo:
958
+ gr.Markdown("# All-in-One Scraper, Database, and RSS Feeder")
959
+
960
+ with gr.Row():
961
+ with gr.Column():
962
+ # Scraping Controls
963
+ storage_location = gr.Textbox(
964
+ value=DEFAULT_FILE_PATH, label="Storage Location"
965
+ )
966
+ urls = gr.Textbox(
967
+ label="URLs (comma separated)",
968
+ placeholder="https://example.com, https://anotherexample.com",
969
+ )
970
+ scrape_interval = gr.Slider(
971
+ minimum=1,
972
+ maximum=60,
973
+ value=5,
974
+ step=1,
975
+ label="Scrape Interval (minutes)",
976
+ )
977
+ content_type = gr.Radio(
978
+ choices=["text", "media", "both"],
979
+ value="text",
980
+ label="Content Type",
981
+ )
982
+ selector = gr.Textbox(
983
+ label="CSS Selector for Media (Optional)",
984
+ placeholder="e.g., img.main-image",
985
+ )
986
+ start_button = gr.Button("Start Scraping")
987
+ stop_button = gr.Button("Stop Scraping")
988
+ status_output = gr.Textbox(
989
+ label="Status Output", interactive=False, lines=2
990
+ )
991
+
992
+ with gr.Column():
993
+ # Chat Interface
994
+ chat_history = gr.Chatbot(label="Chat History")
995
+ with gr.Row():
996
+ message = gr.Textbox(label="Message", placeholder="Type your message here...")
997
+ system_message = gr.Textbox(
998
+ value="You are a helpful assistant.", label="System message"
999
+ )
1000
+ max_tokens = gr.Slider(
1001
+ minimum=1,
1002
+ maximum=2048,
1003
+ value=512,
1004
+ step=1,
1005
+ label="Max new tokens",
1006
+ )
1007
+ temperature = gr.Slider(
1008
+ minimum=0.1,
1009
+ maximum=4.0,
1010
+ value=0.7,
1011
+ step=0.1,
1012
+ label="Temperature",
1013
+ )
1014
+ top_p = gr.Slider(
1015
+ minimum=0.1,
1016
+ maximum=1.0,
1017
+ value=0.95,
1018
+ step=0.05,
1019
+ label="Top-p (nucleus sampling)",
1020
+ )
1021
+ response_box = gr.Textbox(label="Response", interactive=False, lines=2)
1022
+
1023
+ with gr.Row():
1024
+ with gr.Column():
1025
+ # CSV Display Controls
1026
+ selected_url_csv = gr.Textbox(
1027
+ label="Select URL for CSV Content",
1028
+ placeholder="https://example.com",
1029
+ )
1030
+ csv_button = gr.Button("Display CSV Content")
1031
+ csv_content_output = gr.Textbox(
1032
+ label="CSV Content Output", interactive=False, lines=10
1033
+ )
1034
+
1035
+ with gr.Column():
1036
+ # RSS Feed Generation Controls
1037
+ selected_url_rss = gr.Textbox(
1038
+ label="Select URL for RSS Feed",
1039
+ placeholder="https://example.com",
1040
+ )
1041
+ rss_button = gr.Button("Generate RSS Feed")
1042
+ rss_output = gr.Textbox(
1043
+ label="RSS Feed Output", interactive=False, lines=20
1044
+ )
1045
+
1046
+ # Historical Data View
1047
+ with gr.Row():
1048
+ historical_view_url = gr.Textbox(
1049
+ label="Select URL for Historical Data",
1050
+ placeholder="https://example.com",
1051
+ )
1052
+ historical_button = gr.Button("View Historical Data")
1053
+ historical_output = gr.Dataframe(
1054
+ headers=["ID", "URL", "Content Hash", "Change Detected"],
1055
+ label="Historical Data",
1056
+ interactive=False
1057
+ )
1058
+
1059
+
1060
+
1061
+ # Connect buttons to their respective functions
1062
+ start_button.click(
1063
+ fn=start_scraping,
1064
+ inputs=[
1065
+ storage_location,
1066
+ urls,
1067
+ scrape_interval,
1068
+ content_type,
1069
+ selector,
+ ],
1072
+ outputs=status_output,
1073
+ )
1074
+
1075
+ stop_button.click(fn=stop_scraping, outputs=status_output)
1076
+
1077
+ csv_button.click(
1078
+ fn=display_csv,
1079
+ inputs=[storage_location, selected_url_csv],
1080
+ outputs=csv_content_output,
1081
+ )
1082
+
1083
+ rss_button.click(
1084
+ fn=generate_rss_feed,
1085
+ inputs=[storage_location, selected_url_rss],
1086
+ outputs=rss_output,
1087
+ )
1088
+
1089
+ historical_button.click(
1090
+ fn=display_historical_data,
1091
+ inputs=[storage_location, historical_view_url],
1092
+ outputs=historical_output,
1093
+ )
1094
+
1095
+ # Connect message submission to the chat interface
1096
+ def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
1097
+ if not message_input.strip():
1098
+ return history, "Please enter a message."
1099
+
1100
+ response = respond(
1101
+ message_input,
1102
+ history,
1103
+ system_msg,
1104
+ max_toks,
1105
+ temp,
1106
+ top_p_val,
1107
+ )
1108
+ history.append((message_input, response))
1109
+ return history, response
1110
+
1111
+ message.submit(
1112
+ update_chat,
1113
+ inputs=[
1114
+ message,
1115
+ chat_history,
1116
+ system_message,
1117
+ max_tokens,
1118
+ temperature,
1119
+ top_p,
1120
+ ],
1121
+ outputs=[chat_history, response_box],
1122
+ )
1123
+
1124
+ return demo
1125
+
1126
+ # Function to display historical data
1127
+ def display_historical_data(storage_location: str, url: str):
1128
+ """
1129
+ Retrieves and displays historical scraping data for a given URL.
1130
+ """
1131
+ try:
1132
+ connection = get_db_connection()
1133
+ if connection:
1134
+ try:
1135
+ cursor = connection.cursor(dictionary=True)
1136
+ query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
1137
+ cursor.execute(query, (url,))
1138
+ results = cursor.fetchall()
1139
+
1140
+ if not results:
1141
+ return pd.DataFrame()
1142
+
1143
+ df = pd.DataFrame(results)
1144
+ cursor.close()
1145
+ connection.close()
1146
+ return df
1147
+ except mysql.connector.Error as err:
1148
+ logging.error(f"Error fetching historical data from database: {err}")
1149
+ # Fallback to CSV
1150
+ else:
1151
+ logging.info("No database connection. Fetching historical data from CSV.")
1152
+
1153
+ # Fallback to CSV
1154
+ hostname = urlparse(url).hostname
1155
+ csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
1156
+ if os.path.exists(csv_path):
1157
+ df = pd.read_csv(csv_path)
1158
+ return df
1159
+ else:
1160
+ return pd.DataFrame()
1161
+ except Exception as e:
1162
+ logging.error(f"Error fetching historical data for {url}: {e}")
1163
+ return pd.DataFrame()
1164
+
1165
+ def load_openllama_pipeline():
+ """
+ Loads the OpenLLaMA model and tokenizer once and returns the text-generation pipeline.
+ Also stores the pipeline in the module-level `openllama_pipeline` used by respond().
+ Named distinctly so it is not shadowed by load_model(model_name) defined below.
+ """
+ global openllama_pipeline
+ try:
+ model_name = "openlm-research/open_llama_3b_v2"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+
+ max_supported_length = 2048
+
+ openllama_pipeline = pipeline(
+ "text-generation",
+ model=model,
+ tokenizer=tokenizer,
+ truncation=True,
+ max_length=max_supported_length,
+ temperature=0.7,
+ top_p=0.95,
+ device=0 if torch.cuda.is_available() else -1,
+ )
+ logging.info("Model loaded successfully.")
+ return openllama_pipeline  # Return the pipeline
+ except Exception as e:
+ logging.error(f"Error loading OpenLLaMA model: {e}")
+ return None
1192
 
1193
  def load_model(model_name: str):
1194
  """
 
1215
  logging.error(f"Error loading {model_name} model: {e}")
1216
  return None
1217
 
1218
+ # Automated Testing using unittest
1219
+ class TestApp(unittest.TestCase):
1220
+ def test_parse_command_filter(self):
1221
+ command = "Filter apples, oranges in column Description"
1222
+ parsed_command = parse_command(command)
1223
+ self.assertEqual(parsed_command[0], "filter")
1224
+ self.assertListEqual(parsed_command[1]["words"], ["apples", "oranges"])
1225
+ self.assertEqual(parsed_command[1]["column"], "Description")
1226
+
1227
+ def test_parse_command_sort(self):
1228
+ command = "Sort Price ascending"
1229
+ parsed_command = parse_command(command)
1230
+ self.assertEqual(parsed_command[0], "sort")
1231
+ self.assertEqual(parsed_command[1]["column"], "Price")
1232
+ self.assertEqual(parsed_command[1]["order"], "ascending")
1233
+
1234
+ def test_parse_command_export(self):
1235
+ command = "Export to CSV as filtered_data.csv"
1236
+ parsed_command = parse_command(command)
1237
+ self.assertEqual(parsed_command[0], "export")
1238
+ self.assertEqual(parsed_command[1]["filename"], "filtered_data.csv")
1239
+
1240
+ def test_parse_command_log(self):
1241
+ command = "Log action Filtered data for specific fruits"
1242
+ parsed_command = parse_command(command)
1243
+ self.assertEqual(parsed_command[0], "log")
1244
+ self.assertEqual(parsed_command[1]["action"], "Filtered data for specific fruits")
1245
+
1246
+ def test_database_connection(self):
1247
+ connection = get_db_connection()
1248
+ # Connection may be None if not configured; adjust the test accordingly
1249
+ if connection:
1250
+ self.assertTrue(connection.is_connected())
1251
+ connection.close()
1252
+ else:
1253
+ self.assertIsNone(connection)
1254
+
1255
+ def main():
1256
+ # Initialize and run the application
1257
+ logging.info("Starting the application...")
1258
+ model = load_openllama_pipeline()
1259
+ if model:
1260
+ logging.info("Application started successfully.")
1261
+ print("Main function executed")
1262
+ print("Creating interface...")
1263
+ demo = create_interface()
1264
+ print("Launching interface...")
1265
+ demo.launch(server_name="0.0.0.0", server_port=7860)
1266
+ else:
1267
+ logging.error("Failed to start the application.")
1268
+
1269
+ # Main execution
1270
+ if __name__ == "__main__":
1271
+ # Initialize database
1272
+ initialize_database()
+
+ # Load the OpenLLaMA pipeline so the chat tab's respond() has a model to use
+ load_openllama_pipeline()
1273
+
1274
+ # Create and launch Gradio interface
1275
+ demo = create_interface()
1276
+ demo.launch()
1277
+
1278
+ # Run automated tests
1279
+ unittest.main(argv=[''], exit=False)
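+ # Note: demo.launch() blocks until the Gradio server is shut down, so the
+ # unittest.main(...) call above only runs after the app exits; main() defined
+ # earlier is not invoked on this path.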