acecalisto3 committed
Commit bcc7de3 · verified · 1 Parent(s): 9c0b2a6

Update app.py

Files changed (1)
  1. app.py +382 -116
app.py CHANGED
@@ -1,457 +1,723 @@
1
- import os
2
- import time
3
- import hashlib
4
- import logging
5
- import streamlit as st
6
- import datetime
7
- import csv
8
- import threading
9
- import re
10
- import unittest
11
- from urllib.parse import urlparse
12
- import pandas as pd
13
- from selenium import webdriver
14
- from selenium.webdriver.chrome.service import Service
15
-
16
- from selenium.webdriver.chrome.options import Options
17
- from selenium.webdriver.common.by import By
18
- from selenium.webdriver.support.ui import WebDriverWait
19
- from selenium.webdriver.support import expected_conditions as EC
20
- from selenium.common.exceptions import (
21
- TimeoutException,
22
-
23
- NoSuchElementException,
24
- StaleElementReferenceException,
25
- )
26
- from webdriver_manager.chrome import ChromeDriverManager
27
-
28
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
29
- import gradio as gr
30
- import xml.etree.ElementTree as ET
31
- import torch
32
- import mysql.connector
33
- from mysql.connector import pooling
34
- import nltk
35
- from huggingface_hub import login
36
- from dotenv import load_dotenv
37
-
38
- # Initialize NLTK resources (you may need to download these)
39
- st.title("CEEMEESEEK with Model Selection")
40
 
41
  # Dictionary to store model loading functions
 
42
  model_loaders = {
 
43
  "Falcon": lambda: load_model("tiiuae/falcon-7b"),
 
44
  "Flan-T5": lambda: load_model("google/flan-t5-xl"),
 
45
  "Flan-T5-Small": lambda: load_model("google/flan-t5-small") # Add a smaller model
46
- }
47
 
48
- model_option = st.selectbox("Select a Model", list(model_loaders.keys()))
49
 
50
- def load_model(model_name: str):
51
- """
52
- Loads the specified model and tokenizer.
53
- """
54
- try:
55
- tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
56
- model = AutoModelForCausalLM.from_pretrained(model_name)
57
- # This should be inside the try block
58
- max_supported_length = 2048 # Get this from the model config
59
- openllama_pipeline = pipeline(
60
- "text-generation",
61
- model=model,
62
- tokenizer=tokenizer,
63
- truncation=True,
64
- max_length=max_supported_length,
65
- temperature=0.7,
66
- top_p=0.95,
67
- device=0 if torch.cuda.is_available() else -1,
68
- )
69
- logging.info(f"{model_name} loaded successfully.")
70
- return openllama_pipeline
71
- except Exception as e:
72
- logging.error(f"Error loading {model_name} model: {e}")
73
- return None
74
 
75
- HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
76
- if not HUGGINGFACE_TOKEN:
77
- raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
78
 
79
- login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)
80
 
81
- if not HUGGINGFACE_TOKEN:
 
82
  raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
83
- add_to_git_credential=True
84
- login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)
85
 
86
 
87
- # Load environment variables from .env file
88
- load_dotenv()
89
 
90
- # Configure logging
 
91
  logging.basicConfig(
 
92
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
93
- )
94
 
95
- # Define constants
 
96
  DEFAULT_FILE_PATH = "scraped_data"
 
97
  PURPOSE = (
 
98
  "You monitor urls. You log what you observe. You seek any changes on them since your last observation. "
 
99
  "Anything new gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
100
- )
101
 
102
- # Global variables for task management
 
103
  HISTORY = []
 
104
  CURRENT_TASK = None
105
- STOP_THREADS = False # Flag to stop scraping threads
106
 
107
- # Database Pooling Configuration
 
108
  DB_POOL_NAME = "mypool"
109
- DB_POOL_SIZE = 5 # Adjust based on expected load
110
 
111
- try:
 
112
  dbconfig = {
 
113
  "host": os.getenv("DB_HOST"),
 
114
  "user": os.getenv("DB_USER"),
 
115
  "password": os.getenv("DB_PASSWORD"),
 
116
  "database": os.getenv("DB_NAME"),
 
117
  }
 
118
  connection_pool = mysql.connector.pooling.MySQLConnectionPool(
 
119
  pool_name=DB_POOL_NAME,
 
120
  pool_size=DB_POOL_SIZE,
 
121
  pool_reset_session=True,
 
122
  **dbconfig
 
123
  )
124
- logging.info("Database connection pool created successfully.")
125
- except mysql.connector.Error as err:
 
126
  logging.warning(f"Database connection pool creation failed: {err}")
127
- connection_pool = None # Will use CSV as fallback
128
 
129
- # Function to get a database connection from the pool
130
- def get_db_connection():
131
  """
 
132
  Retrieves a connection from the pool. Returns None if pool is not available.
 
133
  """
 
134
  if connection_pool:
 
135
  try:
 
136
  connection = connection_pool.get_connection()
 
137
  if connection.is_connected():
 
138
  return connection
 
139
  except mysql.connector.Error as err:
 
140
  logging.error(f"Error getting connection from pool: {err}")
141
- return None
142
 
143
- # Initialize Database: Create tables and indexes
144
- def initialize_database():
145
  """
 
146
  Initializes the database by creating necessary tables and indexes if they do not exist.
 
147
  """
 
148
  connection = get_db_connection()
 
149
  if connection is None:
 
150
  logging.info("Database initialization skipped. Using CSV storage.")
 
151
  return
152
 
 
 
153
  cursor = connection.cursor()
 
154
  try:
 
155
  # Create table for scraped data
 
156
  create_scraped_data_table = """
 
157
  CREATE TABLE IF NOT EXISTS scraped_data (
 
158
  id INT AUTO_INCREMENT PRIMARY KEY,
 
159
  url VARCHAR(255) NOT NULL,
 
160
  content_hash VARCHAR(64) NOT NULL,
 
161
  change_detected DATETIME NOT NULL
 
162
  )
 
163
  """
 
164
  cursor.execute(create_scraped_data_table)
 
165
  logging.info("Table 'scraped_data' is ready.")
166
 
 
 
167
  # Create indexes for performance
 
168
  create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
 
169
  create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
 
170
  cursor.execute(create_index_url)
 
171
  cursor.execute(create_index_change)
 
172
  logging.info("Indexes on 'url' and 'change_detected' columns created.")
173
 
 
 
174
  # Create table for action logs
 
175
  create_action_logs_table = """
 
176
  CREATE TABLE IF NOT EXISTS action_logs (
 
177
  id INT AUTO_INCREMENT PRIMARY KEY,
 
178
  action VARCHAR(255) NOT NULL,
 
179
  timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
 
180
  )
 
181
  """
 
182
  cursor.execute(create_action_logs_table)
 
183
  logging.info("Table 'action_logs' is ready.")
184
 
 
 
185
  except mysql.connector.Error as err:
 
186
  logging.error(f"Error initializing database: {err}")
 
187
  finally:
 
188
  cursor.close()
 
189
  connection.close()
190
- logging.info("Database initialization complete.")
191
 
192
- # Function to create WebDriver
193
- def create_driver(options: Options) -> webdriver.Chrome:
194
  """
 
195
  Initializes and returns a Selenium Chrome WebDriver instance.
 
196
  """
 
197
  try:
 
198
  driver = webdriver.Chrome(
 
199
  service=Service(ChromeDriverManager().install()), options=options
 
200
  )
 
201
  logging.info("ChromeDriver initialized successfully.")
 
202
  return driver
 
203
  except Exception as exception:
 
204
  logging.error(f"Error initializing ChromeDriver: {exception}")
205
- return None
206
 
207
- # Function to log changes to CSV
208
- def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
209
  """
 
210
  Logs the change to a CSV file in the storage_location.
 
211
  """
 
212
  try:
 
213
  os.makedirs(storage_location, exist_ok=True)
 
214
  csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
 
215
  file_exists = os.path.isfile(csv_file_path)
216
 
 
 
217
  with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
 
218
  fieldnames = ["date", "time", "url", "content_hash", "change"]
 
219
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 
220
  if not file_exists:
 
221
  writer.writeheader()
 
222
  writer.writerow(
 
223
  {
 
224
  "date": change_detected.split()[0],
 
225
  "time": change_detected.split()[1],
 
226
  "url": url,
 
227
  "content_hash": content_hash,
 
228
  "change": "Content changed",
 
229
  }
 
230
  )
 
231
  logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
 
232
  except Exception as e:
233
- logging.error(f"Error logging data to CSV: {e}")
234
 
235
- # Function to get initial observation
236
- def get_initial_observation(
237
- driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
238
- ) -> str:
239
  """
 
240
  Retrieves the initial content from the URL and returns its MD5 hash.
 
241
  """
 
242
  try:
 
243
  driver.get(url)
 
244
  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
 
245
  time.sleep(2) # Additional wait for dynamic content
246
 
 
 
247
  if content_type == "text":
 
248
  initial_content = driver.page_source
 
249
  elif content_type == "media":
 
250
  if selector:
 
251
  try:
 
252
  elements = WebDriverWait(driver, 5).until(
 
253
  EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
 
254
  )
 
255
  initial_content = [element.get_attribute("src") for element in elements]
256
- except TimeoutException:
257
- logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
 
258
  initial_content = []
 
259
  else:
 
260
  elements = driver.find_elements(By.TAG_NAME, "img")
 
261
  initial_content = [element.get_attribute("src") for element in elements]
 
262
  else:
 
263
  initial_content = driver.page_source
264
 
 
 
265
  initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
 
266
  logging.info(f"Initial hash for {url}: {initial_hash}")
 
267
  return initial_hash
 
268
  except Exception as exception:
 
269
  logging.error(f"Error accessing {url}: {exception}")
270
- return None
271
 
272
- # Function to monitor URLs for changes
273
- def monitor_urls(
274
  storage_location: str,
 
275
  urls: list,
 
276
  scrape_interval: int,
 
277
  content_type: str,
 
278
  selector: str = None,
279
- progress: gr.Progress = None
280
- ):
 
281
  """
 
282
  Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
 
283
  """
 
284
  global HISTORY, STOP_THREADS
 
285
  previous_hashes = {url: "" for url in urls}
286
 
 
 
287
  options = Options()
 
288
  options.add_argument("--headless")
 
289
  options.add_argument("--no-sandbox")
290
- options.add_argument("--disable-dev-shm-usage")
 
 
 
 
 
 
 
291
 
292
  driver = create_driver(options)
 
293
  if driver is None:
 
294
  logging.error("WebDriver could not be initialized. Exiting monitor.")
 
295
  return
296
 
 
 
297
  try:
 
298
  while not STOP_THREADS:
 
299
  for url in urls:
 
300
  if STOP_THREADS:
 
301
  break
 
302
  try:
 
303
  driver.get(url)
 
304
  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
 
305
  time.sleep(2) # Additional wait for dynamic content
306
 
 
 
307
  if content_type == "text":
 
308
  current_content = driver.page_source
 
309
  elif content_type == "media":
 
310
  if selector:
 
311
  try:
 
312
  elements = WebDriverWait(driver, 5).until(
 
313
  EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
 
314
  )
 
315
  current_content = [element.get_attribute("src") for element in elements]
 
316
  except TimeoutException:
 
317
  logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
 
318
  current_content = []
 
319
  else:
 
320
  elements = driver.find_elements(By.TAG_NAME, "img")
 
321
  current_content = [element.get_attribute("src") for element in elements]
 
322
  else:
 
323
  current_content = driver.page_source
324
 
 
 
325
  current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
 
326
  if current_hash != previous_hashes[url]:
 
327
  previous_hashes[url] = current_hash
 
328
  date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
329
  HISTORY.append(f"Change detected at {url} on {date_time_str}")
330
 
 
 
331
  # Attempt to log to database
 
332
  connection = get_db_connection()
 
333
  if connection:
 
334
  try:
 
335
  cursor = connection.cursor()
 
336
  insert_query = """
 
337
  INSERT INTO scraped_data (url, content_hash, change_detected)
 
338
  VALUES (%s, %s, %s)
 
339
  """
 
340
  cursor.execute(insert_query, (url, current_hash, date_time_str))
 
341
  connection.commit()
 
342
  logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
 
343
  except mysql.connector.Error as err:
 
344
  logging.error(f"Error inserting data into database: {err}")
 
345
  # Fallback to CSV
 
346
  log_to_csv(storage_location, url, current_hash, date_time_str)
 
347
  finally:
 
348
  cursor.close()
 
349
  connection.close()
 
350
  else:
 
351
  # Fallback to CSV
 
352
  log_to_csv(storage_location, url, current_hash, date_time_str)
353
 
 
 
354
  # Update progress
 
355
  if progress:
 
356
  progress(1)
 
357
  except (
 
358
  NoSuchElementException,
 
359
  StaleElementReferenceException,
 
360
  TimeoutException,
 
361
  Exception,
 
362
  ) as e:
 
363
  logging.error(f"Error accessing {url}: {e}")
 
364
  if progress:
 
365
  progress(1)
 
366
  time.sleep(scrape_interval * 60) # Wait for the next scrape interval
 
367
  finally:
 
368
  driver.quit()
369
- logging.info("ChromeDriver session ended.")
370
 
371
- # Function to start scraping
372
- def start_scraping(
373
  storage_location: str,
 
374
  urls: str,
 
375
  scrape_interval: int,
 
376
  content_type: str,
 
377
  selector: str = None,
378
- progress: gr.Progress = None
379
- ) -> str:
 
380
  """
 
381
  Starts the scraping process in a separate thread with progress indication.
 
382
  """
 
383
  global CURRENT_TASK, HISTORY, STOP_THREADS
384
 
 
 
385
  if STOP_THREADS:
 
386
  STOP_THREADS = False # Reset the flag if previously stopped
387
 
 
 
388
  url_list = [url.strip() for url in urls.split(",") if url.strip()]
 
389
  CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
 
390
  HISTORY.append(f"Task started: {CURRENT_TASK}")
 
391
  logging.info(f"Task started: {CURRENT_TASK}")
392
 
 
 
393
  # Initialize database tables
 
394
  initialize_database()
395
 
 
 
396
  # Log initial observations
 
397
  def log_initial_observations():
 
398
  options = Options()
 
399
  options.add_argument("--headless")
 
400
  options.add_argument("--no-sandbox")
401
- options.add_argument("--disable-dev-shm-usage")
 
 
 
 
 
 
 
402
 
403
  driver = create_driver(options)
 
404
  if driver is None:
 
405
  return
406
 
 
 
407
  for url in url_list:
 
408
  if STOP_THREADS:
 
409
  break
 
410
  try:
 
411
  initial_hash = get_initial_observation(driver, url, content_type, selector)
 
412
  if initial_hash:
 
413
  date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
414
  HISTORY.append(f"Initial observation at {url}: {initial_hash}")
415
 
 
 
416
  # Attempt to log to database
 
417
  connection = get_db_connection()
 
418
  if connection:
 
419
  try:
 
420
  cursor = connection.cursor()
 
421
  insert_query = """
 
422
  INSERT INTO scraped_data (url, content_hash, change_detected)
 
423
  VALUES (%s, %s, %s)
 
424
  """
 
425
  cursor.execute(insert_query, (url, initial_hash, date_time_str))
 
426
  connection.commit()
 
427
  logging.info(f"Initial observation logged for {url} in database.")
 
428
  except mysql.connector.Error as err:
 
429
  logging.error(f"Error inserting initial observation into database: {err}")
 
430
  # Fallback to CSV
 
431
  log_to_csv(storage_location, url, initial_hash, date_time_str)
 
432
  finally:
 
433
  cursor.close()
 
434
  connection.close()
 
435
  else:
 
436
  # Fallback to CSV
 
437
  log_to_csv(storage_location, url, initial_hash, date_time_str)
 
438
  except Exception as e:
 
439
  HISTORY.append(f"Error accessing {url}: {e}")
 
440
  logging.error(f"Error accessing {url}: {e}")
 
441
  driver.quit()
442
 
 
 
443
  # Start logging initial observations
 
444
  initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
 
445
  initial_thread.start()
446
 
 
 
447
  # Start the monitoring thread with progress
 
448
  monitor_thread = threading.Thread(
 
449
  target=monitor_urls,
 
450
  args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
 
451
  daemon=True,
 
452
  )
 
453
  monitor_thread.start()
 
454
  logging.info("Started scraping thread.")
 
455
  return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."
456
 
457
  # Function to stop scraping
 
+ import datetime
+ import os
+ import csv
+ import time
+ import hashlib
+ import logging
+ from collections import defaultdict
+ import mysql.connector
+ import threading
+ from urllib.parse import urlparse
+ import gradio as gr
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.chrome.options import Options
+ from webdriver_manager.chrome import ChromeDriverManager
+ from huggingface_hub import InferenceClient, login
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import random
+ import yaml
+ import torch
+ import pandas as pd
+ import xml.etree.ElementTree as ET
+ import re
+ import spacy
+ import unittest
+ from dotenv import load_dotenv
+ import nltk
+
+ # Initialize NLTK resources (you may need to download these)
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+ nltk.download('maxent_ne_chunker')
+ nltk.download('words')
+
+ # Load spaCy model
+ nlp = spacy.load("en_core_web_sm")
28
 
29
  # Dictionary to store model loading functions
30
+
31
  model_loaders = {
32
+
33
  "Falcon": lambda: load_model("tiiuae/falcon-7b"),
34
+
35
  "Flan-T5": lambda: load_model("google/flan-t5-xl"),
36
+
37
  "Flan-T5-Small": lambda: load_model("google/flan-t5-small") # Add a smaller model
 
38
 
39
+ }
+
+ # Load environment variables from .env file
40
 
41
+ load_dotenv()
42
 
 
 
 
43
 
 
44
 
45
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+ if not HUGGINGFACE_TOKEN:
46
+
47
  raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
 
 
48
 
49
 
 
 
50
 
51
+ login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)
+
+ # Configure logging
52
+
53
  logging.basicConfig(
54
+
55
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 
56
 
57
+ )
+
+ # Define constants
58
+
59
  DEFAULT_FILE_PATH = "scraped_data"
60
+
61
  PURPOSE = (
62
+
63
  "You monitor urls. You log what you observe. You seek any changes on them since your last observation. "
64
+
65
  "Anything new gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
 
66
 
67
+ )
+
+ # Global variables for task management
68
+
69
  HISTORY = []
70
+
71
  CURRENT_TASK = None
 
72
 
73
+ STOP_THREADS = False  # Flag to stop scraping threads
+
+ # Database Pooling Configuration
74
+
75
  DB_POOL_NAME = "mypool"
 
76
 
77
+ DB_POOL_SIZE = 5  # Adjust based on expected load
+
+ try:
78
+
79
  dbconfig = {
80
+
81
  "host": os.getenv("DB_HOST"),
82
+
83
  "user": os.getenv("DB_USER"),
84
+
85
  "password": os.getenv("DB_PASSWORD"),
86
+
87
  "database": os.getenv("DB_NAME"),
88
+
89
  }
90
+
91
  connection_pool = mysql.connector.pooling.MySQLConnectionPool(
92
+
93
  pool_name=DB_POOL_NAME,
94
+
95
  pool_size=DB_POOL_SIZE,
96
+
97
  pool_reset_session=True,
98
+
99
  **dbconfig
100
+
101
  )
102
+
103
+ logging.info("Database connection pool created successfully.")
+ except mysql.connector.Error as err:
104
+
105
  logging.warning(f"Database connection pool creation failed: {err}")
 
106
 
107
+ connection_pool = None  # Will use CSV as fallback
+
+ # Function to get a database connection from the pool
+ def get_db_connection():
108
+
109
  """
110
+
111
  Retrieves a connection from the pool. Returns None if pool is not available.
112
+
113
  """
114
+
115
  if connection_pool:
116
+
117
  try:
118
+
119
  connection = connection_pool.get_connection()
120
+
121
  if connection.is_connected():
122
+
123
  return connection
124
+
125
  except mysql.connector.Error as err:
126
+
127
  logging.error(f"Error getting connection from pool: {err}")
 
128
 
129
+ return None
+
+ # Initialize Database: Create tables and indexes
+ def initialize_database():
130
+
131
  """
132
+
133
  Initializes the database by creating necessary tables and indexes if they do not exist.
134
+
135
  """
136
+
137
  connection = get_db_connection()
138
+
139
  if connection is None:
140
+
141
  logging.info("Database initialization skipped. Using CSV storage.")
142
+
143
  return
144
 
145
+
146
+
147
  cursor = connection.cursor()
148
+
149
  try:
150
+
151
  # Create table for scraped data
152
+
153
  create_scraped_data_table = """
154
+
155
  CREATE TABLE IF NOT EXISTS scraped_data (
156
+
157
  id INT AUTO_INCREMENT PRIMARY KEY,
158
+
159
  url VARCHAR(255) NOT NULL,
160
+
161
  content_hash VARCHAR(64) NOT NULL,
162
+
163
  change_detected DATETIME NOT NULL
164
+
165
  )
166
+
167
  """
168
+
169
  cursor.execute(create_scraped_data_table)
170
+
171
  logging.info("Table 'scraped_data' is ready.")
172
 
173
+
174
+
175
  # Create indexes for performance
176
+
177
  create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
178
+
179
  create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
180
+
181
  cursor.execute(create_index_url)
182
+
183
  cursor.execute(create_index_change)
184
+
185
  logging.info("Indexes on 'url' and 'change_detected' columns created.")
186
 
187
+
188
+
189
  # Create table for action logs
190
+
191
  create_action_logs_table = """
192
+
193
  CREATE TABLE IF NOT EXISTS action_logs (
194
+
195
  id INT AUTO_INCREMENT PRIMARY KEY,
196
+
197
  action VARCHAR(255) NOT NULL,
198
+
199
  timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
200
+
201
  )
202
+
203
  """
204
+
205
  cursor.execute(create_action_logs_table)
206
+
207
  logging.info("Table 'action_logs' is ready.")
208
 
209
+
210
+
211
  except mysql.connector.Error as err:
212
+
213
  logging.error(f"Error initializing database: {err}")
214
+
215
  finally:
216
+
217
  cursor.close()
218
+
219
  connection.close()
 
220
 
221
+ logging.info("Database initialization complete.")
+
+ # Function to create WebDriver
+ def create_driver(options: Options) -> webdriver.Chrome:
222
+
223
  """
224
+
225
  Initializes and returns a Selenium Chrome WebDriver instance.
226
+
227
  """
228
+
229
  try:
230
+
231
  driver = webdriver.Chrome(
232
+
233
  service=Service(ChromeDriverManager().install()), options=options
234
+
235
  )
236
+
237
  logging.info("ChromeDriver initialized successfully.")
238
+
239
  return driver
240
+
241
  except Exception as exception:
242
+
243
  logging.error(f"Error initializing ChromeDriver: {exception}")
 
244
 
245
+ return None
+
+ # Function to log changes to CSV
+ def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
246
+
247
  """
248
+
249
  Logs the change to a CSV file in the storage_location.
250
+
251
  """
252
+
253
  try:
254
+
255
  os.makedirs(storage_location, exist_ok=True)
256
+
257
  csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
258
+
259
  file_exists = os.path.isfile(csv_file_path)
260
 
261
+
262
+
263
  with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
264
+
265
  fieldnames = ["date", "time", "url", "content_hash", "change"]
266
+
267
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
268
+
269
  if not file_exists:
270
+
271
  writer.writeheader()
272
+
273
  writer.writerow(
274
+
275
  {
276
+
277
  "date": change_detected.split()[0],
278
+
279
  "time": change_detected.split()[1],
280
+
281
  "url": url,
282
+
283
  "content_hash": content_hash,
284
+
285
  "change": "Content changed",
286
+
287
  }
288
+
289
  )
290
+
291
  logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
292
+
293
  except Exception as e:
 
294
 
295
+ logging.error(f"Error logging data to CSV: {e}")
+
+ # Function to get initial observation
+ def get_initial_observation(
296
+
297
+ driver: webdriver.Chrome, url: str, content_type: str, selector: str = None) -> str:
298
+
299
  """
300
+
301
  Retrieves the initial content from the URL and returns its MD5 hash.
302
+
303
  """
304
+
305
  try:
306
+
307
  driver.get(url)
308
+
309
  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
310
+
311
  time.sleep(2) # Additional wait for dynamic content
312
 
313
+
314
+
315
  if content_type == "text":
316
+
317
  initial_content = driver.page_source
318
+
319
  elif content_type == "media":
320
+
321
  if selector:
322
+
323
  try:
324
+
325
  elements = WebDriverWait(driver, 5).until(
326
+
327
  EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
328
+
329
  )
330
+
331
  initial_content = [element.get_attribute("src") for element in elements]
332
+
333
+ except TimeoutException:
+ logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
334
+
335
  initial_content = []
336
+
337
  else:
338
+
339
  elements = driver.find_elements(By.TAG_NAME, "img")
340
+
341
  initial_content = [element.get_attribute("src") for element in elements]
342
+
343
  else:
344
+
345
  initial_content = driver.page_source
346
 
347
+
348
+
349
  initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
350
+
351
  logging.info(f"Initial hash for {url}: {initial_hash}")
352
+
353
  return initial_hash
354
+
355
  except Exception as exception:
356
+
357
  logging.error(f"Error accessing {url}: {exception}")
 
358
 
359
+ return None
+
+ # Function to monitor URLs for changes
+ def monitor_urls(
360
+
361
  storage_location: str,
362
+
363
  urls: list,
364
+
365
  scrape_interval: int,
366
+
367
  content_type: str,
368
+
369
  selector: str = None,
370
+
371
+ progress: gr.Progress = None):
372
+
373
  """
374
+
375
  Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
376
+
377
  """
378
+
379
  global HISTORY, STOP_THREADS
380
+
381
  previous_hashes = {url: "" for url in urls}
382
 
383
+
384
+
385
  options = Options()
386
+
387
  options.add_argument("--headless")
388
+
389
  options.add_argument("--no-sandbox")
390
+
391
+ options.add_argument("--disable-dev-shm-usage")  
392
+
393
+
394
+
395
+
396
+
397
+
398
 
399
  driver = create_driver(options)
400
+
401
  if driver is None:
402
+
403
  logging.error("WebDriver could not be initialized. Exiting monitor.")
404
+
405
  return
406
 
407
+
408
+
409
  try:
410
+
411
  while not STOP_THREADS:
412
+
413
  for url in urls:
414
+
415
  if STOP_THREADS:
416
+
417
  break
418
+
419
  try:
420
+
421
  driver.get(url)
422
+
423
  WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
424
+
425
  time.sleep(2) # Additional wait for dynamic content
426
 
427
+
428
+
429
  if content_type == "text":
430
+
431
  current_content = driver.page_source
432
+
433
  elif content_type == "media":
434
+
435
  if selector:
436
+
437
  try:
438
+
439
  elements = WebDriverWait(driver, 5).until(
440
+
441
  EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
442
+
443
  )
444
+
445
  current_content = [element.get_attribute("src") for element in elements]
446
+
447
  except TimeoutException:
448
+
449
  logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
450
+
451
  current_content = []
452
+
453
  else:
454
+
455
  elements = driver.find_elements(By.TAG_NAME, "img")
456
+
457
  current_content = [element.get_attribute("src") for element in elements]
458
+
459
  else:
460
+
461
  current_content = driver.page_source
462
 
463
+
464
+
465
  current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
466
+
467
  if current_hash != previous_hashes[url]:
468
+
469
  previous_hashes[url] = current_hash
470
+
471
  date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
472
+
473
  HISTORY.append(f"Change detected at {url} on {date_time_str}")
474
 
475
+
476
+
477
  # Attempt to log to database
478
+
479
  connection = get_db_connection()
480
+
481
  if connection:
482
+
483
  try:
484
+
485
  cursor = connection.cursor()
486
+
487
  insert_query = """
488
+
489
  INSERT INTO scraped_data (url, content_hash, change_detected)
490
+
491
  VALUES (%s, %s, %s)
492
+
493
  """
494
+
495
  cursor.execute(insert_query, (url, current_hash, date_time_str))
496
+
497
  connection.commit()
498
+
499
  logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
500
+
501
  except mysql.connector.Error as err:
502
+
503
  logging.error(f"Error inserting data into database: {err}")
504
+
505
  # Fallback to CSV
506
+
507
  log_to_csv(storage_location, url, current_hash, date_time_str)
508
+
509
  finally:
510
+
511
  cursor.close()
512
+
513
  connection.close()
514
+
515
  else:
516
+
517
  # Fallback to CSV
518
+
519
  log_to_csv(storage_location, url, current_hash, date_time_str)
520
 
521
+
522
+
523
  # Update progress
524
+
525
  if progress:
526
+
527
  progress(1)
528
+
529
  except (
530
+
531
  NoSuchElementException,
532
+
533
  StaleElementReferenceException,
534
+
535
  TimeoutException,
536
+
537
  Exception,
538
+
539
  ) as e:
540
+
541
  logging.error(f"Error accessing {url}: {e}")
542
+
543
  if progress:
544
+
545
  progress(1)
546
+
547
  time.sleep(scrape_interval * 60) # Wait for the next scrape interval
548
+
549
  finally:
550
+
551
  driver.quit()
 
552
 
553
+ logging.info("ChromeDriver session ended.")
+
+ # Function to start scraping
+ def start_scraping(
554
+
555
  storage_location: str,
556
+
557
  urls: str,
558
+
559
  scrape_interval: int,
560
+
561
  content_type: str,
562
+
563
  selector: str = None,
564
+
565
+ progress: gr.Progress = None) -> str:
566
+
567
  """
568
+
569
  Starts the scraping process in a separate thread with progress indication.
570
+
571
  """
572
+
573
  global CURRENT_TASK, HISTORY, STOP_THREADS
574
 
575
+
576
+
577
  if STOP_THREADS:
578
+
579
  STOP_THREADS = False # Reset the flag if previously stopped
580
 
581
+
582
+
583
  url_list = [url.strip() for url in urls.split(",") if url.strip()]
584
+
585
  CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
586
+
587
  HISTORY.append(f"Task started: {CURRENT_TASK}")
588
+
589
  logging.info(f"Task started: {CURRENT_TASK}")
590
 
591
+
592
+
593
  # Initialize database tables
594
+
595
  initialize_database()
596
 
597
+
598
+
599
  # Log initial observations
600
+
601
  def log_initial_observations():
602
+
603
  options = Options()
604
+
605
  options.add_argument("--headless")
606
+
607
  options.add_argument("--no-sandbox")
608
+
609
+ options.add_argument("--disable-dev-shm-usage")  
610
+
611
+
612
+
613
+
614
+
615
+
616
 
617
  driver = create_driver(options)
618
+
619
  if driver is None:
620
+
621
  return
622
 
623
+
624
+
625
  for url in url_list:
626
+
627
  if STOP_THREADS:
628
+
629
  break
630
+
631
  try:
632
+
633
  initial_hash = get_initial_observation(driver, url, content_type, selector)
634
+
635
  if initial_hash:
636
+
637
  date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
638
+
639
  HISTORY.append(f"Initial observation at {url}: {initial_hash}")
640
 
641
+
642
+
643
  # Attempt to log to database
644
+
645
  connection = get_db_connection()
646
+
647
  if connection:
648
+
649
  try:
650
+
651
  cursor = connection.cursor()
652
+
653
  insert_query = """
654
+
655
  INSERT INTO scraped_data (url, content_hash, change_detected)
656
+
657
  VALUES (%s, %s, %s)
658
+
659
  """
660
+
661
  cursor.execute(insert_query, (url, initial_hash, date_time_str))
662
+
663
  connection.commit()
664
+
665
  logging.info(f"Initial observation logged for {url} in database.")
666
+
667
  except mysql.connector.Error as err:
668
+
669
  logging.error(f"Error inserting initial observation into database: {err}")
670
+
671
  # Fallback to CSV
672
+
673
  log_to_csv(storage_location, url, initial_hash, date_time_str)
674
+
675
  finally:
676
+
677
  cursor.close()
678
+
679
  connection.close()
680
+
681
  else:
682
+
683
  # Fallback to CSV
684
+
685
  log_to_csv(storage_location, url, initial_hash, date_time_str)
686
+
687
  except Exception as e:
688
+
689
  HISTORY.append(f"Error accessing {url}: {e}")
690
+
691
  logging.error(f"Error accessing {url}: {e}")
692
+
693
  driver.quit()
694
 
695
+
696
+
697
  # Start logging initial observations
698
+
699
  initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
700
+
701
  initial_thread.start()
702
 
703
+
704
+
705
  # Start the monitoring thread with progress
706
+
707
  monitor_thread = threading.Thread(
708
+
709
  target=monitor_urls,
710
+
711
  args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
712
+
713
  daemon=True,
714
+
715
  )
716
+
717
  monitor_thread.start()
718
+
719
  logging.info("Started scraping thread.")
720
+
721
  return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."
722
 
723
  # Function to stop scraping
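Below is a minimal usage sketch, not part of the commit, showing how the start_scraping function added above might be called. The URLs, interval, and storage path are illustrative assumptions, and it presumes the functions in app.py are importable in the current process.

# Hypothetical example: monitor two sites every 5 minutes, storing change logs as CSV
# under the module's default "scraped_data" folder (database logging is used if a pool exists).
status = start_scraping(
    storage_location="scraped_data",
    urls="https://example.com, https://example.org",  # comma-separated string, as the function expects
    scrape_interval=5,                                 # minutes between scrape passes
    content_type="text",                               # or "media", optionally with a CSS selector
)
print(status)  # e.g. "Started scraping https://example.com, https://example.org every 5 minutes."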