acecalisto3 committed on
Commit df717a9
1 Parent(s): e9a41cf

Update app.py

Files changed (1)
  1. app.py +778 -212
app.py CHANGED
@@ -1,276 +1,842 @@
1
- import mysql.connector
2
- from mysql.connector import errorcode
3
  import os
4
- import logging
5
  import time
6
  import hashlib
 
7
  import datetime
8
- import gradio as gr
9
  import csv
 
 
10
  from urllib.parse import urlparse
 
 
11
  from selenium import webdriver
12
  from selenium.webdriver.chrome.service import Service
13
  from selenium.webdriver.chrome.options import Options
14
  from selenium.webdriver.common.by import By
15
  from selenium.webdriver.support.ui import WebDriverWait
16
  from selenium.webdriver.support import expected_conditions as EC
17
- from selenium.webdriver.common.keys import Keys
18
- from selenium.common.exceptions import NoSuchElementException
19
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
20
- from transformers import pipeline
21
- import feedparser
22
- from bs4 import BeautifulSoup
23
- import threading
24
 
25
  # Configure logging
26
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
27
 
28
  # Define constants
29
  DEFAULT_FILE_PATH = "scraped_data"
30
- PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since your last observation. Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
 
 
 
 
 
31
  HISTORY = []
32
  CURRENT_TASK = None
33
- STOP_THREADS = False
34
-
35
- # Define database configuration
36
- db_config = {
37
- 'user': os.getenv('DB_USER'),
38
- 'password': os.getenv('DB_PASSWORD'),
39
- 'host': os.getenv('DB_HOST'),
40
- 'raise_on_warnings': True
41
- }
42
-
43
- # Define a function to initialize the database
44
- def initialize_database(config):
45
  try:
46
- cnx = mysql.connector.connect(**config)
47
- cursor = cnx.cursor()
48
-
49
- # Create database if it doesn't exist
50
- cursor.execute("CREATE DATABASE IF NOT EXISTS scraper_db")
51
- cnx.database = 'scraper_db'
52
-
53
- # Create tables
54
- TABLES = {}
55
- TABLES['scraped_data'] = (
56
- "CREATE TABLE IF NOT EXISTS scraped_data ("
57
- " id INT AUTO_INCREMENT PRIMARY KEY,"
58
- " url VARCHAR(255) NOT NULL,"
59
- " content_hash VARCHAR(64) NOT NULL,"
60
- " change_detected DATETIME NOT NULL"
61
- ") ENGINE=InnoDB"
62
  )
63
-
64
- for table_name in TABLES:
65
- table_description = TABLES[table_name]
66
- try:
67
- cursor.execute(table_description)
68
- logging.info(f"Table `{table_name}` created successfully.")
69
- except mysql.connector.Error as err:
70
- if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
71
- logging.warning(f"Table `{table_name}` already exists.")
72
- else:
73
- logging.error(err.msg)
74
-
75
  cursor.close()
76
- cnx.close()
77
  logging.info("Database initialization complete.")
78
- except mysql.connector.Error as err:
79
- logging.error(f"Database initialization failed: {err}")
80
 
81
- # Define a function to start scraping
82
- def start_scraping(storage_location, urls, scrape_interval, content_type, db_config):
83
  global CURRENT_TASK, HISTORY, STOP_THREADS
84
-
85
- CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
86
  HISTORY.append(f"Task started: {CURRENT_TASK}")
87
-
88
- for url in urls:
89
- # Create a folder for the URL
90
  hostname = urlparse(url).hostname
91
  folder_path = os.path.join(storage_location, hostname)
92
  os.makedirs(folder_path, exist_ok=True)
93
-
94
  # Log the initial observation
95
  try:
96
- with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
97
- driver.get(url)
98
- time.sleep(2) # Wait for the page to load
99
- if content_type == "text":
100
- initial_content = driver.page_source
101
- elif content_type == "media":
102
- initial_content = driver.find_elements(By.TAG_NAME, "img")
103
- else:
104
- initial_content = driver.page_source
105
- initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
 
106
  HISTORY.append(f"Initial observation at {url}: {initial_hash}")
107
- with open(os.path.join(folder_path, f"{hostname}_initial_observation.txt"), "w") as file:
108
- file.write(f"Initial observation at {url}: {initial_hash}")
109
- except (NoSuchElementException, Exception) as e:
110
  HISTORY.append(f"Error accessing {url}: {e}")
111
-
112
- # Start a new thread for monitoring URLs
113
- threading.Thread(target=monitor_urls, args=(storage_location, [url], scrape_interval, content_type, [STOP_THREADS], db_config)).start()
114
-
115
- return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
116
-
117
- # Define a function to monitor URLs for changes
118
- def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_scraping_flag, db_config):
119
- global HISTORY
120
- previous_hashes = {url: "" for url in urls}
121
-
122
  try:
123
- cnx = mysql.connector.connect(**db_config)
124
- cursor = cnx.cursor()
125
-
126
- with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager ().install()), options=Options()) as driver:
127
- while not stop_scraping_flag[0]:
128
- for url in urls:
129
- try:
130
- driver.get(url)
131
- time.sleep(2) # Wait for the page to load
132
- if content_type == "text":
133
- current_content = driver.page_source
134
- elif content_type == "media":
135
- current_content = driver.find_elements(By.TAG_NAME, "img")
136
- else:
137
- current_content = driver.page_source
138
- current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
139
-
140
- if current_hash != previous_hashes[url]:
141
- previous_hashes[url] = current_hash
142
- date_time = datetime.datetime.now()
143
- HISTORY.append(f"Change detected at {url} on {date_time}")
144
-
145
- # Insert into MySQL
146
- add_change = ("INSERT INTO scraped_data "
147
- "(url, content_hash, change_detected) "
148
- "VALUES (%s, %s, %s)")
149
- data_change = (url, current_hash, date_time)
150
- cursor.execute(add_change, data_change)
151
- cnx.commit()
152
-
153
- logging.info(f"Change detected and logged for {url} at {date_time}")
154
- except (NoSuchElementException, Exception) as e:
155
- logging.error(f"Error accessing {url}: {e}")
156
- time.sleep(scrape_interval * 60) # Check every scrape_interval minutes
157
  except Exception as e:
158
- logging.error(f"Error in monitor_urls: {e}")
159
  finally:
160
  cursor.close()
161
- cnx.close()
162
 
163
- # Define a function to stop scraping
164
- def stop_scraping():
165
- global STOP_THREADS
166
- STOP_THREADS = True
167
- return "Scraping stopped."
168
 
169
- # Define a function to generate RSS feed
170
- def generate_rss_feed(selected_url, db_config):
171
  try:
172
- cnx = mysql.connector.connect(**db_config)
173
- cursor = cnx.cursor(dictionary=True)
174
-
175
- query = ("SELECT content_hash, change_detected FROM scraped_data "
176
- "WHERE url = %s ORDER BY change_detected DESC LIMIT 10")
177
- cursor.execute(query, (selected_url,))
178
-
179
- items = cursor.fetchall()
180
-
181
- rss_items = ""
182
- for item in items:
183
- rss_items += f"""
184
- <item>
185
- <title>Change Detected</title>
186
- <link>{selected_url}</link>
187
- <description>Change detected on {item['change_detected'].strftime('%Y-%m-%d %H:%M:%S')}</description>
188
- <pubDate>{item['change_detected'].strftime('%a, %d %b %Y %H:%M:%S +0000')}</pubDate>
189
- </item>
190
- """
191
-
192
- rss_feed = f"""<?xml version="1.0" encoding="UTF-8"?>
193
- <rss version="2.0">
194
- <channel>
195
- <title>RSS Feed for {selected_url}</title>
196
- <link>{selected_url}</link>
197
- <description>Latest changes detected on {selected_url}.</description>
198
- {rss_items}
199
- </channel>
200
- </rss>"""
201
-
202
  cursor.close()
203
- cnx.close()
204
- return rss_feed
205
- except mysql.connector.Error as err:
206
- logging.error(f"Error generating RSS feed: {err}")
207
- return "Failed to generate RSS feed."
208
 
209
- # Define a function to handle messages
210
- def handle_message(message, chat_history, system_message, max_tokens, temperature, top_p):
211
- chat_history.append((message, system_message))
212
- response = f"Received message: {message}"
213
- return chat_history, response
214
 
215
  # Define the Gradio interface
216
- def create_interface():
217
  with gr.Blocks() as demo:
 
 
218
  with gr.Row():
219
  with gr.Column():
220
- message = gr.Textbox(label="Message")
221
- system_message = gr.Textbox(value="You are a helpful assistant.", label="System message")
222
- max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
223
- temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
224
- top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
225
- storage_location = gr.Textbox(value="scraped_data", label="Storage Location")
226
- urls = gr.Textbox(label="URLs (comma separated)")
227
- scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
228
- content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
229
  start_button = gr.Button("Start Scraping")
230
  stop_button = gr.Button("Stop Scraping")
231
- csv_output = gr.Textbox(label="CSV Output", interactive=False)
232
- model_name_input = gr.Textbox(value="default_model", label="Model Name")
233
- gpu_layers_input = gr.Slider(minimum=0, maximum=8, value=2, step=1, label="GPU Layers")
 
234
  with gr.Column():
 
235
  chat_history = gr.Chatbot(label="Chat History")
236
- response_box = gr.Textbox(label="Response")
237
 
238
  # Connect buttons to their respective functions
239
  start_button.click(
240
- fn=lambda storage, urls, interval, ctype: start_scraping(
241
- storage, urls.split(", "), interval, ctype, db_config
242
- ),
243
- inputs=[storage_location, urls, scrape_interval, content_type],
244
- outputs=csv_output
245
  )
246
- stop_button.click(stop_scraping, outputs=csv_output)
247
 
248
- # Connect message submission to the chat interface
249
- message.submit(handle_message, inputs=[message, chat_history, system_message, max_tokens, temperature, top_p], outputs=[chat_history, response_box])
250
 
251
- # Add a button to display the CSV content for a selected URL
252
- with gr.Row():
253
- selected_url = gr.Textbox(label="Select URL for CSV Content")
254
- csv_button = gr.Button("Display CSV Content")
255
- csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
256
- csv_button.click(display_csv, inputs=[selected_url], outputs=csv_output)
257
 
258
- # Add a button to display the RSS feed for a selected URL
259
- with gr.Row():
260
- selected_url = gr.Textbox(label="Select URL for RSS Feed")
261
- rss_button = gr.Button("Generate RSS Feed")
262
- rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
263
  rss_button.click(
264
- generate_rss_feed,
265
- inputs=[selected_url, gr.State(db_config)],
266
- outputs=rss_output
267
  )
268
 
269
  return demo
270
 
271
- # Initialize the database
272
- initialize_database(db_config)
273
 
274
- # Launch the Gradio interface
275
- demo = create_interface()
276
- demo.launch()
 
 
 
1
  import os
 
2
  import time
3
  import hashlib
4
+ import logging
5
  import datetime
 
6
  import csv
7
+ import threading
8
+ import re
9
  from urllib.parse import urlparse
10
+
11
+ import pandas as pd
12
  from selenium import webdriver
13
  from selenium.webdriver.chrome.service import Service
14
  from selenium.webdriver.chrome.options import Options
15
  from selenium.webdriver.common.by import By
16
  from selenium.webdriver.support.ui import WebDriverWait
17
  from selenium.webdriver.support import expected_conditions as EC
18
+ from selenium.common.exceptions import (
19
+ TimeoutException,
20
+ NoSuchElementException,
21
+ StaleElementReferenceException,
22
+ )
23
+ from webdriver_manager.chrome import ChromeDriverManager
24
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
25
+ import gradio as gr
26
+ import xml.etree.ElementTree as ET
27
+ import torch
28
+ import mysql.connector
29
+ from mysql.connector import errorcode
30
+ from dotenv import load_dotenv
31
+
32
+ # Load environment variables from .env file
33
+ load_dotenv()
34
 
35
  # Configure logging
36
+ logging.basicConfig(
37
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
38
+ )
39
 
40
  # Define constants
41
  DEFAULT_FILE_PATH = "scraped_data"
42
+ PURPOSE = (
43
+ "You go to Culver's sites and continuously check them for changes since your last observation. "
44
+ "Anything new is logged and dumped into a CSV stored in your log folder at user/app/scraped_data."
45
+ )
46
+
47
+ # Global variables for task management
48
  HISTORY = []
49
  CURRENT_TASK = None
50
+ STOP_THREADS = False # Flag to stop scraping threads
51
+
52
+ # MySQL Database Connection
53
+ def get_db_connection():
54
+ """
55
+ Establishes and returns a MySQL database connection using environment variables.
56
+ """
57
  try:
58
+ connection = mysql.connector.connect(
59
+ host=os.getenv("DB_HOST"),
60
+ user=os.getenv("DB_USER"),
61
+ password=os.getenv("DB_PASSWORD"),
62
+ database=os.getenv("DB_NAME")
63
  )
64
+ if connection.is_connected():
65
+ logging.info("Connected to MySQL database.")
66
+ return connection
67
+ except mysql.connector.Error as err:
68
+ if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
69
+ logging.error("Invalid database credentials.")
70
+ elif err.errno == errorcode.ER_BAD_DB_ERROR:
71
+ logging.error("Database does not exist.")
72
+ else:
73
+ logging.error(err)
74
+ return None
75
+
76
+ # Initialize Database
77
+ def initialize_database():
78
+ """
79
+ Initializes the database by creating necessary tables if they do not exist.
80
+ """
81
+ connection = get_db_connection()
82
+ if connection is None:
83
+ logging.error("Failed to connect to the database. Initialization aborted.")
84
+ return
85
+
86
+ cursor = connection.cursor()
87
+ try:
88
+ # Create table for scraped data
89
+ create_scraped_data_table = """
90
+ CREATE TABLE IF NOT EXISTS scraped_data (
91
+ id INT AUTO_INCREMENT PRIMARY KEY,
92
+ url VARCHAR(255) NOT NULL,
93
+ content_hash VARCHAR(64) NOT NULL,
94
+ change_detected DATETIME NOT NULL
95
+ )
96
+ """
97
+ cursor.execute(create_scraped_data_table)
98
+ logging.info("Table 'scraped_data' is ready.")
99
+
100
+ # Create table for action logs
101
+ create_action_logs_table = """
102
+ CREATE TABLE IF NOT EXISTS action_logs (
103
+ id INT AUTO_INCREMENT PRIMARY KEY,
104
+ action VARCHAR(255) NOT NULL,
105
+ timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
106
+ )
107
+ """
108
+ cursor.execute(create_action_logs_table)
109
+ logging.info("Table 'action_logs' is ready.")
110
+
111
+ except mysql.connector.Error as err:
112
+ logging.error(f"Error creating tables: {err}")
113
+ finally:
114
  cursor.close()
115
+ connection.close()
116
  logging.info("Database initialization complete.")
 
 
117
 
118
+ # Function to monitor URLs for changes
119
+ def monitor_urls(
120
+ storage_location: str,
121
+ urls: list,
122
+ scrape_interval: int,
123
+ content_type: str,
124
+ selector: str = None,
125
+ ):
126
+ """
127
+ Monitors the specified URLs for changes and logs any detected changes to the database.
128
+ """
129
+ global HISTORY, STOP_THREADS
130
+ previous_hashes = {url: "" for url in urls}
131
+
132
+ options = Options()
133
+ options.add_argument("--headless")
134
+ options.add_argument("--no-sandbox")
135
+ options.add_argument("--disable-dev-shm-usage")
136
+
137
+ driver = create_driver(options)
138
+ if driver is None:
139
+ logging.error("WebDriver could not be initialized. Exiting monitor.")
140
+ return
141
+
142
+ try:
143
+ while not STOP_THREADS:
144
+ for url in urls:
145
+ try:
146
+ driver.get(url)
147
+ WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
148
+ time.sleep(2) # Additional wait for dynamic content
149
+
150
+ if content_type == "text":
151
+ current_content = driver.page_source
152
+ elif content_type == "media":
153
+ if selector:
154
+ try:
155
+ elements = WebDriverWait(driver, 5).until(
156
+ EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
157
+ )
158
+ current_content = [element.get_attribute("src") for element in elements]
159
+ except TimeoutException:
160
+ logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
161
+ current_content = []
162
+ else:
163
+ elements = driver.find_elements(By.TAG_NAME, "img")
164
+ current_content = [element.get_attribute("src") for element in elements]
165
+ else:
166
+ current_content = driver.page_source
167
+
168
+ current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
169
+ if current_hash != previous_hashes[url]:
170
+ previous_hashes[url] = current_hash
171
+ date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
172
+ HISTORY.append(f"Change detected at {url} on {date_time_str}")
173
+
174
+ # Insert change into MySQL database
175
+ connection = get_db_connection()
176
+ if connection:
177
+ cursor = connection.cursor()
178
+ insert_query = """
179
+ INSERT INTO scraped_data (url, content_hash, change_detected)
180
+ VALUES (%s, %s, %s)
181
+ """
182
+ cursor.execute(insert_query, (url, current_hash, date_time_str))
183
+ connection.commit()
184
+ cursor.close()
185
+ connection.close()
186
+ logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
187
+ else:
188
+ logging.error("Failed to connect to database. Change not logged.")
189
+
190
+ except (
191
+ NoSuchElementException,
192
+ StaleElementReferenceException,
193
+ TimeoutException,
194
+ Exception,
195
+ ) as e:
196
+ logging.error(f"Error accessing {url}: {e}")
197
+ time.sleep(scrape_interval * 60) # Wait for the next scrape interval
198
+ finally:
199
+ driver.quit()
200
+ logging.info("ChromeDriver session ended.")
201
+
202
+ # Function to create WebDriver
203
+ def create_driver(options: Options) -> webdriver.Chrome:
204
+ """
205
+ Initializes and returns a Selenium Chrome WebDriver instance.
206
+ """
207
+ try:
208
+ driver = webdriver.Chrome(
209
+ service=Service(ChromeDriverManager().install()), options=options
210
+ )
211
+ logging.info("ChromeDriver initialized successfully.")
212
+ return driver
213
+ except Exception as exception:
214
+ logging.error(f"Error initializing ChromeDriver: {exception}")
215
+ return None
216
+
217
+ # Function to get initial observation
218
+ def get_initial_observation(
219
+ driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
220
+ ) -> str:
221
+ """
222
+ Retrieves the initial content from the URL and returns its MD5 hash.
223
+ """
224
+ try:
225
+ driver.get(url)
226
+ WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
227
+ time.sleep(2) # Additional wait for dynamic content
228
+
229
+ if content_type == "text":
230
+ initial_content = driver.page_source
231
+ elif content_type == "media":
232
+ if selector:
233
+ try:
234
+ elements = WebDriverWait(driver, 5).until(
235
+ EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
236
+ )
237
+ initial_content = [element.get_attribute("src") for element in elements]
238
+ except TimeoutException:
239
+ logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
240
+ initial_content = []
241
+ else:
242
+ elements = driver.find_elements(By.TAG_NAME, "img")
243
+ initial_content = [element.get_attribute("src") for element in elements]
244
+ else:
245
+ initial_content = driver.page_source
246
+
247
+ initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
248
+ logging.info(f"Initial hash for {url}: {initial_hash}")
249
+ return initial_hash
250
+ except Exception as exception:
251
+ logging.error(f"Error accessing {url}: {exception}")
252
+ return None
253
+
254
+ # Function to start scraping
255
+ def start_scraping(
256
+ storage_location: str,
257
+ urls: str,
258
+ scrape_interval: int,
259
+ content_type: str,
260
+ selector: str = None,
261
+ ) -> str:
262
+ """
263
+ Starts the scraping process in a separate thread.
264
+ """
265
  global CURRENT_TASK, HISTORY, STOP_THREADS
266
+
267
+ if STOP_THREADS:
268
+ STOP_THREADS = False # Reset the flag if previously stopped
269
+
270
+ url_list = [url.strip() for url in urls.split(",") if url.strip()]
271
+ CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
272
  HISTORY.append(f"Task started: {CURRENT_TASK}")
273
+ logging.info(f"Task started: {CURRENT_TASK}")
274
+
275
+ # Initialize database tables
276
+ initialize_database()
277
+
278
+ for url in url_list:
279
+ # Create a folder for the URL (if still needed for CSVs)
280
  hostname = urlparse(url).hostname
281
  folder_path = os.path.join(storage_location, hostname)
282
  os.makedirs(folder_path, exist_ok=True)
283
+
284
  # Log the initial observation
285
  try:
286
+ options = Options()
287
+ options.add_argument("--headless")
288
+ options.add_argument("--no-sandbox")
289
+ options.add_argument("--disable-dev-shm-usage")
290
+
291
+ driver = create_driver(options)
292
+ if driver is None:
293
+ continue
294
+
295
+ initial_hash = get_initial_observation(driver, url, content_type, selector)
296
+ if initial_hash:
297
  HISTORY.append(f"Initial observation at {url}: {initial_hash}")
298
+
299
+ # Insert initial observation into MySQL database
300
+ connection = get_db_connection()
301
+ if connection:
302
+ cursor = connection.cursor()
303
+ insert_query = """
304
+ INSERT INTO scraped_data (url, content_hash, change_detected)
305
+ VALUES (%s, %s, %s)
306
+ """
307
+ cursor.execute(insert_query, (url, initial_hash, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
308
+ connection.commit()
309
+ cursor.close()
310
+ connection.close()
311
+ logging.info(f"Initial observation logged for {url}")
312
+ else:
313
+ logging.error("Failed to connect to database. Initial observation not logged.")
314
+
315
+ except Exception as e:
316
  HISTORY.append(f"Error accessing {url}: {e}")
317
+ logging.error(f"Error accessing {url}: {e}")
318
+ finally:
319
+ if driver is not None:  # create_driver may have returned None
+ driver.quit()
320
+
321
+ # Start the monitoring thread
322
+ monitor_thread = threading.Thread(
323
+ target=monitor_urls,
324
+ args=(storage_location, url_list, scrape_interval, content_type, selector),
325
+ daemon=True,
326
+ )
327
+ monitor_thread.start()
328
+ logging.info("Started scraping thread.")
329
+ return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."
330
+
331
+ # Function to stop scraping
332
+ def stop_scraping() -> str:
333
+ """
334
+ Stops all ongoing scraping threads.
335
+ """
336
+ global STOP_THREADS
337
+ STOP_THREADS = True
338
+ HISTORY.append("Scraping stopped by user.")
339
+ logging.info("Scraping stop signal sent.")
340
+ return "Scraping has been stopped."
341
+
342
+ # Function to display CSV content from MySQL
343
+ def display_csv(storage_location: str, url: str) -> str:
344
+ """
345
+ Fetches and returns the scraped data for a given URL from the MySQL database.
346
+ """
347
  try:
348
+ connection = get_db_connection()
349
+ if not connection:
350
+ return "Failed to connect to the database."
351
+
352
+ cursor = connection.cursor(dictionary=True)
353
+ query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
354
+ cursor.execute(query, (url,))
355
+ results = cursor.fetchall()
356
+
357
+ if not results:
358
+ return "No data available for the selected URL."
359
+
360
+ df = pd.DataFrame(results)
361
+ cursor.close()
362
+ connection.close()
363
+ return df.to_string(index=False)
364
  except Exception as e:
365
+ logging.error(f"Error fetching data for {url}: {e}")
366
+ return f"Error fetching data for {url}: {e}"
367
+
368
+ # Function to generate RSS feed from MySQL data
369
+ def generate_rss_feed(storage_location: str, url: str) -> str:
370
+ """
371
+ Generates an RSS feed for the latest changes detected on a given URL from the MySQL database.
372
+ """
373
+ try:
374
+ connection = get_db_connection()
375
+ if not connection:
376
+ return "Failed to connect to the database."
377
+
378
+ cursor = connection.cursor(dictionary=True)
379
+ query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
380
+ cursor.execute(query, (url,))
381
+ results = cursor.fetchall()
382
+
383
+ if not results:
384
+ return "No changes detected to include in RSS feed."
385
+
386
+ # Create the root RSS element
387
+ rss = ET.Element("rss", version="2.0")
388
+ channel = ET.SubElement(rss, "channel")
389
+
390
+ # Add channel elements
391
+ title = ET.SubElement(channel, "title")
392
+ title.text = f"RSS Feed for {urlparse(url).hostname}"
393
+
394
+ link = ET.SubElement(channel, "link")
395
+ link.text = url
396
+
397
+ description = ET.SubElement(channel, "description")
398
+ description.text = "Recent changes detected on the website."
399
+
400
+ # Add items to the feed
401
+ for row in results:
402
+ item = ET.SubElement(channel, "item")
403
+
404
+ item_title = ET.SubElement(item, "title")
405
+ item_title.text = f"Change detected at {row['url']}"
406
+
407
+ item_link = ET.SubElement(item, "link")
408
+ item_link.text = row["url"]
409
+
410
+ item_description = ET.SubElement(item, "description")
411
+ item_description.text = f"Content changed on {row['change_detected']}"
412
+
413
+ pub_date = ET.SubElement(item, "pubDate")
414
+ pub_date.text = datetime.datetime.strptime(
415
+ str(row['change_detected']), "%Y-%m-%d %H:%M:%S"
416
+ ).strftime("%a, %d %b %Y %H:%M:%S +0000")
417
+
418
+ # Generate the XML string
419
+ rss_feed = ET.tostring(rss, encoding="utf-8", method="xml")
420
+ return rss_feed.decode("utf-8")
421
+ except Exception as e:
422
+ logging.error(f"Error generating RSS feed for {url}: {e}")
423
+ return f"Error generating RSS feed for {url}: {e}"
424
  finally:
425
  cursor.close()
426
+ connection.close()
427
 
428
+ # Function to load the Mistral model
429
+ def load_model():
430
+ """
431
+ Loads the Mistral model and tokenizer once and returns the pipeline.
432
+ """
433
+ model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
434
+ try:
435
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
436
+ model = AutoModelForCausalLM.from_pretrained(model_name)  # Mixtral is a decoder-only (causal) LM
437
+ pipe = pipeline(
438
+ "text-generation",
439
+ model=model,
440
+ tokenizer=tokenizer,
441
+ device=0 if torch.cuda.is_available() else -1,
442
+ )
443
+ logging.info("Mistral model loaded successfully.")
444
+ return pipe
445
+ except Exception as e:
446
+ logging.error(f"Error loading Mistral model: {e}")
447
+ return None
448
+
449
+ # Load the model once at the start
450
+ chat_pipeline = load_model()
451
+
452
+ # Function to parse user commands
453
+ def parse_command(message: str) -> tuple:
454
+ """
455
+ Parses the user message to identify if it contains a command.
456
+ Returns the command and its parameters if found, else (None, None).
457
+ """
458
+ # Define command patterns
459
+ patterns = {
460
+ "filter": r"filter\s+(?P<words>[\w\s,]+)\s+in\s+column\s+(?P<column>\w+)",
461
+ "sort": r"sort\s+(?P<column>\w+)\s+(?P<order>ascending|descending)",
462
+ "export": r"export\s+to\s+csv\s+as\s+(?P<filename>\w+\.csv)",
463
+ "log": r"log\s+action\s+(?P<action>.+)",
464
+ }
465
+
466
+ for command, pattern in patterns.items():
467
+ match = re.search(pattern, message, re.IGNORECASE)
468
+ if match:
469
+ params = match.groupdict()
470
+ return command, params
471
+
472
+ return None, None
473
+
474
+ # Function to execute parsed commands
475
+ def execute_command(command: str, params: dict) -> str:
476
+ """
477
+ Executes the corresponding function based on the command and parameters.
478
+ """
479
+ if command == "filter":
480
+ words = [word.strip() for word in params["words"].split(",")]
481
+ column = params["column"]
482
+ return filter_data(column, words)
483
+ elif command == "sort":
484
+ column = params["column"]
485
+ order = params["order"]
486
+ return sort_data(column, order)
487
+ elif command == "export":
488
+ filename = params["filename"]
489
+ return export_csv(filename)
490
+ elif command == "log":
491
+ action = params["action"]
492
+ return log_action(action)
493
+ else:
494
+ return "Unknown command."
495
+
496
+ # Data Manipulation Functions
497
+ def filter_data(column: str, words: list) -> str:
498
+ """
499
+ Filters the scraped data to include only rows where the specified column contains the given words.
500
+ Saves the filtered data to a new CSV file.
501
+ """
502
+ try:
503
+ latest_csv = get_latest_csv()
504
+ if not latest_csv:
505
+ return "No CSV files found to filter."
506
+
507
+ df = pd.read_csv(latest_csv)
508
+ # Create a regex pattern to match any of the words
509
+ pattern = '|'.join(words)
510
+ filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
511
+
512
+ if filtered_df.empty:
513
+ return f"No records found with words {words} in column '{column}'."
514
+
515
+ # Save the filtered data to a new CSV
516
+ filtered_csv = latest_csv.replace(".csv", "_filtered.csv")
517
+ filtered_df.to_csv(filtered_csv, index=False)
518
+ logging.info(f"Data filtered on column '{column}' for words {words}.")
519
+ return f"Data filtered and saved to {filtered_csv}."
520
+ except Exception as e:
521
+ logging.error(f"Error filtering data: {e}")
522
+ return f"Error filtering data: {e}"
523
+
524
+ def sort_data(column: str, order: str) -> str:
525
+ """
526
+ Sorts the scraped data based on the specified column and order.
527
+ Saves the sorted data to a new CSV file.
528
+ """
529
+ try:
530
+ latest_csv = get_latest_csv()
531
+ if not latest_csv:
532
+ return "No CSV files found to sort."
533
+
534
+ df = pd.read_csv(latest_csv)
535
+ ascending = True if order.lower() == "ascending" else False
536
+ sorted_df = df.sort_values(by=column, ascending=ascending)
537
+
538
+ # Save the sorted data to a new CSV
539
+ sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}.csv")
540
+ sorted_df.to_csv(sorted_csv, index=False)
541
+ logging.info(f"Data sorted on column '{column}' in {order} order.")
542
+ return f"Data sorted and saved to {sorted_csv}."
543
+ except Exception as e:
544
+ logging.error(f"Error sorting data: {e}")
545
+ return f"Error sorting data: {e}"
546
+
547
+ def export_csv(filename: str) -> str:
548
+ """
549
+ Exports the latest scraped data to a specified CSV filename.
550
+ """
551
+ try:
552
+ latest_csv = get_latest_csv()
553
+ if not latest_csv:
554
+ return "No CSV files found to export."
555
 
556
+ export_path = os.path.join(os.path.dirname(latest_csv), filename)
557
+ df = pd.read_csv(latest_csv)
558
+ df.to_csv(export_path, index=False)
559
+ logging.info(f"Data exported to {export_path}.")
560
+ return f"Data exported to {export_path}."
561
+ except Exception as e:
562
+ logging.error(f"Error exporting CSV: {e}")
563
+ return f"Error exporting CSV: {e}"
564
+
565
+ def log_action(action: str) -> str:
566
+ """
567
+ Logs a custom action message to the MySQL database.
568
+ """
569
  try:
570
+ connection = get_db_connection()
571
+ if not connection:
572
+ return "Failed to connect to the database."
573
+
574
+ cursor = connection.cursor()
575
+ insert_query = """
576
+ INSERT INTO action_logs (action)
577
+ VALUES (%s)
578
+ """
579
+ cursor.execute(insert_query, (action,))
580
+ connection.commit()
581
  cursor.close()
582
+ connection.close()
583
+
584
+ HISTORY.append(f"User Action Logged: {action}")
585
+ logging.info(f"Action logged: {action}")
586
+ return f"Action logged: {action}"
587
+ except Exception as e:
588
+ logging.error(f"Error logging action: {e}")
589
+ return f"Error logging action: {e}"
590
+
591
+ def get_latest_csv() -> str:
592
+ """
593
+ Retrieves the latest CSV file from the storage directory based on modification time.
594
+ """
595
+ try:
596
+ storage_dirs = [d for d in os.listdir(DEFAULT_FILE_PATH) if os.path.isdir(os.path.join(DEFAULT_FILE_PATH, d))]
597
+ if not storage_dirs:
598
+ return None
599
+
600
+ latest_csv = None
601
+ latest_time = 0
602
+ for dir_name in storage_dirs:
603
+ dir_path = os.path.join(DEFAULT_FILE_PATH, dir_name)
604
+ csv_files = [f for f in os.listdir(dir_path) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_ascending.csv") or f.endswith("_sorted_descending.csv")]
605
+ for csv_file in csv_files:
606
+ csv_path = os.path.join(dir_path, csv_file)
607
+ mod_time = os.path.getmtime(csv_path)
608
+ if mod_time > latest_time:
609
+ latest_time = mod_time
610
+ latest_csv = csv_path
611
+ return latest_csv
612
+ except Exception as e:
613
+ logging.error(f"Error retrieving latest CSV: {e}")
614
+ return None
615
+
616
+ # Chat Response Function with Dynamic Command Handling
617
+ def respond(
618
+ message: str,
619
+ history: list,
620
+ system_message: str,
621
+ max_tokens: int,
622
+ temperature: float,
623
+ top_p: float,
624
+ ) -> str:
625
+ """
626
+ Generates a response using the Mistral model based on the user's message and history.
627
+ Additionally, handles dynamic commands to interact with individual components.
628
+ """
629
+ if chat_pipeline is None:
630
+ return "Error: Chat model is not loaded."
631
 
632
+ try:
633
+ # Check if the message contains a command
634
+ command, params = parse_command(message)
635
+ if command:
636
+ # Execute the corresponding function
637
+ response = execute_command(command, params)
638
+ else:
639
+ # Generate a regular response using the model
640
+ prompt = (
641
+ f"System: {system_message}\n"
642
+ f"History: {history}\n"
643
+ f"User: {message}\n"
644
+ f"Assistant:"
645
+ )
646
+ response = chat_pipeline(
647
+ prompt,
648
+ max_length=max_tokens,
649
+ temperature=temperature,
650
+ top_p=top_p,
651
+ num_return_sequences=1,
652
+ )[0]["generated_text"]
653
+
654
+ # Extract the assistant's reply
655
+ response = response.split("Assistant:")[-1].strip()
656
+ return response
657
+ except Exception as e:
658
+ logging.error(f"Error generating response: {e}")
659
+ return "Error generating response."
660
+
684
 
685
  # Define the Gradio interface
686
+ def create_interface() -> gr.Blocks:
687
+ """
688
+ Defines and returns the Gradio interface for the application.
689
+ """
690
  with gr.Blocks() as demo:
691
+ gr.Markdown("# All-in-One Scraper, Database, and RSS Feeder")
692
+
693
  with gr.Row():
694
  with gr.Column():
695
+ # Scraping Controls
696
+ storage_location = gr.Textbox(
697
+ value=DEFAULT_FILE_PATH, label="Storage Location"
698
+ )
699
+ urls = gr.Textbox(
700
+ label="URLs (comma separated)",
701
+ placeholder="https://example.com, https://anotherexample.com",
702
+ )
703
+ scrape_interval = gr.Slider(
704
+ minimum=1,
705
+ maximum=60,
706
+ value=5,
707
+ step=1,
708
+ label="Scrape Interval (minutes)",
709
+ )
710
+ content_type = gr.Radio(
711
+ choices=["text", "media", "both"],
712
+ value="text",
713
+ label="Content Type",
714
+ )
715
+ selector = gr.Textbox(
716
+ label="CSS Selector for Media (Optional)",
717
+ placeholder="e.g., img.main-image",
718
+ )
719
  start_button = gr.Button("Start Scraping")
720
  stop_button = gr.Button("Stop Scraping")
721
+ status_output = gr.Textbox(
722
+ label="Status Output", interactive=False, lines=2
723
+ )
724
+
725
  with gr.Column():
726
+ # Chat Interface
727
  chat_history = gr.Chatbot(label="Chat History")
728
+ with gr.Row():
729
+ message = gr.Textbox(label="Message", placeholder="Type your message here...")
730
+ system_message = gr.Textbox(
731
+ value="You are a helpful assistant.", label="System message"
732
+ )
733
+ max_tokens = gr.Slider(
734
+ minimum=1,
735
+ maximum=2048,
736
+ value=512,
737
+ step=1,
738
+ label="Max new tokens",
739
+ )
740
+ temperature = gr.Slider(
741
+ minimum=0.1,
742
+ maximum=4.0,
743
+ value=0.7,
744
+ step=0.1,
745
+ label="Temperature",
746
+ )
747
+ top_p = gr.Slider(
748
+ minimum=0.1,
749
+ maximum=1.0,
750
+ value=0.95,
751
+ step=0.05,
752
+ label="Top-p (nucleus sampling)",
753
+ )
754
+ response_box = gr.Textbox(label="Response", interactive=False, lines=2)
755
+
756
+ with gr.Row():
757
+ with gr.Column():
758
+ # CSV Display Controls
759
+ selected_url_csv = gr.Textbox(
760
+ label="Select URL for CSV Content",
761
+ placeholder="https://example.com",
762
+ )
763
+ csv_button = gr.Button("Display CSV Content")
764
+ csv_content_output = gr.Textbox(
765
+ label="CSV Content Output", interactive=False, lines=10
766
+ )
767
+
768
+ with gr.Column():
769
+ # RSS Feed Generation Controls
770
+ selected_url_rss = gr.Textbox(
771
+ label="Select URL for RSS Feed",
772
+ placeholder="https://example.com",
773
+ )
774
+ rss_button = gr.Button("Generate RSS Feed")
775
+ rss_output = gr.Textbox(
776
+ label="RSS Feed Output", interactive=False, lines=20
777
+ )
778
 
779
  # Connect buttons to their respective functions
780
  start_button.click(
781
+ fn=start_scraping,
782
+ inputs=[
783
+ storage_location,
784
+ urls,
785
+ scrape_interval,
786
+ content_type,
787
+ selector,
788
+ ],
789
+ outputs=status_output,
790
  )
 
791
 
792
+ stop_button.click(fn=stop_scraping, outputs=status_output)
 
793
 
794
+ csv_button.click(
795
+ fn=display_csv,
796
+ inputs=[storage_location, selected_url_csv],
797
+ outputs=csv_content_output,
798
+ )
 
799
 
 
 
 
 
 
800
  rss_button.click(
801
+ fn=generate_rss_feed,
802
+ inputs=[storage_location, selected_url_rss],
803
+ outputs=rss_output,
804
+ )
805
+
806
+ # Connect message submission to the chat interface
807
+ def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
808
+ if not message_input.strip():
809
+ return history, "Please enter a message."
810
+
811
+ response = respond(
812
+ message_input,
813
+ history,
814
+ system_msg,
815
+ max_toks,
816
+ temp,
817
+ top_p_val,
818
+ )
819
+ history.append((message_input, response))
820
+ return history, response
821
+
822
+ message.submit(
823
+ update_chat,
824
+ inputs=[
825
+ message,
826
+ chat_history,
827
+ system_message,
828
+ max_tokens,
829
+ temperature,
830
+ top_p,
831
+ ],
832
+ outputs=[chat_history, response_box],
833
  )
834
 
835
  return demo
836
 
837
+ # Initialize database on script start
838
+ initialize_database()
839
 
840
+ if __name__ == "__main__":
841
+ demo = create_interface()
842
+ demo.launch()
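
The updated app.py loads its MySQL credentials from a .env file via python-dotenv and reads DB_HOST, DB_USER, DB_PASSWORD and DB_NAME. Below is a minimal pre-flight sketch of that configuration check; it is illustrative only (the filename check_env.py is not part of the commit) and assumes a reachable MySQL server with the named database already created.

# check_env.py -- illustrative pre-flight check, not part of this commit
import os
import mysql.connector
from dotenv import load_dotenv

REQUIRED_VARS = ["DB_HOST", "DB_USER", "DB_PASSWORD", "DB_NAME"]

def main() -> None:
    load_dotenv()  # same call app.py makes to read .env
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        raise SystemExit(f"Missing environment variables: {', '.join(missing)}")

    # Mirrors the connection made by get_db_connection() in app.py
    connection = mysql.connector.connect(
        host=os.getenv("DB_HOST"),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        database=os.getenv("DB_NAME"),
    )
    try:
        if connection.is_connected():
            print("MySQL connection OK")
    finally:
        connection.close()

if __name__ == "__main__":
    main()

Once the check passes, running python app.py initializes the database tables and launches the Gradio interface returned by create_interface().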