Update app.py
app.py
CHANGED
@@ -1,457 +1,723 @@
Removed (old lines 1-457). Apart from the fragments listed below, the removed half of the hunk (the logging setup, constants, database pool, and helper functions) carries over to the new version with only blank-line and wrapping churn, so that code is shown once in the added section that follows.

[old lines 1-27: the original import block, truncated beyond recovery in the diff view]

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
import xml.etree.ElementTree as ET
import torch
import mysql.connector
from mysql.connector import pooling
import nltk
from huggingface_hub import login
from dotenv import load_dotenv

# Initialize NLTK resources (you may need to download these)
st.title("CEEMEESEEK with Model Selection")

def load_model(model_name):
    """
    Loads the specified model and tokenizer.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        # This should be inside the try block
        max_supported_length = 2048  # Get this from the model config
        openllama_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            truncation=True,
            max_length=max_supported_length,
            temperature=0.7,
            top_p=0.95,
            device=0 if torch.cuda.is_available() else -1,
        )
        logging.info(f"{model_name} loaded successfully.")
        return openllama_pipeline
    except Exception as e:
        logging.error(f"Error loading {model_name} model: {e}")
        return None

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")

login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)

if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
add_to_git_credential=True
login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)

# Load environment variables from .env file
load_dotenv()
Added (new lines 1-723):

import datetime
import os
import csv
import time
import hashlib
import logging
from collections import defaultdict
import mysql.connector
import threading
from urllib.parse import urlparse
import gradio as gr
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from huggingface_hub import InferenceClient, login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import random
import yaml
import torch
import pandas as pd
import xml.etree.ElementTree as ET
import re
import spacy
import unittest
from dotenv import load_dotenv
import nltk

# Initialize NLTK resources (you may need to download these)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Dictionary to store model loading functions
model_loaders = {
    "Falcon": lambda: load_model("tiiuae/falcon-7b"),
    "Flan-T5": lambda: load_model("google/flan-t5-xl"),
    "Flan-T5-Small": lambda: load_model("google/flan-t5-small")  # Add a smaller model
}

# Load environment variables from .env file
load_dotenv()

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")

login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Define constants
DEFAULT_FILE_PATH = "scraped_data"
PURPOSE = (
    "You monitor urls. You log what you observe. You seek any changes on them since your last observation. "
    "Anything new gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
)

# Global variables for task management
HISTORY = []
CURRENT_TASK = None
STOP_THREADS = False  # Flag to stop scraping threads

# Database Pooling Configuration
DB_POOL_NAME = "mypool"
DB_POOL_SIZE = 5  # Adjust based on expected load

try:
    dbconfig = {
        "host": os.getenv("DB_HOST"),
        "user": os.getenv("DB_USER"),
        "password": os.getenv("DB_PASSWORD"),
        "database": os.getenv("DB_NAME"),
    }
    connection_pool = mysql.connector.pooling.MySQLConnectionPool(
        pool_name=DB_POOL_NAME,
        pool_size=DB_POOL_SIZE,
        pool_reset_session=True,
        **dbconfig
    )
    logging.info("Database connection pool created successfully.")
except mysql.connector.Error as err:
    logging.warning(f"Database connection pool creation failed: {err}")
    connection_pool = None  # Will use CSV as fallback

# Function to get a database connection from the pool
def get_db_connection():
    """
    Retrieves a connection from the pool. Returns None if pool is not available.
    """
    if connection_pool:
        try:
            connection = connection_pool.get_connection()
            if connection.is_connected():
                return connection
        except mysql.connector.Error as err:
            logging.error(f"Error getting connection from pool: {err}")
    return None

# Initialize Database: Create tables and indexes
def initialize_database():
    """
    Initializes the database by creating necessary tables and indexes if they do not exist.
    """
    connection = get_db_connection()
    if connection is None:
        logging.info("Database initialization skipped. Using CSV storage.")
        return

    cursor = connection.cursor()
    try:
        # Create table for scraped data
        create_scraped_data_table = """
        CREATE TABLE IF NOT EXISTS scraped_data (
            id INT AUTO_INCREMENT PRIMARY KEY,
            url VARCHAR(255) NOT NULL,
            content_hash VARCHAR(64) NOT NULL,
            change_detected DATETIME NOT NULL
        )
        """
        cursor.execute(create_scraped_data_table)
        logging.info("Table 'scraped_data' is ready.")

        # Create indexes for performance
        create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
        create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
        cursor.execute(create_index_url)
        cursor.execute(create_index_change)
        logging.info("Indexes on 'url' and 'change_detected' columns created.")

        # Create table for action logs
        create_action_logs_table = """
        CREATE TABLE IF NOT EXISTS action_logs (
            id INT AUTO_INCREMENT PRIMARY KEY,
            action VARCHAR(255) NOT NULL,
            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
        )
        """
        cursor.execute(create_action_logs_table)
        logging.info("Table 'action_logs' is ready.")

    except mysql.connector.Error as err:
        logging.error(f"Error initializing database: {err}")
    finally:
        cursor.close()
        connection.close()
        logging.info("Database initialization complete.")

# Function to create WebDriver
def create_driver(options: Options) -> webdriver.Chrome:
    """
    Initializes and returns a Selenium Chrome WebDriver instance.
    """
    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )
        logging.info("ChromeDriver initialized successfully.")
        return driver
    except Exception as exception:
        logging.error(f"Error initializing ChromeDriver: {exception}")
        return None

# Function to log changes to CSV
def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
    """
    Logs the change to a CSV file in the storage_location.
    """
    try:
        os.makedirs(storage_location, exist_ok=True)
        csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
        file_exists = os.path.isfile(csv_file_path)

        with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
            fieldnames = ["date", "time", "url", "content_hash", "change"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if not file_exists:
                writer.writeheader()
            writer.writerow(
                {
                    "date": change_detected.split()[0],
                    "time": change_detected.split()[1],
                    "url": url,
                    "content_hash": content_hash,
                    "change": "Content changed",
                }
            )
        logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
    except Exception as e:
        logging.error(f"Error logging data to CSV: {e}")

# Function to get initial observation
def get_initial_observation(
    driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
) -> str:
    """
    Retrieves the initial content from the URL and returns its MD5 hash.
    """
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(2)  # Additional wait for dynamic content

        if content_type == "text":
            initial_content = driver.page_source
        elif content_type == "media":
            if selector:
                try:
                    elements = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
                    )
                    initial_content = [element.get_attribute("src") for element in elements]
                except TimeoutException:
                    logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
                    initial_content = []
            else:
                elements = driver.find_elements(By.TAG_NAME, "img")
                initial_content = [element.get_attribute("src") for element in elements]
        else:
            initial_content = driver.page_source

        initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
        logging.info(f"Initial hash for {url}: {initial_hash}")
        return initial_hash
    except Exception as exception:
        logging.error(f"Error accessing {url}: {exception}")
        return None

# Function to monitor URLs for changes
def monitor_urls(
    storage_location: str,
    urls: list,
    scrape_interval: int,
    content_type: str,
    selector: str = None,
    progress: gr.Progress = None,
):
    """
    Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
    """
    global HISTORY, STOP_THREADS
    previous_hashes = {url: "" for url in urls}

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = create_driver(options)
    if driver is None:
        logging.error("WebDriver could not be initialized. Exiting monitor.")
        return

    try:
        while not STOP_THREADS:
            for url in urls:
                if STOP_THREADS:
                    break
                try:
                    driver.get(url)
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                    time.sleep(2)  # Additional wait for dynamic content

                    if content_type == "text":
                        current_content = driver.page_source
                    elif content_type == "media":
                        if selector:
                            try:
                                elements = WebDriverWait(driver, 5).until(
                                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
                                )
                                current_content = [element.get_attribute("src") for element in elements]
                            except TimeoutException:
                                logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
                                current_content = []
                        else:
                            elements = driver.find_elements(By.TAG_NAME, "img")
                            current_content = [element.get_attribute("src") for element in elements]
                    else:
                        current_content = driver.page_source

                    current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
                    if current_hash != previous_hashes[url]:
                        previous_hashes[url] = current_hash
                        date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        HISTORY.append(f"Change detected at {url} on {date_time_str}")

                        # Attempt to log to database
                        connection = get_db_connection()
                        if connection:
                            try:
                                cursor = connection.cursor()
                                insert_query = """
                                INSERT INTO scraped_data (url, content_hash, change_detected)
                                VALUES (%s, %s, %s)
                                """
                                cursor.execute(insert_query, (url, current_hash, date_time_str))
                                connection.commit()
                                logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
                            except mysql.connector.Error as err:
                                logging.error(f"Error inserting data into database: {err}")
                                # Fallback to CSV
                                log_to_csv(storage_location, url, current_hash, date_time_str)
                            finally:
                                cursor.close()
                                connection.close()
                        else:
                            # Fallback to CSV
                            log_to_csv(storage_location, url, current_hash, date_time_str)

                    # Update progress
                    if progress:
                        progress(1)
                except (
                    NoSuchElementException,
                    StaleElementReferenceException,
                    TimeoutException,
                    Exception,
                ) as e:
                    logging.error(f"Error accessing {url}: {e}")
                    if progress:
                        progress(1)
            time.sleep(scrape_interval * 60)  # Wait for the next scrape interval
    finally:
        driver.quit()
        logging.info("ChromeDriver session ended.")

# Function to start scraping
def start_scraping(
    storage_location: str,
    urls: str,
    scrape_interval: int,
    content_type: str,
    selector: str = None,
    progress: gr.Progress = None,
) -> str:
    """
    Starts the scraping process in a separate thread with progress indication.
    """
    global CURRENT_TASK, HISTORY, STOP_THREADS

    if STOP_THREADS:
        STOP_THREADS = False  # Reset the flag if previously stopped

    url_list = [url.strip() for url in urls.split(",") if url.strip()]
    CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
    HISTORY.append(f"Task started: {CURRENT_TASK}")
    logging.info(f"Task started: {CURRENT_TASK}")

    # Initialize database tables
    initialize_database()

    # Log initial observations
    def log_initial_observations():
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        driver = create_driver(options)
        if driver is None:
            return

        for url in url_list:
            if STOP_THREADS:
                break
            try:
                initial_hash = get_initial_observation(driver, url, content_type, selector)
                if initial_hash:
                    date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    HISTORY.append(f"Initial observation at {url}: {initial_hash}")

                    # Attempt to log to database
                    connection = get_db_connection()
                    if connection:
                        try:
                            cursor = connection.cursor()
                            insert_query = """
                            INSERT INTO scraped_data (url, content_hash, change_detected)
                            VALUES (%s, %s, %s)
                            """
                            cursor.execute(insert_query, (url, initial_hash, date_time_str))
                            connection.commit()
                            logging.info(f"Initial observation logged for {url} in database.")
                        except mysql.connector.Error as err:
                            logging.error(f"Error inserting initial observation into database: {err}")
                            # Fallback to CSV
                            log_to_csv(storage_location, url, initial_hash, date_time_str)
                        finally:
                            cursor.close()
                            connection.close()
                    else:
                        # Fallback to CSV
                        log_to_csv(storage_location, url, initial_hash, date_time_str)
            except Exception as e:
                HISTORY.append(f"Error accessing {url}: {e}")
                logging.error(f"Error accessing {url}: {e}")
        driver.quit()

    # Start logging initial observations
    initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
    initial_thread.start()

    # Start the monitoring thread with progress
    monitor_thread = threading.Thread(
        target=monitor_urls,
        args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
        daemon=True,
    )
    monitor_thread.start()
    logging.info("Started scraping thread.")
    return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."

# Function to stop scraping
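The hunk ends at the "# Function to stop scraping" comment, so the body of that function falls outside the lines shown here. As a hedged, hypothetical sketch only (the actual implementation continues further down in app.py and may differ), a stop function built on the STOP_THREADS flag defined above could look like this:

def stop_scraping() -> str:
    """Hypothetical sketch: ask the monitoring threads to stop."""
    global STOP_THREADS
    STOP_THREADS = True  # monitor_urls() and log_initial_observations() check this flag on each pass
    HISTORY.append("Task stopped by user.")
    logging.info("Stop requested; scraping threads will exit on their next check.")
    return "Stopping scraping tasks..."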