Update app.py
app.py
CHANGED
@@ -1,457 +1,723 @@
Removed (old lines 1-457). Apart from the fragments listed below, the removed half of the hunk (the logging setup, constants, database pool, and helper functions) carries over to the new version with only blank-line and wrapping churn, so that code is shown once in the added section that follows.

[old lines 1-27: the original import block, truncated beyond recovery in the diff view]

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
import xml.etree.ElementTree as ET
import torch
import mysql.connector
from mysql.connector import pooling
import nltk
from huggingface_hub import login
from dotenv import load_dotenv

# Initialize NLTK resources (you may need to download these)
st.title("CEEMEESEEK with Model Selection")

def load_model(model_name):
    """
    Loads the specified model and tokenizer.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        # This should be inside the try block
        max_supported_length = 2048  # Get this from the model config
        openllama_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            truncation=True,
            max_length=max_supported_length,
            temperature=0.7,
            top_p=0.95,
            device=0 if torch.cuda.is_available() else -1,
        )
        logging.info(f"{model_name} loaded successfully.")
        return openllama_pipeline
    except Exception as e:
        logging.error(f"Error loading {model_name} model: {e}")
        return None

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")

login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)

if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
add_to_git_credential=True
login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)

# Load environment variables from .env file
load_dotenv()
Added (new lines 1-723):

import datetime
import os
import csv
import time
import hashlib
import logging
from collections import defaultdict
import mysql.connector
import threading
from urllib.parse import urlparse
import gradio as gr
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from huggingface_hub import InferenceClient, login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import random
import yaml
import torch
import pandas as pd
import xml.etree.ElementTree as ET
import re
import spacy
import unittest
from dotenv import load_dotenv
import nltk

# Initialize NLTK resources (you may need to download these)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Dictionary to store model loading functions
model_loaders = {
    "Falcon": lambda: load_model("tiiuae/falcon-7b"),
    "Flan-T5": lambda: load_model("google/flan-t5-xl"),
    "Flan-T5-Small": lambda: load_model("google/flan-t5-small")  # Add a smaller model
}

# Load environment variables from .env file
load_dotenv()

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")

login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Define constants
DEFAULT_FILE_PATH = "scraped_data"
PURPOSE = (
    "You monitor urls. You log what you observe. You seek any changes on them since your last observation. "
    "Anything new gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
)

# Global variables for task management
HISTORY = []
CURRENT_TASK = None
STOP_THREADS = False  # Flag to stop scraping threads

# Database Pooling Configuration
DB_POOL_NAME = "mypool"
DB_POOL_SIZE = 5  # Adjust based on expected load

try:
    dbconfig = {
        "host": os.getenv("DB_HOST"),
        "user": os.getenv("DB_USER"),
        "password": os.getenv("DB_PASSWORD"),
        "database": os.getenv("DB_NAME"),
    }
    connection_pool = mysql.connector.pooling.MySQLConnectionPool(
        pool_name=DB_POOL_NAME,
        pool_size=DB_POOL_SIZE,
        pool_reset_session=True,
        **dbconfig
    )
    logging.info("Database connection pool created successfully.")
except mysql.connector.Error as err:
    logging.warning(f"Database connection pool creation failed: {err}")
    connection_pool = None  # Will use CSV as fallback

# Function to get a database connection from the pool
def get_db_connection():
    """
    Retrieves a connection from the pool. Returns None if pool is not available.
    """
    if connection_pool:
        try:
            connection = connection_pool.get_connection()
            if connection.is_connected():
                return connection
        except mysql.connector.Error as err:
            logging.error(f"Error getting connection from pool: {err}")
    return None

# Initialize Database: Create tables and indexes
def initialize_database():
    """
    Initializes the database by creating necessary tables and indexes if they do not exist.
    """
    connection = get_db_connection()
    if connection is None:
        logging.info("Database initialization skipped. Using CSV storage.")
        return

    cursor = connection.cursor()
    try:
        # Create table for scraped data
        create_scraped_data_table = """
        CREATE TABLE IF NOT EXISTS scraped_data (
            id INT AUTO_INCREMENT PRIMARY KEY,
            url VARCHAR(255) NOT NULL,
            content_hash VARCHAR(64) NOT NULL,
            change_detected DATETIME NOT NULL
        )
        """
        cursor.execute(create_scraped_data_table)
        logging.info("Table 'scraped_data' is ready.")

        # Create indexes for performance
        create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
        create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
        cursor.execute(create_index_url)
        cursor.execute(create_index_change)
        logging.info("Indexes on 'url' and 'change_detected' columns created.")

        # Create table for action logs
        create_action_logs_table = """
        CREATE TABLE IF NOT EXISTS action_logs (
            id INT AUTO_INCREMENT PRIMARY KEY,
            action VARCHAR(255) NOT NULL,
            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
        )
        """
        cursor.execute(create_action_logs_table)
        logging.info("Table 'action_logs' is ready.")

    except mysql.connector.Error as err:
        logging.error(f"Error initializing database: {err}")
    finally:
        cursor.close()
        connection.close()
        logging.info("Database initialization complete.")

# Function to create WebDriver
def create_driver(options: Options) -> webdriver.Chrome:
    """
    Initializes and returns a Selenium Chrome WebDriver instance.
    """
    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )
        logging.info("ChromeDriver initialized successfully.")
        return driver
    except Exception as exception:
        logging.error(f"Error initializing ChromeDriver: {exception}")
        return None

# Function to log changes to CSV
def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
    """
    Logs the change to a CSV file in the storage_location.
    """
    try:
        os.makedirs(storage_location, exist_ok=True)
        csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
        file_exists = os.path.isfile(csv_file_path)

        with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
            fieldnames = ["date", "time", "url", "content_hash", "change"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if not file_exists:
                writer.writeheader()
            writer.writerow(
                {
                    "date": change_detected.split()[0],
                    "time": change_detected.split()[1],
                    "url": url,
                    "content_hash": content_hash,
                    "change": "Content changed",
                }
            )
        logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
    except Exception as e:
        logging.error(f"Error logging data to CSV: {e}")

# Function to get initial observation
def get_initial_observation(
    driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
) -> str:
    """
    Retrieves the initial content from the URL and returns its MD5 hash.
    """
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(2)  # Additional wait for dynamic content

        if content_type == "text":
            initial_content = driver.page_source
        elif content_type == "media":
            if selector:
                try:
                    elements = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
                    )
                    initial_content = [element.get_attribute("src") for element in elements]
                except TimeoutException:
                    logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
                    initial_content = []
            else:
                elements = driver.find_elements(By.TAG_NAME, "img")
                initial_content = [element.get_attribute("src") for element in elements]
        else:
            initial_content = driver.page_source

        initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
        logging.info(f"Initial hash for {url}: {initial_hash}")
        return initial_hash
    except Exception as exception:
        logging.error(f"Error accessing {url}: {exception}")
        return None

# Function to monitor URLs for changes
def monitor_urls(
    storage_location: str,
    urls: list,
    scrape_interval: int,
    content_type: str,
    selector: str = None,
    progress: gr.Progress = None,
):
    """
    Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
    """
    global HISTORY, STOP_THREADS
    previous_hashes = {url: "" for url in urls}

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = create_driver(options)
    if driver is None:
        logging.error("WebDriver could not be initialized. Exiting monitor.")
        return

    try:
        while not STOP_THREADS:
            for url in urls:
                if STOP_THREADS:
                    break
                try:
                    driver.get(url)
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                    time.sleep(2)  # Additional wait for dynamic content

                    if content_type == "text":
                        current_content = driver.page_source
                    elif content_type == "media":
                        if selector:
                            try:
                                elements = WebDriverWait(driver, 5).until(
                                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
                                )
                                current_content = [element.get_attribute("src") for element in elements]
                            except TimeoutException:
                                logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
                                current_content = []
                        else:
                            elements = driver.find_elements(By.TAG_NAME, "img")
                            current_content = [element.get_attribute("src") for element in elements]
                    else:
                        current_content = driver.page_source

                    current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
                    if current_hash != previous_hashes[url]:
                        previous_hashes[url] = current_hash
                        date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        HISTORY.append(f"Change detected at {url} on {date_time_str}")

                        # Attempt to log to database
                        connection = get_db_connection()
                        if connection:
                            try:
                                cursor = connection.cursor()
                                insert_query = """
                                INSERT INTO scraped_data (url, content_hash, change_detected)
                                VALUES (%s, %s, %s)
                                """
                                cursor.execute(insert_query, (url, current_hash, date_time_str))
                                connection.commit()
                                logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
                            except mysql.connector.Error as err:
                                logging.error(f"Error inserting data into database: {err}")
                                # Fallback to CSV
                                log_to_csv(storage_location, url, current_hash, date_time_str)
                            finally:
                                cursor.close()
                                connection.close()
                        else:
                            # Fallback to CSV
                            log_to_csv(storage_location, url, current_hash, date_time_str)

                    # Update progress
                    if progress:
                        progress(1)
                except (
                    NoSuchElementException,
                    StaleElementReferenceException,
                    TimeoutException,
                    Exception,
                ) as e:
                    logging.error(f"Error accessing {url}: {e}")
                    if progress:
                        progress(1)
            time.sleep(scrape_interval * 60)  # Wait for the next scrape interval
    finally:
        driver.quit()
        logging.info("ChromeDriver session ended.")

# Function to start scraping
def start_scraping(
    storage_location: str,
    urls: str,
    scrape_interval: int,
    content_type: str,
    selector: str = None,
    progress: gr.Progress = None,
) -> str:
    """
    Starts the scraping process in a separate thread with progress indication.
    """
    global CURRENT_TASK, HISTORY, STOP_THREADS

    if STOP_THREADS:
        STOP_THREADS = False  # Reset the flag if previously stopped

    url_list = [url.strip() for url in urls.split(",") if url.strip()]
    CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
    HISTORY.append(f"Task started: {CURRENT_TASK}")
    logging.info(f"Task started: {CURRENT_TASK}")

    # Initialize database tables
    initialize_database()

    # Log initial observations
    def log_initial_observations():
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        driver = create_driver(options)
        if driver is None:
            return

        for url in url_list:
            if STOP_THREADS:
                break
            try:
                initial_hash = get_initial_observation(driver, url, content_type, selector)
                if initial_hash:
                    date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    HISTORY.append(f"Initial observation at {url}: {initial_hash}")

                    # Attempt to log to database
                    connection = get_db_connection()
                    if connection:
                        try:
                            cursor = connection.cursor()
                            insert_query = """
                            INSERT INTO scraped_data (url, content_hash, change_detected)
                            VALUES (%s, %s, %s)
                            """
                            cursor.execute(insert_query, (url, initial_hash, date_time_str))
                            connection.commit()
                            logging.info(f"Initial observation logged for {url} in database.")
                        except mysql.connector.Error as err:
                            logging.error(f"Error inserting initial observation into database: {err}")
                            # Fallback to CSV
                            log_to_csv(storage_location, url, initial_hash, date_time_str)
                        finally:
                            cursor.close()
                            connection.close()
                    else:
                        # Fallback to CSV
                        log_to_csv(storage_location, url, initial_hash, date_time_str)
            except Exception as e:
                HISTORY.append(f"Error accessing {url}: {e}")
                logging.error(f"Error accessing {url}: {e}")
        driver.quit()

    # Start logging initial observations
    initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
    initial_thread.start()

    # Start the monitoring thread with progress
    monitor_thread = threading.Thread(
        target=monitor_urls,
        args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
        daemon=True,
    )
    monitor_thread.start()
    logging.info("Started scraping thread.")
    return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."

# Function to stop scraping
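The hunk ends at the "# Function to stop scraping" comment, so the body of that function falls outside the lines shown here. As a hedged, hypothetical sketch only (the actual implementation continues further down in app.py and may differ), a stop function built on the STOP_THREADS flag defined above could look like this:

def stop_scraping() -> str:
    """Hypothetical sketch: ask the monitoring threads to stop."""
    global STOP_THREADS
    STOP_THREADS = True  # monitor_urls() and log_initial_observations() check this flag on each pass
    HISTORY.append("Task stopped by user.")
    logging.info("Stop requested; scraping threads will exit on their next check.")
    return "Stopping scraping tasks..."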