Spaces:
Runtime error
Commit · ac4d529
Parent(s): 7321c1c
fix
app.py
CHANGED

@@ -1,4 +1,41 @@
-
+import os
+import time
+import hashlib
+import logging
+import streamlit as st
+import datetime
+import csv
+import threading
+import re
+import unittest
+from urllib.parse import urlparse
+import spaces
+
+import pandas as pd
+from dotenv import load_dotenv  # required by load_dotenv() below; missing from the original imports
+from huggingface_hub import login  # required by login() below; missing from the original imports
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import (
+    TimeoutException,
+    NoSuchElementException,
+    StaleElementReferenceException,
+)
+from webdriver_manager.chrome import ChromeDriverManager
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline  # AutoModelForCausalLM is what load_model() actually calls
+import gradio as gr
+import xml.etree.ElementTree as ET
+import torch
+import mysql.connector
+from mysql.connector import errorcode, pooling
+import nltk
+import importlib
+
+st.title("CEEMEESEEK with Model Selection")
 
 # Dictionary to store model loading functions
 model_loaders = {

@@ -12,8 +49,1146 @@ model_option = st.selectbox("Select a Model", list(model_loaders.keys()))
 # Load the selected model
 model = model_loaders[model_option]()
 
-
+# Load environment variables from .env file (this must run before the
+# os.getenv call below, which otherwise reads an unpopulated environment)
+load_dotenv()
+
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+if not HUGGINGFACE_TOKEN:
+    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
+login(token=HUGGINGFACE_TOKEN, add_to_git_credential=True)
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Define constants
+DEFAULT_FILE_PATH = "scraped_data"
+PURPOSE = (
+    "You go to Culvers sites, you continuously seek changes on them since your last observation. "
+    "Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
+)
+
+# Global variables for task management
+HISTORY = []
+CURRENT_TASK = None
+STOP_THREADS = False  # Flag to stop scraping threads
+
+# Database Pooling Configuration
+DB_POOL_NAME = "mypool"
+DB_POOL_SIZE = 5  # Adjust based on expected load
+
+try:
+    dbconfig = {
+        "host": os.getenv("DB_HOST"),
+        "user": os.getenv("DB_USER"),
+        "password": os.getenv("DB_PASSWORD"),
+        "database": os.getenv("DB_NAME"),
+    }
+    connection_pool = mysql.connector.pooling.MySQLConnectionPool(
+        pool_name=DB_POOL_NAME,
+        pool_size=DB_POOL_SIZE,
+        pool_reset_session=True,
+        **dbconfig,
+    )
+    logging.info("Database connection pool created successfully.")
+except mysql.connector.Error as err:
+    logging.warning(f"Database connection pool creation failed: {err}")
+    connection_pool = None  # Will use CSV as fallback
+
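
The pool above is configured entirely from environment variables, so the Space needs either repository secrets or a local .env file for load_dotenv() to pick up. A minimal sketch of the expected file — every value here is a placeholder, not taken from the commit:

    HUGGINGFACE_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
    DB_HOST=localhost
    DB_USER=scraper
    DB_PASSWORD=change-me
    DB_NAME=scraped_data

If the variables are absent, pool creation fails and the code degrades to CSV logging, which is the intended fallback path.
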
+# Function to get a database connection from the pool
+def get_db_connection():
+    """
+    Retrieves a connection from the pool. Returns None if pool is not available.
+    """
+    if connection_pool:
+        try:
+            connection = connection_pool.get_connection()
+            if connection.is_connected():
+                return connection
+        except mysql.connector.Error as err:
+            logging.error(f"Error getting connection from pool: {err}")
+    return None
+
+# Initialize Database: Create tables and indexes
+def initialize_database():
+    """
+    Initializes the database by creating necessary tables and indexes if they do not exist.
+    """
+    connection = get_db_connection()
+    if connection is None:
+        logging.info("Database initialization skipped. Using CSV storage.")
+        return
+
+    cursor = connection.cursor()
+    try:
+        # Create table for scraped data
+        create_scraped_data_table = """
+        CREATE TABLE IF NOT EXISTS scraped_data (
+            id INT AUTO_INCREMENT PRIMARY KEY,
+            url VARCHAR(255) NOT NULL,
+            content_hash VARCHAR(64) NOT NULL,
+            change_detected DATETIME NOT NULL
+        )
+        """
+        cursor.execute(create_scraped_data_table)
+        logging.info("Table 'scraped_data' is ready.")
+
+        # Create indexes for performance. Caution: MySQL (unlike MariaDB) does
+        # not accept IF NOT EXISTS on CREATE INDEX, so a second run of this
+        # function errors here — see the note after this function.
+        create_index_url = "CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url)"
+        create_index_change = "CREATE INDEX IF NOT EXISTS idx_change_detected ON scraped_data(change_detected)"
+        cursor.execute(create_index_url)
+        cursor.execute(create_index_change)
+        logging.info("Indexes on 'url' and 'change_detected' columns created.")
+
+        # Create table for action logs
+        create_action_logs_table = """
+        CREATE TABLE IF NOT EXISTS action_logs (
+            id INT AUTO_INCREMENT PRIMARY KEY,
+            action VARCHAR(255) NOT NULL,
+            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
+        )
+        """
+        cursor.execute(create_action_logs_table)
+        logging.info("Table 'action_logs' is ready.")
+
+    except mysql.connector.Error as err:
+        logging.error(f"Error initializing database: {err}")
+    finally:
+        cursor.close()
+        connection.close()
+        logging.info("Database initialization complete.")
+
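
If the target server really is MySQL rather than MariaDB, the IF NOT EXISTS clause above is a syntax error on CREATE INDEX. A defensive sketch that checks information_schema first (table and index names match those used above; other schemas untested):

    def ensure_index(cursor, table: str, index: str, column: str):
        """Create an index only if it does not already exist (MySQL-safe)."""
        cursor.execute(
            "SELECT COUNT(*) FROM information_schema.statistics "
            "WHERE table_schema = DATABASE() AND table_name = %s AND index_name = %s",
            (table, index),
        )
        if cursor.fetchone()[0] == 0:
            cursor.execute(f"CREATE INDEX {index} ON {table}({column})")

    # Usage inside initialize_database():
    #   ensure_index(cursor, "scraped_data", "idx_url", "url")
    #   ensure_index(cursor, "scraped_data", "idx_change_detected", "change_detected")
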
+# Function to create WebDriver
+def create_driver(options: Options) -> webdriver.Chrome:
+    """
+    Initializes and returns a Selenium Chrome WebDriver instance.
+    """
+    try:
+        driver = webdriver.Chrome(
+            service=Service(ChromeDriverManager().install()), options=options
+        )
+        logging.info("ChromeDriver initialized successfully.")
+        return driver
+    except Exception as exception:
+        logging.error(f"Error initializing ChromeDriver: {exception}")
+        return None
+
+# Function to log changes to CSV
+def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
+    """
+    Logs the change to a CSV file in the storage_location.
+    """
+    try:
+        os.makedirs(storage_location, exist_ok=True)
+        csv_file_path = os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv")
+        file_exists = os.path.isfile(csv_file_path)
+
+        with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
+            fieldnames = ["date", "time", "url", "content_hash", "change"]
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            if not file_exists:
+                writer.writeheader()
+            writer.writerow(
+                {
+                    "date": change_detected.split()[0],
+                    "time": change_detected.split()[1],
+                    "url": url,
+                    "content_hash": content_hash,
+                    "change": "Content changed",
+                }
+            )
+        logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
+    except Exception as e:
+        logging.error(f"Error logging data to CSV: {e}")
+
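
For reference, a file produced by log_to_csv for https://www.culvers.com would be named www.culvers.com_changes.csv and contain rows like this (hash and timestamp invented for illustration):

    date,time,url,content_hash,change
    2024-01-15,14:32:07,https://www.culvers.com,9b74c9897bac770ffc029102a200c5de,Content changed
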
+# Function to get initial observation
+def get_initial_observation(
+    driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
+) -> str:
+    """
+    Retrieves the initial content from the URL and returns its MD5 hash.
+    """
+    try:
+        driver.get(url)
+        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+        time.sleep(2)  # Additional wait for dynamic content
+
+        if content_type == "text":
+            initial_content = driver.page_source
+        elif content_type == "media":
+            if selector:
+                try:
+                    elements = WebDriverWait(driver, 5).until(
+                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
+                    )
+                    initial_content = [element.get_attribute("src") for element in elements]
+                except TimeoutException:
+                    logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
+                    initial_content = []
+            else:
+                elements = driver.find_elements(By.TAG_NAME, "img")
+                initial_content = [element.get_attribute("src") for element in elements]
+        else:
+            initial_content = driver.page_source
+
+        initial_hash = hashlib.md5(str(initial_content).encode("utf-8")).hexdigest()
+        logging.info(f"Initial hash for {url}: {initial_hash}")
+        return initial_hash
+    except Exception as exception:
+        logging.error(f"Error accessing {url}: {exception}")
+        return None
+
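
One caveat with hashing driver.page_source: any rotating markup (CSRF tokens, timestamps, ad tags) changes the MD5 even when nothing visible changed, so every poll can register as a "change". A sketch of a stricter alternative for the content_type == "text" branch — hashing only the rendered body text; this is a deliberate behaviour change, not what the commit does:

    body_text = driver.find_element(By.TAG_NAME, "body").text
    initial_hash = hashlib.md5(body_text.encode("utf-8")).hexdigest()
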
+# Function to monitor URLs for changes
+def monitor_urls(
+    storage_location: str,
+    urls: list,
+    scrape_interval: int,
+    content_type: str,
+    selector: str = None,
+    progress: gr.Progress = None
+):
+    """
+    Monitors the specified URLs for changes and logs any detected changes to the database or CSV.
+    """
+    global HISTORY, STOP_THREADS
+    previous_hashes = {url: "" for url in urls}
+
+    options = Options()
+    options.add_argument("--headless")
+    options.add_argument("--no-sandbox")
+    options.add_argument("--disable-dev-shm-usage")
+
+    driver = create_driver(options)
+    if driver is None:
+        logging.error("WebDriver could not be initialized. Exiting monitor.")
+        return
+
+    try:
+        while not STOP_THREADS:
+            for url in urls:
+                if STOP_THREADS:
+                    break
+                try:
+                    driver.get(url)
+                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+                    time.sleep(2)  # Additional wait for dynamic content
+
+                    if content_type == "text":
+                        current_content = driver.page_source
+                    elif content_type == "media":
+                        if selector:
+                            try:
+                                elements = WebDriverWait(driver, 5).until(
+                                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
+                                )
+                                current_content = [element.get_attribute("src") for element in elements]
+                            except TimeoutException:
+                                logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
+                                current_content = []
+                        else:
+                            elements = driver.find_elements(By.TAG_NAME, "img")
+                            current_content = [element.get_attribute("src") for element in elements]
+                    else:
+                        current_content = driver.page_source
+
+                    current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
+                    if current_hash != previous_hashes[url]:
+                        previous_hashes[url] = current_hash
+                        date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                        HISTORY.append(f"Change detected at {url} on {date_time_str}")
+
+                        # Attempt to log to database
+                        connection = get_db_connection()
+                        if connection:
+                            try:
+                                cursor = connection.cursor()
+                                insert_query = """
+                                INSERT INTO scraped_data (url, content_hash, change_detected)
+                                VALUES (%s, %s, %s)
+                                """
+                                cursor.execute(insert_query, (url, current_hash, date_time_str))
+                                connection.commit()
+                                logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
+                            except mysql.connector.Error as err:
+                                logging.error(f"Error inserting data into database: {err}")
+                                # Fallback to CSV
+                                log_to_csv(storage_location, url, current_hash, date_time_str)
+                            finally:
+                                cursor.close()
+                                connection.close()
+                        else:
+                            # Fallback to CSV
+                            log_to_csv(storage_location, url, current_hash, date_time_str)
+
+                        # Update progress
+                        if progress:
+                            progress(1)
+                # The original tuple (NoSuchElementException, StaleElementReferenceException,
+                # TimeoutException, Exception) was redundant: Exception already subsumes the rest.
+                except Exception as e:
+                    logging.error(f"Error accessing {url}: {e}")
+                    if progress:
+                        progress(1)
+            time.sleep(scrape_interval * 60)  # Wait for the next scrape interval
+    finally:
+        driver.quit()
+        logging.info("ChromeDriver session ended.")
+
+# Function to start scraping
+def start_scraping(
+    storage_location: str,
+    urls: str,
+    scrape_interval: int,
+    content_type: str,
+    selector: str = None,
+    progress: gr.Progress = None
+) -> str:
+    """
+    Starts the scraping process in a separate thread with progress indication.
+    """
+    global CURRENT_TASK, HISTORY, STOP_THREADS
+
+    if STOP_THREADS:
+        STOP_THREADS = False  # Reset the flag if previously stopped
+
+    url_list = [url.strip() for url in urls.split(",") if url.strip()]
+    CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
+    HISTORY.append(f"Task started: {CURRENT_TASK}")
+    logging.info(f"Task started: {CURRENT_TASK}")
+
+    # Initialize database tables
+    initialize_database()
 
+    # Log initial observations
+    def log_initial_observations():
+        options = Options()
+        options.add_argument("--headless")
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+
+        driver = create_driver(options)
+        if driver is None:
+            return
+
+        for url in url_list:
+            if STOP_THREADS:
+                break
+            try:
+                initial_hash = get_initial_observation(driver, url, content_type, selector)
+                if initial_hash:
+                    date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    HISTORY.append(f"Initial observation at {url}: {initial_hash}")
+
+                    # Attempt to log to database
+                    connection = get_db_connection()
+                    if connection:
+                        try:
+                            cursor = connection.cursor()
+                            insert_query = """
+                            INSERT INTO scraped_data (url, content_hash, change_detected)
+                            VALUES (%s, %s, %s)
+                            """
+                            cursor.execute(insert_query, (url, initial_hash, date_time_str))
+                            connection.commit()
+                            logging.info(f"Initial observation logged for {url} in database.")
+                        except mysql.connector.Error as err:
+                            logging.error(f"Error inserting initial observation into database: {err}")
+                            # Fallback to CSV
+                            log_to_csv(storage_location, url, initial_hash, date_time_str)
+                        finally:
+                            cursor.close()
+                            connection.close()
+                    else:
+                        # Fallback to CSV
+                        log_to_csv(storage_location, url, initial_hash, date_time_str)
+            except Exception as e:
+                HISTORY.append(f"Error accessing {url}: {e}")
+                logging.error(f"Error accessing {url}: {e}")
+        driver.quit()  # quit once after the loop, not per URL
+
+    # Start logging initial observations
+    initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
+    initial_thread.start()
+
+    # Start the monitoring thread with progress
+    monitor_thread = threading.Thread(
+        target=monitor_urls,
+        args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
+        daemon=True,
+    )
+    monitor_thread.start()
+    logging.info("Started scraping thread.")
+    return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."
+
+# Function to stop scraping
+def stop_scraping() -> str:
+    """
+    Stops all ongoing scraping threads.
+    """
+    global STOP_THREADS
+    STOP_THREADS = True
+    HISTORY.append("Scraping stopped by user.")
+    logging.info("Scraping stop signal sent.")
+    return "Scraping has been stopped."
+
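
The STOP_THREADS bool works, but the monitor thread only notices it after its time.sleep(scrape_interval * 60) finishes, so "Stop Scraping" can take up to a full interval to bite. A sketch of the same start/stop pair built on threading.Event, which interrupts the wait immediately (the names here are mine, not the commit's):

    stop_event = threading.Event()

    # in monitor_urls, instead of "while not STOP_THREADS" plus time.sleep(...):
    while not stop_event.is_set():
        ...  # one scrape pass over the URLs
        if stop_event.wait(timeout=scrape_interval * 60):
            break  # wait() returns True as soon as stop_event.set() is called

    def stop_scraping() -> str:
        stop_event.set()
        return "Scraping has been stopped."
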
+# Function to display CSV content from MySQL or CSV
+def display_csv(storage_location: str, url: str) -> str:
+    """
+    Fetches and returns the scraped data for a given URL from the MySQL database or CSV.
+    """
+    try:
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
+                cursor.execute(query, (url,))
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No data available for the selected URL."
+
+                df = pd.DataFrame(results)
+                cursor.close()
+                connection.close()
+                return df.to_string(index=False)
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Fetching data from CSV.")
+
+        # Fallback to CSV
+        hostname = urlparse(url).hostname
+        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
+        if os.path.exists(csv_path):
+            df = pd.read_csv(csv_path)
+            return df.to_string(index=False)
+        else:
+            return "No data available."
+
+    except Exception as e:
+        logging.error(f"Error fetching data for {url}: {e}")
+        return f"Error fetching data for {url}: {e}"
+
+# Function to generate RSS feed from MySQL or CSV data
+def generate_rss_feed(storage_location: str, url: str) -> str:
+    """
+    Generates an RSS feed for the latest changes detected on a given URL from the MySQL database or CSV.
+    """
+    try:
+        connection = get_db_connection()
+        rss_feed = ""
+
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
+                cursor.execute(query, (url,))
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No changes detected to include in RSS feed."
+
+                # Create the root RSS element
+                rss = ET.Element("rss", version="2.0")
+                channel = ET.SubElement(rss, "channel")
+
+                # Add channel elements
+                title = ET.SubElement(channel, "title")
+                title.text = f"RSS Feed for {urlparse(url).hostname}"
+
+                link = ET.SubElement(channel, "link")
+                link.text = url
+
+                description = ET.SubElement(channel, "description")
+                description.text = "Recent changes detected on the website."
+
+                # Add items to the feed
+                for row in results:
+                    item = ET.SubElement(channel, "item")
+
+                    item_title = ET.SubElement(item, "title")
+                    item_title.text = f"Change detected at {row['url']}"
+
+                    item_link = ET.SubElement(item, "link")
+                    item_link.text = row["url"]
+
+                    item_description = ET.SubElement(item, "description")
+                    item_description.text = f"Content changed on {row['change_detected']}"
+
+                    pub_date = ET.SubElement(item, "pubDate")
+                    pub_date.text = datetime.datetime.strptime(
+                        str(row['change_detected']), "%Y-%m-%d %H:%M:%S"
+                    ).strftime("%a, %d %b %Y %H:%M:%S +0000")
+
+                # Generate the XML string
+                rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
+                cursor.close()
+                connection.close()
+                return rss_feed
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Generating RSS feed from CSV.")
+
+        # Fallback to CSV
+        hostname = urlparse(url).hostname
+        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
+        if os.path.exists(csv_path):
+            df = pd.read_csv(csv_path).tail(10)
+            if df.empty:
+                return "No changes detected to include in RSS feed."
+
+            # Create the root RSS element
+            rss = ET.Element("rss", version="2.0")
+            channel = ET.SubElement(rss, "channel")
+
+            # Add channel elements
+            title = ET.SubElement(channel, "title")
+            title.text = f"RSS Feed for {hostname}"
+
+            link = ET.SubElement(channel, "link")
+            link.text = url
+
+            description = ET.SubElement(channel, "description")
+            description.text = "Recent changes detected on the website."
+
+            # Add items to the feed
+            for _, row in df.iterrows():
+                item = ET.SubElement(channel, "item")
+
+                item_title = ET.SubElement(item, "title")
+                item_title.text = f"Change detected at {row['url']}"
+
+                item_link = ET.SubElement(item, "link")
+                item_link.text = row["url"]
+
+                item_description = ET.SubElement(item, "description")
+                item_description.text = f"Content changed on {row['date']} at {row['time']}"
+
+                pub_date = ET.SubElement(item, "pubDate")
+                pub_date.text = datetime.datetime.strptime(
+                    f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S"
+                ).strftime("%a, %d %b %Y %H:%M:%S +0000")
+
+            # Generate the XML string
+            rss_feed = ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")
+            return rss_feed
+        else:
+            return "No data available."
+
+    except Exception as e:
+        logging.error(f"Error generating RSS feed for {url}: {e}")
+        return f"Error generating RSS feed for {url}: {e}"
+
+# Function to parse user commands (regex-based; the stray call to an undefined
+# spaCy `nlp` object, which would have raised NameError, is gone)
+def parse_command(message: str) -> tuple:
+    """
+    Parses the user message with regular expressions to identify if it contains a command.
+    Returns the command and its parameters if found, else (None, {}).
+    """
+    command = None
+    params = {}
+
+    # Define command patterns
+    if "filter" in message.lower():
+        # Example: "Filter apples, oranges in column Description"
+        match = re.search(r"filter\s+([\w\s,]+)\s+in\s+column\s+(\w+)", message, re.IGNORECASE)
+        if match:
+            words = [word.strip() for word in match.group(1).split(",")]
+            column = match.group(2)
+            command = "filter"
+            params = {"words": words, "column": column}
+
+    elif "sort" in message.lower():
+        # Example: "Sort Price ascending"
+        match = re.search(r"sort\s+(\w+)\s+(ascending|descending)", message, re.IGNORECASE)
+        if match:
+            column = match.group(1)
+            order = match.group(2)
+            command = "sort"
+            params = {"column": column, "order": order}
+
+    elif "export to csv as" in message.lower():
+        # Example: "Export to CSV as filtered_data.csv"
+        match = re.search(r"export\s+to\s+csv\s+as\s+([\w\-]+\.csv)", message, re.IGNORECASE)
+        if match:
+            filename = match.group(1)
+            command = "export"
+            params = {"filename": filename}
+
+    elif "log action" in message.lower():
+        # Example: "Log action Filtered data for specific fruits"
+        match = re.search(r"log\s+action\s+(.+)", message, re.IGNORECASE)
+        if match:
+            action = match.group(1)
+            command = "log"
+            params = {"action": action}
+
+    return command, params
+
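
A quick sanity check of the grammar parse_command accepts — each message maps to a (command, params) tuple consumed by execute_command below:

    parse_command("Filter apples, oranges in column Description")
    # -> ("filter", {"words": ["apples", "oranges"], "column": "Description"})
    parse_command("Sort Price ascending")
    # -> ("sort", {"column": "Price", "order": "ascending"})
    parse_command("Export to CSV as filtered_data.csv")
    # -> ("export", {"filename": "filtered_data.csv"})
    parse_command("hello")
    # -> (None, {})
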
+# Function to execute parsed commands
+def execute_command(command: str, params: dict) -> str:
+    """
+    Executes the corresponding function based on the command and parameters.
+    """
+    if command == "filter":
+        words = params["words"]
+        column = params["column"]
+        return filter_data(column, words)
+    elif command == "sort":
+        column = params["column"]
+        order = params["order"]
+        return sort_data(column, order)
+    elif command == "export":
+        filename = params["filename"]
+        return export_csv(filename)
+    elif command == "log":
+        action = params["action"]
+        return log_action(action)
+    else:
+        return "Unknown command."
+
+# Data Manipulation Functions
+def filter_data(column: str, words: list) -> str:
+    """
+    Filters the scraped data to include only rows where the specified column contains the given words.
+    Saves the filtered data to a new CSV file.
+    """
+    try:
+        storage_location = DEFAULT_FILE_PATH
+
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                # Fetch all data
+                query = "SELECT * FROM scraped_data"
+                cursor.execute(query)
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No data available to filter."
+
+                df = pd.DataFrame(results)
+                # Create a regex pattern to match any of the words
+                pattern = '|'.join(words)
+                if column not in df.columns:
+                    return f"Column '{column}' does not exist in the data."
+
+                filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
+
+                if filtered_df.empty:
+                    return f"No records found with words {words} in column '{column}'."
+
+                # Save the filtered data to a new CSV
+                timestamp = int(time.time())
+                filtered_csv = os.path.join(storage_location, f"filtered_data_{timestamp}.csv")
+                filtered_df.to_csv(filtered_csv, index=False)
+                logging.info(f"Data filtered on column '{column}' for words {words}.")
+                return f"Data filtered and saved to {filtered_csv}."
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Filtering data from CSV.")
+
+        # Fallback to CSV. (Note: the writers in this module produce
+        # *_filtered_<ts>.csv and *_sorted_<order>_<ts>.csv names, which the
+        # *_filtered.csv / *_sorted_asc.csv / *_sorted_desc.csv suffix filters
+        # here never match; in practice only *_changes.csv files are picked up.)
+        csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
+        if not csv_files:
+            return "No CSV files found to filter."
+
+        # Assume the latest CSV is the target
+        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
+        df = pd.read_csv(latest_csv)
+
+        if column not in df.columns:
+            return f"Column '{column}' does not exist in the data."
+
+        filtered_df = df[df[column].astype(str).str.contains('|'.join(words), case=False, na=False)]
+
+        if filtered_df.empty:
+            return f"No records found with words {words} in column '{column}'."
+
+        # Save the filtered data to a new CSV
+        timestamp = int(time.time())
+        filtered_csv = latest_csv.replace(".csv", f"_filtered_{timestamp}.csv")
+        filtered_df.to_csv(filtered_csv, index=False)
+        logging.info(f"Data filtered on column '{column}' for words {words}.")
+        return f"Data filtered and saved to {filtered_csv}."
+    except Exception as e:
+        logging.error(f"Error filtering data: {e}")
+        return f"Error filtering data: {e}"
+
+def sort_data(column: str, order: str) -> str:
+    """
+    Sorts the scraped data based on the specified column and order.
+    Saves the sorted data to a new CSV file.
+    """
+    try:
+        storage_location = DEFAULT_FILE_PATH
+
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                # Fetch all data
+                query = "SELECT * FROM scraped_data"
+                cursor.execute(query)
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No data available to sort."
+
+                df = pd.DataFrame(results)
+                if column not in df.columns:
+                    return f"Column '{column}' does not exist in the data."
+
+                ascending = order.lower() == "ascending"
+                sorted_df = df.sort_values(by=column, ascending=ascending)
+
+                # Save the sorted data to a new CSV
+                timestamp = int(time.time())
+                sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{timestamp}.csv")
+                sorted_df.to_csv(sorted_csv, index=False)
+                logging.info(f"Data sorted on column '{column}' in {order} order.")
+                return f"Data sorted and saved to {sorted_csv}."
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Sorting data from CSV.")
+
+        # Fallback to CSV
+        csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
+        if not csv_files:
+            return "No CSV files found to sort."
+
+        # Assume the latest CSV is the target
+        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
+        df = pd.read_csv(latest_csv)
+
+        if column not in df.columns:
+            return f"Column '{column}' does not exist in the data."
+
+        ascending = order.lower() == "ascending"
+        sorted_df = df.sort_values(by=column, ascending=ascending)
+
+        # Save the sorted data to a new CSV
+        timestamp = int(time.time())
+        sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{timestamp}.csv")
+        sorted_df.to_csv(sorted_csv, index=False)
+        logging.info(f"Data sorted on column '{column}' in {order} order.")
+        return f"Data sorted and saved to {sorted_csv}."
+    except Exception as e:
+        logging.error(f"Error sorting data: {e}")
+        return f"Error sorting data: {e}"
+
+def export_csv(filename: str) -> str:
+    """
+    Exports the latest scraped data to a specified CSV filename.
+    """
+    try:
+        storage_location = DEFAULT_FILE_PATH
+
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                # Fetch all data
+                query = "SELECT * FROM scraped_data"
+                cursor.execute(query)
+                results = cursor.fetchall()
+
+                if not results:
+                    return "No data available to export."
+
+                df = pd.DataFrame(results)
+                export_path = os.path.join(storage_location, filename)
+                df.to_csv(export_path, index=False)
+                logging.info(f"Data exported to {export_path}.")
+                return f"Data exported to {export_path}."
+            except mysql.connector.Error as err:
+                logging.error(f"Error exporting data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Exporting data from CSV.")
+
+        # Fallback to CSV
+        csv_files = [f for f in os.listdir(storage_location) if f.endswith("_changes.csv") or f.endswith("_filtered.csv") or f.endswith("_sorted_asc.csv") or f.endswith("_sorted_desc.csv")]
+        if not csv_files:
+            return "No CSV files found to export."
+
+        # Assume the latest CSV is the target
+        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
+        df = pd.read_csv(latest_csv)
+        export_path = os.path.join(storage_location, filename)
+        df.to_csv(export_path, index=False)
+        logging.info(f"Data exported to {export_path}.")
+        return f"Data exported to {export_path}."
+    except Exception as e:
+        logging.error(f"Error exporting CSV: {e}")
+        return f"Error exporting CSV: {e}"
+
+def log_action(action: str) -> str:
+    """
+    Logs a custom action message to the MySQL database or CSV.
+    """
+    try:
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor()
+                insert_query = """
+                INSERT INTO action_logs (action)
+                VALUES (%s)
+                """
+                cursor.execute(insert_query, (action,))
+                connection.commit()
+                logging.info(f"Action logged in database: {action}")
+                cursor.close()
+                connection.close()
+                return f"Action logged: {action}"
+            except mysql.connector.Error as err:
+                logging.error(f"Error logging action to database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Logging action to CSV.")
+
+        # Fallback to CSV
+        storage_location = DEFAULT_FILE_PATH
+        try:
+            os.makedirs(storage_location, exist_ok=True)
+            csv_file_path = os.path.join(storage_location, "action_logs.csv")
+            file_exists = os.path.isfile(csv_file_path)
+
+            with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
+                fieldnames = ["timestamp", "action"]
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                if not file_exists:
+                    writer.writeheader()
+                writer.writerow(
+                    {
+                        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                        "action": action,
+                    }
+                )
+            logging.info(f"Action logged to CSV: {action}")
+            return f"Action logged: {action}"
+        except Exception as e:
+            logging.error(f"Error logging action to CSV: {e}")
+            return f"Error logging action: {e}"
+    except Exception as e:
+        logging.error(f"Error logging action: {e}")
+        return f"Error logging action: {e}"
+
+# Function to get the latest CSV file based on modification time
+def get_latest_csv() -> str:
+    """
+    Retrieves the latest CSV file from the storage directory based on modification time.
+    """
+    try:
+        # Note: hardcoded path; every other function uses DEFAULT_FILE_PATH ("scraped_data").
+        storage_location = "/home/users/app/scraped_data"
+        csv_files = [f for f in os.listdir(storage_location) if f.endswith(".csv")]
+        if not csv_files:
+            return None
+
+        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
+        return latest_csv
+    except Exception as e:
+        logging.error(f"Error retrieving latest CSV: {e}")
+        return None
+
+def respond(
+    message: str,
+    history: list,
+    system_message: str,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+) -> str:
+    """
+    Generates a response using OpenLlamaForCausalLM.
+    """
+    try:
+        # Check if the message contains a command
+        command, params = parse_command(message)
+        if command:
+            # Execute the corresponding function
+            response = execute_command(command, params)
+        else:
+            # Generate a regular response using OpenLlama.
+            # Requires load_model() to have populated the module-level
+            # openllama_pipeline; it is never assigned anywhere else.
+            prompt = (
+                f"System: {system_message}\n"
+                f"History: {history}\n"
+                f"User: {message}\n"
+                f"Assistant:"
+            )
+            response = openllama_pipeline(
+                prompt,
+                max_length=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+            )[0]["generated_text"]
+
+        # Extract the assistant's reply
+        response = response.split("Assistant:")[-1].strip()
+        return response
+    except Exception as e:
+        logging.error(f"Error generating response: {e}")
+        return "Error generating response."
+
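
The UI labels the max_tokens slider "Max new tokens", but the pipeline call above passes max_length, which in transformers caps prompt plus completion combined — a long history can leave no room for a reply. A sketch of the call with the generation parameter that actually means "new tokens only":

    response = openllama_pipeline(
        prompt,
        max_new_tokens=max_tokens,  # counts generated tokens only, prompt excluded
        temperature=temperature,
        top_p=top_p,
    )[0]["generated_text"]
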
+# Define the Gradio interface
+def create_interface() -> gr.Blocks:
+    """
+    Defines and returns the Gradio interface for the application.
+    """
+    with gr.Blocks() as demo:
+        gr.Markdown("# All-in-One Scraper, Database, and RSS Feeder")
+
+        with gr.Row():
+            with gr.Column():
+                # Scraping Controls
+                storage_location = gr.Textbox(
+                    value=DEFAULT_FILE_PATH, label="Storage Location"
+                )
+                urls = gr.Textbox(
+                    label="URLs (comma separated)",
+                    placeholder="https://example.com, https://anotherexample.com",
+                )
+                scrape_interval = gr.Slider(
+                    minimum=1,
+                    maximum=60,
+                    value=5,
+                    step=1,
+                    label="Scrape Interval (minutes)",
+                )
+                content_type = gr.Radio(
+                    choices=["text", "media", "both"],
+                    value="text",
+                    label="Content Type",
+                )
+                selector = gr.Textbox(
+                    label="CSS Selector for Media (Optional)",
+                    placeholder="e.g., img.main-image",
+                )
+                start_button = gr.Button("Start Scraping")
+                stop_button = gr.Button("Stop Scraping")
+                status_output = gr.Textbox(
+                    label="Status Output", interactive=False, lines=2
+                )
+
+            with gr.Column():
+                # Chat Interface
+                chat_history = gr.Chatbot(label="Chat History")
+                with gr.Row():
+                    message = gr.Textbox(label="Message", placeholder="Type your message here...")
+                    system_message = gr.Textbox(
+                        value="You are a helpful assistant.", label="System message"
+                    )
+                    max_tokens = gr.Slider(
+                        minimum=1,
+                        maximum=2048,
+                        value=512,
+                        step=1,
+                        label="Max new tokens",
+                    )
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=4.0,
+                        value=0.7,
+                        step=0.1,
+                        label="Temperature",
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.95,
+                        step=0.05,
+                        label="Top-p (nucleus sampling)",
+                    )
+                response_box = gr.Textbox(label="Response", interactive=False, lines=2)
+
+        with gr.Row():
+            with gr.Column():
+                # CSV Display Controls
+                selected_url_csv = gr.Textbox(
+                    label="Select URL for CSV Content",
+                    placeholder="https://example.com",
+                )
+                csv_button = gr.Button("Display CSV Content")
+                csv_content_output = gr.Textbox(
+                    label="CSV Content Output", interactive=False, lines=10
+                )
+
+            with gr.Column():
+                # RSS Feed Generation Controls
+                selected_url_rss = gr.Textbox(
+                    label="Select URL for RSS Feed",
+                    placeholder="https://example.com",
+                )
+                rss_button = gr.Button("Generate RSS Feed")
+                rss_output = gr.Textbox(
+                    label="RSS Feed Output", interactive=False, lines=20
+                )
+
+        # Historical Data View
+        with gr.Row():
+            historical_view_url = gr.Textbox(
+                label="Select URL for Historical Data",
+                placeholder="https://example.com",
+            )
+            historical_button = gr.Button("View Historical Data")
+            historical_output = gr.Dataframe(
+                headers=["ID", "URL", "Content Hash", "Change Detected"],
+                label="Historical Data",
+                interactive=False
+            )
+
+        # Connect buttons to their respective functions
+        start_button.click(
+            fn=start_scraping,
+            inputs=[
+                storage_location,
+                urls,
+                scrape_interval,
+                content_type,
+                selector,
+            ],
+            outputs=status_output,
+        )
+
+        stop_button.click(fn=stop_scraping, outputs=status_output)
+
+        csv_button.click(
+            fn=display_csv,
+            inputs=[storage_location, selected_url_csv],
+            outputs=csv_content_output,
+        )
+
+        rss_button.click(
+            fn=generate_rss_feed,
+            inputs=[storage_location, selected_url_rss],
+            outputs=rss_output,
+        )
+
+        historical_button.click(
+            fn=display_historical_data,  # defined later in the module; available by the time create_interface() runs
+            inputs=[storage_location, historical_view_url],
+            outputs=historical_output,
+        )
+
+        # Connect message submission to the chat interface
+        def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
+            if not message_input.strip():
+                return history, "Please enter a message."
+
+            response = respond(
+                message_input,
+                history,
+                system_msg,
+                max_toks,
+                temp,
+                top_p_val,
+            )
+            history.append((message_input, response))
+            return history, response
+
+        message.submit(
+            update_chat,
+            inputs=[
+                message,
+                chat_history,
+                system_message,
+                max_tokens,
+                temperature,
+                top_p,
+            ],
+            outputs=[chat_history, response_box],
+        )
+
+    return demo
+
+# Function to display historical data
+def display_historical_data(storage_location: str, url: str):
+    """
+    Retrieves and displays historical scraping data for a given URL.
+    """
+    try:
+        connection = get_db_connection()
+        if connection:
+            try:
+                cursor = connection.cursor(dictionary=True)
+                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
+                cursor.execute(query, (url,))
+                results = cursor.fetchall()
+
+                if not results:
+                    return pd.DataFrame()
+
+                df = pd.DataFrame(results)
+                cursor.close()
+                connection.close()
+                return df
+            except mysql.connector.Error as err:
+                logging.error(f"Error fetching historical data from database: {err}")
+                # Fallback to CSV
+        else:
+            logging.info("No database connection. Fetching historical data from CSV.")
+
+        # Fallback to CSV
+        hostname = urlparse(url).hostname
+        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
+        if os.path.exists(csv_path):
+            df = pd.read_csv(csv_path)
+            return df
+        else:
+            return pd.DataFrame()
+    except Exception as e:
+        logging.error(f"Error fetching historical data for {url}: {e}")
+        return pd.DataFrame()
+
+def load_model():
+    """
+    Loads the OpenLlama model and tokenizer once and returns the pipeline,
+    also storing it in the module-level openllama_pipeline used by respond().
+    """
+    global openllama_pipeline
+    try:
+        model_name = "openlm-research/open_llama_3b_v2"
+        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
+        model = AutoModelForCausalLM.from_pretrained(model_name)
+
+        max_supported_length = 2048
+
+        openllama_pipeline = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            truncation=True,
+            max_length=max_supported_length,
+            temperature=0.7,
+            top_p=0.95,
+            device=0 if torch.cuda.is_available() else -1,
+        )
+        logging.info("Model loaded successfully.")
+        return openllama_pipeline  # Return the pipeline
+    except Exception as e:
+        logging.error(f"Error loading {model_name} model: {e}")
+        return None
 
 def load_model(model_name: str):
     """

@@ -40,4 +1215,65 @@ def load_model(model_name: str):
         logging.error(f"Error loading {model_name} model: {e}")
         return None
 
-#
+# Automated Testing using unittest
+class TestApp(unittest.TestCase):
+    def test_parse_command_filter(self):
+        command = "Filter apples, oranges in column Description"
+        parsed_command = parse_command(command)
+        self.assertEqual(parsed_command[0], "filter")
+        self.assertListEqual(parsed_command[1]["words"], ["apples", "oranges"])
+        self.assertEqual(parsed_command[1]["column"], "Description")
+
+    def test_parse_command_sort(self):
+        command = "Sort Price ascending"
+        parsed_command = parse_command(command)
+        self.assertEqual(parsed_command[0], "sort")
+        self.assertEqual(parsed_command[1]["column"], "Price")
+        self.assertEqual(parsed_command[1]["order"], "ascending")
+
+    def test_parse_command_export(self):
+        command = "Export to CSV as filtered_data.csv"
+        parsed_command = parse_command(command)
+        self.assertEqual(parsed_command[0], "export")
+        self.assertEqual(parsed_command[1]["filename"], "filtered_data.csv")
+
+    def test_parse_command_log(self):
+        command = "Log action Filtered data for specific fruits"
+        parsed_command = parse_command(command)
+        self.assertEqual(parsed_command[0], "log")
+        self.assertEqual(parsed_command[1]["action"], "Filtered data for specific fruits")
+
+    def test_database_connection(self):
+        connection = get_db_connection()
+        # Connection may be None if not configured; adjust the test accordingly
+        if connection:
+            self.assertTrue(connection.is_connected())
+            connection.close()
+        else:
+            self.assertIsNone(connection)
+
+
def main():
|
1256 |
+
# Initialize and run the application
|
1257 |
+
logging.info("Starting the application...")
|
1258 |
+
model = load_model()
|
1259 |
+
if model:
|
1260 |
+
logging.info("Application started successfully.")
|
1261 |
+
print("Main function executed")
|
1262 |
+
print("Creating interface...")
|
1263 |
+
demo = create_interface()
|
1264 |
+
print("Launching interface...")
|
1265 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
1266 |
+
else:
|
1267 |
+
logging.error("Failed to start the application.")
|
1268 |
+
|
1269 |
+
# Main execution
|
1270 |
+
if __name__ == "__main__":
|
1271 |
+
# Initialize database
|
1272 |
+
initialize_database()
|
1273 |
+
|
1274 |
+
# Create and launch Gradio interface
|
1275 |
+
demo = create_interface()
|
1276 |
+
demo.launch()
|
1277 |
+
|
1278 |
+
# Run automated tests
|
1279 |
+
unittest.main(argv=[''], exit=False)
|
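
Two interactions in this hunk are worth flagging against the "Runtime error" status above. First, the zero-argument load_model() added here is immediately shadowed by the pre-existing load_model(model_name: str) kept as context, so main()'s load_model() call would raise a TypeError if main() were ever invoked. Second, demo.launch() blocks, so the unittest run never starts while the Space is serving. A sketch of one workable ordering (my suggestion, not the commit's):

    if __name__ == "__main__":
        initialize_database()
        unittest.main(argv=[""], exit=False)  # run the quick checks first
        demo = create_interface()
        demo.launch(server_name="0.0.0.0", server_port=7860)
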