# Hugging Face Space (status when this file was captured: Runtime error)
import os | |
import time | |
import hashlib | |
import logging | |
import datetime | |
import csv | |
import threading | |
import re | |
import unittest | |
from urllib.parse import urlparse | |
import pandas as pd | |
from selenium import webdriver | |
from selenium.webdriver.chrome.service import Service | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.common.exceptions import ( | |
TimeoutException, | |
NoSuchElementException, | |
StaleElementReferenceException, | |
) | |
from webdriver_manager.chrome import ChromeDriverManager | |
from transformers import AutoTokenizer, OpenLlamaForCausalLM, pipeline | |
import gradio as gr | |
import xml.etree.ElementTree as ET | |
import torch | |
import mysql.connector | |
from mysql.connector import errorcode, pooling | |
from dotenv import load_dotenv | |
from huggingface_hub import login | |
# Load environment variables from the .env file first, so that a token stored
# there is already visible to os.getenv() below.
load_dotenv()

# Authenticate with the Hugging Face Hub before any model download is attempted.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HUGGINGFACE_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN is not set in the environment variables.")
login(token=HUGGINGFACE_TOKEN)

# Text-generation model used for free-form chat replies.
model_name = "openlm-research/open_llama_3b_v2"  # Or another OpenLlama variant
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
model = OpenLlamaForCausalLM.from_pretrained(model_name)
openllama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
)

# NOTE(review): despite its name this is a BERT tokenizer, not a spaCy pipeline.
nlp = AutoTokenizer.from_pretrained("bert-base-uncased")
# Logging: timestamp, level and message for everything at INFO and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Directory used for CSV output when no other location is supplied.
DEFAULT_FILE_PATH = "scraped_data"

# Free-text description of what this application is for.
PURPOSE = (
    "You go to Culvers sites, you continuously seek changes on them "
    "since your last observation. "
    "Anything new that gets logged and dumped into csv, "
    "stored in your log folder at user/app/scraped_data."
)

# Shared state for task management.
HISTORY = []          # Human-readable log of task events
CURRENT_TASK = None   # Description of the task currently running, if any
STOP_THREADS = False  # Flag to stop scraping threads

# Settings for the MySQL connection pool.
DB_POOL_NAME = "mypool"
DB_POOL_SIZE = 5  # Adjust based on expected load
# Connection settings are read from the environment (DB_HOST, DB_USER,
# DB_PASSWORD, DB_NAME).
dbconfig = {
    key: os.getenv(env_name)
    for key, env_name in (
        ("host", "DB_HOST"),
        ("user", "DB_USER"),
        ("password", "DB_PASSWORD"),
        ("database", "DB_NAME"),
    )
}

# Try to build the pool once at import time; when MySQL reports an error the
# pool is left as None and callers use CSV storage instead.
try:
    connection_pool = pooling.MySQLConnectionPool(
        pool_name=DB_POOL_NAME,
        pool_size=DB_POOL_SIZE,
        pool_reset_session=True,
        **dbconfig
    )
except mysql.connector.Error as err:
    logging.warning(f"Database connection pool creation failed: {err}")
    connection_pool = None  # Will use CSV as fallback
else:
    logging.info("Database connection pool created successfully.")
# Function to get a database connection from the pool
def get_db_connection():
    """
    Retrieves a live connection from the pool.

    Returns:
        A connected pooled connection, or None when the pool is not available,
        when checking a connection out fails, or when the connection that was
        checked out is not connected.
    """
    if not connection_pool:
        return None
    try:
        connection = connection_pool.get_connection()
        if connection.is_connected():
            return connection
        # Hand the dead connection back instead of dropping it; otherwise each
        # failed check permanently removes one slot from the pool.
        connection.close()
    except mysql.connector.Error as err:
        logging.error(f"Error getting connection from pool: {err}")
    return None
# Initialize Database: Create tables and indexes
def initialize_database():
    """
    Initializes the database by creating the tables and indexes the application uses.

    Creates the 'scraped_data' and 'action_logs' tables if they do not exist and
    adds indexes on scraped_data.url and scraped_data.change_detected. When no
    database connection is available the function logs that CSV storage is
    used and returns without doing anything. MySQL errors are logged, not raised.
    """
    connection = get_db_connection()
    if connection is None:
        logging.info("Database initialization skipped. Using CSV storage.")
        return

    create_scraped_data_table = """
    CREATE TABLE IF NOT EXISTS scraped_data (
        id INT AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(255) NOT NULL,
        content_hash VARCHAR(64) NOT NULL,
        change_detected DATETIME NOT NULL
    )
    """
    create_action_logs_table = """
    CREATE TABLE IF NOT EXISTS action_logs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        action VARCHAR(255) NOT NULL,
        timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
    )
    """
    # MySQL has no "CREATE INDEX IF NOT EXISTS"; issue plain CREATE INDEX and
    # treat the "duplicate key name" error as "index already present".
    index_statements = (
        "CREATE INDEX idx_url ON scraped_data(url)",
        "CREATE INDEX idx_change_detected ON scraped_data(change_detected)",
    )

    cursor = None
    try:
        cursor = connection.cursor()

        # Create table for scraped data
        cursor.execute(create_scraped_data_table)
        logging.info("Table 'scraped_data' is ready.")

        # Create indexes for performance
        for statement in index_statements:
            try:
                cursor.execute(statement)
            except mysql.connector.Error as err:
                if err.errno != errorcode.ER_DUP_KEYNAME:
                    raise
        logging.info("Indexes on 'url' and 'change_detected' columns created.")

        # Create table for action logs
        cursor.execute(create_action_logs_table)
        logging.info("Table 'action_logs' is ready.")
    except mysql.connector.Error as err:
        logging.error(f"Error initializing database: {err}")
    finally:
        if cursor is not None:
            cursor.close()
        connection.close()
        logging.info("Database initialization complete.")
# Function to create WebDriver
def create_driver(options: Options) -> webdriver.Chrome:
    """
    Builds a Selenium Chrome WebDriver using a driver binary obtained through
    ChromeDriverManager.

    Returns the driver on success. Any exception raised during setup is logged
    and None is returned instead.
    """
    try:
        chrome_service = Service(ChromeDriverManager().install())
        browser = webdriver.Chrome(service=chrome_service, options=options)
        logging.info("ChromeDriver initialized successfully.")
    except Exception as exception:
        logging.error(f"Error initializing ChromeDriver: {exception}")
        return None
    return browser
# Function to log changes to CSV
def log_to_csv(storage_location: str, url: str, content_hash: str, change_detected: str):
    """
    Appends one change record to '<hostname>_changes.csv' inside storage_location.

    The directory is created when missing and a header row is written when the
    file does not exist yet. change_detected is split on whitespace into the
    'date' and 'time' columns. Any exception is logged rather than raised.
    """
    columns = ["date", "time", "url", "content_hash", "change"]
    try:
        os.makedirs(storage_location, exist_ok=True)
        target_name = f"{urlparse(url).hostname}_changes.csv"
        target_path = os.path.join(storage_location, target_name)
        needs_header = not os.path.isfile(target_path)
        with open(target_path, "a", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            if needs_header:
                writer.writeheader()
            stamp_parts = change_detected.split()
            record = {
                "date": stamp_parts[0],
                "time": stamp_parts[1],
                "url": url,
                "content_hash": content_hash,
                "change": "Content changed",
            }
            writer.writerow(record)
        logging.info(f"Change detected at {url} on {change_detected} and logged to CSV.")
    except Exception as e:
        logging.error(f"Error logging data to CSV: {e}")
# Function to get initial observation
def get_initial_observation(
    driver: webdriver.Chrome, url: str, content_type: str, selector: str = None
) -> str:
    """
    Loads the URL and returns the MD5 hex digest of the observed content.

    For content_type "media" the content is the list of 'src' attributes of
    the elements matching selector (or of all <img> tags when no selector is
    given); for every other content_type it is the full page source.
    Returns None when anything goes wrong while accessing the page.
    """
    try:
        driver.get(url)
        body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
        WebDriverWait(driver, 10).until(body_present)
        time.sleep(2)  # Additional wait for dynamic content

        if content_type != "media":
            # "text" and any unrecognised content type hash the whole page.
            snapshot = driver.page_source
        elif not selector:
            images = driver.find_elements(By.TAG_NAME, "img")
            snapshot = [image.get_attribute("src") for image in images]
        else:
            try:
                matched = WebDriverWait(driver, 5).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
                )
                snapshot = [node.get_attribute("src") for node in matched]
            except TimeoutException:
                logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
                snapshot = []

        digest = hashlib.md5(str(snapshot).encode("utf-8")).hexdigest()
        logging.info(f"Initial hash for {url}: {digest}")
        return digest
    except Exception as exception:
        logging.error(f"Error accessing {url}: {exception}")
        return None
# Function to monitor URLs for changes
def monitor_urls(
    storage_location: str,
    urls: list,
    scrape_interval: int,
    content_type: str,
    selector: str = None,
    progress: gr.Progress = None
):
    """
    Monitors the specified URLs for changes and logs any detected change to the
    database, or to CSV when the database is unavailable or the insert fails.

    Args:
        storage_location: Directory used for the CSV fallback.
        urls: URLs to poll.
        scrape_interval: Minutes to wait between polling rounds.
        content_type: "media" hashes element 'src' attributes; anything else
            hashes the full page source.
        selector: Optional CSS selector for media elements.
        progress: Optional progress callback, invoked once per URL visit.

    The loop runs until the module-level STOP_THREADS flag becomes true. The
    flag is checked between URLs and once per second while waiting for the
    next round, so a stop request takes effect promptly.
    """
    global HISTORY, STOP_THREADS
    # NOTE: hashes start empty, so the first round records every URL as changed.
    previous_hashes = {url: "" for url in urls}
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = create_driver(options)
    if driver is None:
        logging.error("WebDriver could not be initialized. Exiting monitor.")
        return
    try:
        while not STOP_THREADS:
            for url in urls:
                if STOP_THREADS:
                    break
                try:
                    driver.get(url)
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                    time.sleep(2)  # Additional wait for dynamic content
                    if content_type == "text":
                        current_content = driver.page_source
                    elif content_type == "media":
                        if selector:
                            try:
                                elements = WebDriverWait(driver, 5).until(
                                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
                                )
                                current_content = [element.get_attribute("src") for element in elements]
                            except TimeoutException:
                                logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
                                current_content = []
                        else:
                            elements = driver.find_elements(By.TAG_NAME, "img")
                            current_content = [element.get_attribute("src") for element in elements]
                    else:
                        current_content = driver.page_source
                    current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
                    if current_hash != previous_hashes[url]:
                        previous_hashes[url] = current_hash
                        date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        HISTORY.append(f"Change detected at {url} on {date_time_str}")
                        # Attempt to log to database
                        connection = get_db_connection()
                        if connection:
                            # Bound before the try so the finally clause never
                            # touches an unbound or stale cursor.
                            cursor = None
                            try:
                                cursor = connection.cursor()
                                insert_query = """
                                INSERT INTO scraped_data (url, content_hash, change_detected)
                                VALUES (%s, %s, %s)
                                """
                                cursor.execute(insert_query, (url, current_hash, date_time_str))
                                connection.commit()
                                logging.info(f"Change detected at {url} on {date_time_str} and logged to database.")
                            except mysql.connector.Error as err:
                                logging.error(f"Error inserting data into database: {err}")
                                # Fallback to CSV
                                log_to_csv(storage_location, url, current_hash, date_time_str)
                            finally:
                                if cursor is not None:
                                    cursor.close()
                                connection.close()
                        else:
                            # Fallback to CSV
                            log_to_csv(storage_location, url, current_hash, date_time_str)
                    # Update progress
                    if progress:
                        progress(1)
                except Exception as e:
                    # Selenium's element/timeout errors are Exception subclasses,
                    # so one handler covers them all.
                    logging.error(f"Error accessing {url}: {e}")
                    if progress:
                        progress(1)
            # Wait for the next scrape interval in short slices so that a stop
            # request is honoured without waiting out the whole interval.
            deadline = time.monotonic() + scrape_interval * 60
            while not STOP_THREADS:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                time.sleep(min(1.0, remaining))
    finally:
        driver.quit()
        logging.info("ChromeDriver session ended.")
# Function to start scraping
def start_scraping(
    storage_location: str,
    urls: str,
    scrape_interval: int,
    content_type: str,
    selector: str = None,
    progress: gr.Progress = None
) -> str:
    """
    Starts the scraping process in background threads with progress indication.

    Args:
        storage_location: Directory used for the CSV fallback.
        urls: Comma-separated list of URLs to monitor.
        scrape_interval: Minutes between monitoring rounds.
        content_type: Passed through to the observation/monitoring functions.
        selector: Optional CSS selector for media elements.
        progress: Optional progress callback handed to the monitor thread.

    Returns:
        A status message. When no usable URL is supplied nothing is started
        and the message says so.

    Two daemon threads are started: one records an initial observation for each
    URL, the other runs monitor_urls.
    """
    global CURRENT_TASK, HISTORY, STOP_THREADS
    url_list = [url.strip() for url in (urls or "").split(",") if url.strip()]
    if not url_list:
        # Without this guard the monitor thread would loop forever over nothing.
        logging.warning("Scraping not started: no valid URLs provided.")
        return "No valid URLs provided."
    STOP_THREADS = False  # Reset the flag if previously stopped
    CURRENT_TASK = f"Monitoring URLs: {', '.join(url_list)}"
    HISTORY.append(f"Task started: {CURRENT_TASK}")
    logging.info(f"Task started: {CURRENT_TASK}")
    # Initialize database tables
    initialize_database()

    # Log initial observations
    def log_initial_observations():
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = create_driver(options)
        if driver is None:
            return
        try:
            for url in url_list:
                if STOP_THREADS:
                    break
                try:
                    initial_hash = get_initial_observation(driver, url, content_type, selector)
                    if initial_hash:
                        date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        HISTORY.append(f"Initial observation at {url}: {initial_hash}")
                        # Attempt to log to database
                        connection = get_db_connection()
                        if connection:
                            # Bound before the try so the finally clause never
                            # touches an unbound or stale cursor.
                            cursor = None
                            try:
                                cursor = connection.cursor()
                                insert_query = """
                                INSERT INTO scraped_data (url, content_hash, change_detected)
                                VALUES (%s, %s, %s)
                                """
                                cursor.execute(insert_query, (url, initial_hash, date_time_str))
                                connection.commit()
                                logging.info(f"Initial observation logged for {url} in database.")
                            except mysql.connector.Error as err:
                                logging.error(f"Error inserting initial observation into database: {err}")
                                # Fallback to CSV
                                log_to_csv(storage_location, url, initial_hash, date_time_str)
                            finally:
                                if cursor is not None:
                                    cursor.close()
                                connection.close()
                        else:
                            # Fallback to CSV
                            log_to_csv(storage_location, url, initial_hash, date_time_str)
                except Exception as e:
                    HISTORY.append(f"Error accessing {url}: {e}")
                    logging.error(f"Error accessing {url}: {e}")
        finally:
            # Always release the browser, whatever happened above.
            driver.quit()

    # Start logging initial observations
    initial_thread = threading.Thread(target=log_initial_observations, daemon=True)
    initial_thread.start()
    # Start the monitoring thread with progress
    monitor_thread = threading.Thread(
        target=monitor_urls,
        args=(storage_location, url_list, scrape_interval, content_type, selector, progress),
        daemon=True,
    )
    monitor_thread.start()
    logging.info("Started scraping thread.")
    return f"Started scraping {', '.join(url_list)} every {scrape_interval} minutes."
# Function to stop scraping
def stop_scraping() -> str:
    """
    Signals the scraping threads to stop by raising the STOP_THREADS flag,
    records the event in HISTORY and returns a confirmation message.
    """
    global STOP_THREADS
    confirmation = "Scraping has been stopped."
    STOP_THREADS = True
    HISTORY.append("Scraping stopped by user.")
    logging.info("Scraping stop signal sent.")
    return confirmation
# Function to display CSV content from MySQL or CSV
def display_csv(storage_location: str, url: str) -> str:
    """
    Fetches the scraped data for a URL and returns it rendered as a text table.

    The MySQL database is queried first (newest change first). When no
    connection is available, or the query raises a MySQL error, the data is
    read from '<hostname>_changes.csv' in storage_location instead.

    Returns:
        The table as a string, a "no data" message, or an error message when
        an unexpected exception occurs.
    """
    try:
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor(dictionary=True)
                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
                cursor.execute(query, (url,))
                results = cursor.fetchall()
                if not results:
                    return "No data available for the selected URL."
                return pd.DataFrame(results).to_string(index=False)
            except mysql.connector.Error as err:
                logging.error(f"Error fetching data from database: {err}")
                # Fallback to CSV
            finally:
                # Runs on every exit path so the connection always goes back
                # to the pool, including the early "no data" return.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Fetching data from CSV.")
        # Fallback to CSV
        hostname = urlparse(url).hostname
        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            return df.to_string(index=False)
        return "No data available."
    except Exception as e:
        logging.error(f"Error fetching data for {url}: {e}")
        return f"Error fetching data for {url}: {e}"
def _build_rss_feed(url: str, entries) -> str:
    """
    Serialises change entries into an RSS 2.0 document and returns it as text.

    entries is an iterable of (item_url, description, timestamp) tuples where
    timestamp is a "%Y-%m-%d %H:%M:%S" string.
    """
    rss = ET.Element("rss", version="2.0")
    channel = ET.SubElement(rss, "channel")
    ET.SubElement(channel, "title").text = f"RSS Feed for {urlparse(url).hostname}"
    ET.SubElement(channel, "link").text = url
    ET.SubElement(channel, "description").text = "Recent changes detected on the website."
    for item_url, summary, stamp in entries:
        item = ET.SubElement(channel, "item")
        ET.SubElement(item, "title").text = f"Change detected at {item_url}"
        ET.SubElement(item, "link").text = item_url
        ET.SubElement(item, "description").text = summary
        # NOTE(review): timestamps are naive local times labelled as +0000.
        ET.SubElement(item, "pubDate").text = datetime.datetime.strptime(
            stamp, "%Y-%m-%d %H:%M:%S"
        ).strftime("%a, %d %b %Y %H:%M:%S +0000")
    return ET.tostring(rss, encoding="utf-8", method="xml").decode("utf-8")


# Function to generate RSS feed from MySQL or CSV data
def generate_rss_feed(storage_location: str, url: str) -> str:
    """
    Generates an RSS feed of the ten most recent changes recorded for a URL.

    The MySQL database is queried first. When no connection is available, or
    the query raises a MySQL error, the last ten rows of
    '<hostname>_changes.csv' in storage_location are used instead. Items are
    listed newest first in both cases.

    Returns:
        The RSS XML as a string, a "no data"/"no changes" message, or an error
        message when an unexpected exception occurs.
    """
    try:
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor(dictionary=True)
                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC LIMIT 10"
                cursor.execute(query, (url,))
                results = cursor.fetchall()
                if not results:
                    return "No changes detected to include in RSS feed."
                entries = [
                    (
                        row["url"],
                        f"Content changed on {row['change_detected']}",
                        str(row["change_detected"]),
                    )
                    for row in results
                ]
                return _build_rss_feed(url, entries)
            except mysql.connector.Error as err:
                logging.error(f"Error fetching data from database: {err}")
                # Fallback to CSV
            finally:
                # Runs on every exit path so the connection always goes back
                # to the pool, including the early "no changes" return.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Generating RSS feed from CSV.")
        # Fallback to CSV
        hostname = urlparse(url).hostname
        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
        if not os.path.exists(csv_path):
            return "No data available."
        df = pd.read_csv(csv_path).tail(10)
        if df.empty:
            return "No changes detected to include in RSS feed."
        # Rows are appended chronologically; reverse them so the feed is
        # newest-first like the database branch.
        entries = [
            (
                row["url"],
                f"Content changed on {row['date']} at {row['time']}",
                f"{row['date']} {row['time']}",
            )
            for _, row in df.iloc[::-1].iterrows()
        ]
        return _build_rss_feed(url, entries)
    except Exception as e:
        logging.error(f"Error generating RSS feed for {url}: {e}")
        return f"Error generating RSS feed for {url}: {e}"
# Function to parse user commands with regular expressions
def parse_command(message: str) -> tuple:
    """
    Parses the user message to identify whether it contains a command.

    Recognised commands (case-insensitive):
        "Filter apples, oranges in column Description"
        "Sort Price ascending"
        "Export to CSV as filtered_data.csv"
        "Log action Filtered data for specific fruits"

    Returns:
        (command, params) where command is "filter", "sort", "export" or "log"
        and params is the matching parameter dict, or (None, {}) when the
        message contains no command.

    Every pattern is tried and the match that starts earliest in the message
    wins, so a word such as "Filtered" inside a "log action" message no longer
    prevents the log command from being recognised.
    """
    patterns = (
        ("filter", r"filter\s+([\w\s,]+)\s+in\s+column\s+(\w+)"),
        ("sort", r"sort\s+(\w+)\s+(ascending|descending)"),
        ("export", r"export\s+to\s+csv\s+as\s+([\w\-]+\.csv)"),
        ("log", r"log\s+action\s+(.+)"),
    )

    command = None
    found = None
    for name, pattern in patterns:
        match = re.search(pattern, message, re.IGNORECASE)
        if match and (found is None or match.start() < found.start()):
            command, found = name, match

    if found is None:
        return None, {}

    if command == "filter":
        # Drop empty fragments (e.g. from a trailing comma); an empty word
        # would otherwise match every row when used as a search pattern.
        words = [word.strip() for word in found.group(1).split(",") if word.strip()]
        if not words:
            return None, {}
        return command, {"words": words, "column": found.group(2)}
    if command == "sort":
        return command, {"column": found.group(1), "order": found.group(2)}
    if command == "export":
        return command, {"filename": found.group(1)}
    return command, {"action": found.group(1)}
# Function to execute parsed commands
def execute_command(command: str, params: dict) -> str:
    """
    Runs the data function that belongs to a parsed command.

    "filter", "sort", "export" and "log" are forwarded to filter_data,
    sort_data, export_csv and log_action with the values taken from params;
    any other command yields "Unknown command.".
    """
    # The lambdas defer both the parameter lookup and the call until a
    # handler has actually been selected.
    handlers = {
        "filter": lambda p: filter_data(p["column"], p["words"]),
        "sort": lambda p: sort_data(p["column"], p["order"]),
        "export": lambda p: export_csv(p["filename"]),
        "log": lambda p: log_action(p["action"]),
    }
    handler = handlers.get(command)
    if handler is None:
        return "Unknown command."
    return handler(params)
# Data Manipulation Functions
def filter_data(column: str, words: list) -> str:
    """
    Filters the scraped data to rows whose `column` contains any of `words`
    (case-insensitive, literal match) and saves the result to a new CSV file.

    Data comes from the 'scraped_data' table when a database connection is
    available; otherwise (or when the query raises a MySQL error) from the
    most recently modified tracked CSV in DEFAULT_FILE_PATH.

    Returns:
        A message naming the file written, or explaining why nothing was
        written. Unexpected exceptions are logged and reported in the message.
    """
    try:
        storage_location = DEFAULT_FILE_PATH
        # Escape each word so characters like "(" or "." are matched literally
        # instead of being interpreted as regular-expression syntax.
        pattern = "|".join(re.escape(str(word)) for word in words)
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor(dictionary=True)
                # Fetch all data
                cursor.execute("SELECT * FROM scraped_data")
                results = cursor.fetchall()
                if not results:
                    return "No data available to filter."
                df = pd.DataFrame(results)
                if column not in df.columns:
                    return f"Column '{column}' does not exist in the data."
                filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
                if filtered_df.empty:
                    return f"No records found with words {words} in column '{column}'."
                # Save the filtered data to a new CSV
                os.makedirs(storage_location, exist_ok=True)
                timestamp = int(time.time())
                filtered_csv = os.path.join(storage_location, f"filtered_data_{timestamp}.csv")
                filtered_df.to_csv(filtered_csv, index=False)
                logging.info(f"Data filtered on column '{column}' for words {words}.")
                return f"Data filtered and saved to {filtered_csv}."
            except mysql.connector.Error as err:
                logging.error(f"Error fetching data from database: {err}")
                # Fallback to CSV
            finally:
                # Return the connection to the pool on every exit path.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Filtering data from CSV.")
        # Fallback to CSV
        tracked_suffixes = ("_changes.csv", "_filtered.csv", "_sorted_asc.csv", "_sorted_desc.csv")
        csv_files = [f for f in os.listdir(storage_location) if f.endswith(tracked_suffixes)]
        if not csv_files:
            return "No CSV files found to filter."
        # Assume the latest CSV is the target
        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
        df = pd.read_csv(latest_csv)
        if column not in df.columns:
            return f"Column '{column}' does not exist in the data."
        filtered_df = df[df[column].astype(str).str.contains(pattern, case=False, na=False)]
        if filtered_df.empty:
            return f"No records found with words {words} in column '{column}'."
        # Save the filtered data to a new CSV
        timestamp = int(time.time())
        filtered_csv = latest_csv.replace(".csv", f"_filtered_{timestamp}.csv")
        filtered_df.to_csv(filtered_csv, index=False)
        logging.info(f"Data filtered on column '{column}' for words {words}.")
        return f"Data filtered and saved to {filtered_csv}."
    except Exception as e:
        logging.error(f"Error filtering data: {e}")
        return f"Error filtering data: {e}"
def sort_data(column: str, order: str) -> str:
    """
    Sorts the scraped data on `column` and saves the result to a new CSV file.

    `order` of "ascending" (case-insensitive) sorts ascending; any other value
    sorts descending. Data comes from the 'scraped_data' table when a database
    connection is available; otherwise (or when the query raises a MySQL
    error) from the most recently modified tracked CSV in DEFAULT_FILE_PATH.

    Returns:
        A message naming the file written, or explaining why nothing was
        written. Unexpected exceptions are logged and reported in the message.
    """
    try:
        storage_location = DEFAULT_FILE_PATH
        ascending = order.lower() == "ascending"
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor(dictionary=True)
                # Fetch all data
                cursor.execute("SELECT * FROM scraped_data")
                results = cursor.fetchall()
                if not results:
                    return "No data available to sort."
                df = pd.DataFrame(results)
                if column not in df.columns:
                    return f"Column '{column}' does not exist in the data."
                sorted_df = df.sort_values(by=column, ascending=ascending)
                # Save the sorted data to a new CSV
                os.makedirs(storage_location, exist_ok=True)
                timestamp = int(time.time())
                sorted_csv = os.path.join(storage_location, f"sorted_data_{column}_{order.lower()}_{timestamp}.csv")
                sorted_df.to_csv(sorted_csv, index=False)
                logging.info(f"Data sorted on column '{column}' in {order} order.")
                return f"Data sorted and saved to {sorted_csv}."
            except mysql.connector.Error as err:
                logging.error(f"Error fetching data from database: {err}")
                # Fallback to CSV
            finally:
                # Return the connection to the pool on every exit path.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Sorting data from CSV.")
        # Fallback to CSV
        tracked_suffixes = ("_changes.csv", "_filtered.csv", "_sorted_asc.csv", "_sorted_desc.csv")
        csv_files = [f for f in os.listdir(storage_location) if f.endswith(tracked_suffixes)]
        if not csv_files:
            return "No CSV files found to sort."
        # Assume the latest CSV is the target
        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
        df = pd.read_csv(latest_csv)
        if column not in df.columns:
            return f"Column '{column}' does not exist in the data."
        sorted_df = df.sort_values(by=column, ascending=ascending)
        # Save the sorted data to a new CSV
        timestamp = int(time.time())
        sorted_csv = latest_csv.replace(".csv", f"_sorted_{order.lower()}_{timestamp}.csv")
        sorted_df.to_csv(sorted_csv, index=False)
        logging.info(f"Data sorted on column '{column}' in {order} order.")
        return f"Data sorted and saved to {sorted_csv}."
    except Exception as e:
        logging.error(f"Error sorting data: {e}")
        return f"Error sorting data: {e}"
def export_csv(filename: str) -> str:
    """
    Exports the scraped data to a CSV file named `filename` inside
    DEFAULT_FILE_PATH.

    Only the final path component of `filename` is used, so a name supplied
    through the chat cannot place the file outside the storage directory.
    Data comes from the 'scraped_data' table when a database connection is
    available; otherwise (or when the query raises a MySQL error) from the
    most recently modified tracked CSV in DEFAULT_FILE_PATH.

    Returns:
        A message naming the file written, or explaining why nothing was
        written. Unexpected exceptions are logged and reported in the message.
    """
    try:
        storage_location = DEFAULT_FILE_PATH
        # Strip any directory part ("../x.csv" -> "x.csv").
        export_path = os.path.join(storage_location, os.path.basename(filename))
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor(dictionary=True)
                # Fetch all data
                cursor.execute("SELECT * FROM scraped_data")
                results = cursor.fetchall()
                if not results:
                    return "No data available to export."
                df = pd.DataFrame(results)
                os.makedirs(storage_location, exist_ok=True)
                df.to_csv(export_path, index=False)
                logging.info(f"Data exported to {export_path}.")
                return f"Data exported to {export_path}."
            except mysql.connector.Error as err:
                logging.error(f"Error exporting data from database: {err}")
                # Fallback to CSV
            finally:
                # Return the connection to the pool on every exit path.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Exporting data from CSV.")
        # Fallback to CSV
        tracked_suffixes = ("_changes.csv", "_filtered.csv", "_sorted_asc.csv", "_sorted_desc.csv")
        csv_files = [f for f in os.listdir(storage_location) if f.endswith(tracked_suffixes)]
        if not csv_files:
            return "No CSV files found to export."
        # Assume the latest CSV is the target
        latest_csv = max([os.path.join(storage_location, f) for f in csv_files], key=os.path.getmtime)
        df = pd.read_csv(latest_csv)
        df.to_csv(export_path, index=False)
        logging.info(f"Data exported to {export_path}.")
        return f"Data exported to {export_path}."
    except Exception as e:
        logging.error(f"Error exporting CSV: {e}")
        return f"Error exporting CSV: {e}"
def log_action(action: str) -> str:
    """
    Logs a custom action message to the MySQL database or, failing that, to CSV.

    The message is inserted into the 'action_logs' table when a database
    connection is available. When there is no connection, or the insert raises
    a MySQL error, a timestamped row is appended to 'action_logs.csv' in
    DEFAULT_FILE_PATH instead.

    Returns:
        "Action logged: <action>" on success, otherwise an error message.
    """
    try:
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor()
                insert_query = """
                INSERT INTO action_logs (action)
                VALUES (%s)
                """
                cursor.execute(insert_query, (action,))
                connection.commit()
                logging.info(f"Action logged in database: {action}")
                return f"Action logged: {action}"
            except mysql.connector.Error as err:
                logging.error(f"Error logging action to database: {err}")
                # Fallback to CSV
            finally:
                # Return the connection to the pool on success and on error.
                if cursor is not None:
                    cursor.close()
                connection.close()
        else:
            logging.info("No database connection. Logging action to CSV.")
        # Fallback to CSV
        storage_location = DEFAULT_FILE_PATH
        try:
            os.makedirs(storage_location, exist_ok=True)
            csv_file_path = os.path.join(storage_location, "action_logs.csv")
            file_exists = os.path.isfile(csv_file_path)
            with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
                fieldnames = ["timestamp", "action"]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                if not file_exists:
                    writer.writeheader()
                writer.writerow(
                    {
                        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "action": action,
                    }
                )
            logging.info(f"Action logged to CSV: {action}")
            return f"Action logged: {action}"
        except Exception as e:
            logging.error(f"Error logging action to CSV: {e}")
            return f"Error logging action: {e}"
    except Exception as e:
        logging.error(f"Error logging action: {e}")
        return f"Error logging action: {e}"
# Function to get the latest CSV file based on modification time
def get_latest_csv() -> str:
    """
    Returns the path of the most recently modified '.csv' file inside
    DEFAULT_FILE_PATH, or None when the directory holds no CSV file or an
    error occurs while looking (the error is logged).
    """
    try:
        folder = DEFAULT_FILE_PATH
        candidates = [
            os.path.join(folder, entry)
            for entry in os.listdir(folder)
            if entry.endswith(".csv")
        ]
        if candidates:
            return max(candidates, key=os.path.getmtime)
        return None
    except Exception as e:
        logging.error(f"Error retrieving latest CSV: {e}")
        return None
def respond(
    message: str,
    history: list,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """
    Produces the chat reply for a user message.

    When the message contains a recognised command, the command is executed
    and its result returned. Otherwise a reply is generated with the OpenLlama
    text-generation pipeline.

    Args:
        message: The user's message.
        history: Previous conversation, embedded verbatim in the prompt.
        system_message: System instruction placed at the top of the prompt.
        max_tokens: Maximum number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The reply text, or "Error generating response." when anything fails.
    """
    try:
        # Check if the message contains a command
        command, params = parse_command(message)
        if command:
            # Execute the corresponding function
            return execute_command(command, params)

        # Generate a regular response using OpenLlama
        prompt = (
            f"System: {system_message}\n"
            f"History: {history}\n"
            f"User: {message}\n"
            f"Assistant:"
        )
        generated = openllama_pipeline(
            prompt,
            # Budget applies to the reply only; max_length would also count
            # the prompt tokens against it.
            max_new_tokens=max_tokens,
            # temperature/top_p only take effect when sampling is enabled.
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            # Return just the continuation, so text inside the prompt can
            # never be mistaken for the reply.
            return_full_text=False,
        )[0]["generated_text"]
        # Keep the assistant's turn only; drop any follow-up turn the model
        # may have invented.
        return generated.split("User:")[0].strip()
    except Exception as e:
        logging.error(f"Error generating response: {e}")
        return "Error generating response."
# Define the Gradio interface | |
def create_interface() -> gr.Blocks:
    """
    Defines and returns the Gradio interface for the application.

    The layout contains scraping controls, a chat panel, CSV and RSS
    viewers and a historical-data table. Each button is wired to the
    corresponding module-level handler, and submitting the message box
    routes the text through ``respond``.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` application.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# All-in-One Scraper, Database, and RSS Feeder")
        with gr.Row():
            with gr.Column():
                # Scraping Controls
                storage_location = gr.Textbox(
                    value=DEFAULT_FILE_PATH, label="Storage Location"
                )
                urls = gr.Textbox(
                    label="URLs (comma separated)",
                    placeholder="https://example.com, https://anotherexample.com",
                )
                scrape_interval = gr.Slider(
                    minimum=1,
                    maximum=60,
                    value=5,
                    step=1,
                    label="Scrape Interval (minutes)",
                )
                content_type = gr.Radio(
                    choices=["text", "media", "both"],
                    value="text",
                    label="Content Type",
                )
                selector = gr.Textbox(
                    label="CSS Selector for Media (Optional)",
                    placeholder="e.g., img.main-image",
                )
                start_button = gr.Button("Start Scraping")
                stop_button = gr.Button("Stop Scraping")
                status_output = gr.Textbox(
                    label="Status Output", interactive=False, lines=2
                )
            with gr.Column():
                # Chat Interface
                chat_history = gr.Chatbot(label="Chat History")
                with gr.Row():
                    message = gr.Textbox(label="Message", placeholder="Type your message here...")
                    system_message = gr.Textbox(
                        value="You are a helpful assistant.", label="System message"
                    )
                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=2048,
                        value=512,
                        step=1,
                        label="Max new tokens",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="Top-p (nucleus sampling)",
                    )
                response_box = gr.Textbox(label="Response", interactive=False, lines=2)
        with gr.Row():
            with gr.Column():
                # CSV Display Controls
                selected_url_csv = gr.Textbox(
                    label="Select URL for CSV Content",
                    placeholder="https://example.com",
                )
                csv_button = gr.Button("Display CSV Content")
                csv_content_output = gr.Textbox(
                    label="CSV Content Output", interactive=False, lines=10
                )
            with gr.Column():
                # RSS Feed Generation Controls
                selected_url_rss = gr.Textbox(
                    label="Select URL for RSS Feed",
                    placeholder="https://example.com",
                )
                rss_button = gr.Button("Generate RSS Feed")
                rss_output = gr.Textbox(
                    label="RSS Feed Output", interactive=False, lines=20
                )
        # Historical Data View
        with gr.Row():
            historical_view_url = gr.Textbox(
                label="Select URL for Historical Data",
                placeholder="https://example.com",
            )
            historical_button = gr.Button("View Historical Data")
            historical_output = gr.Dataframe(
                headers=["ID", "URL", "Content Hash", "Change Detected"],
                label="Historical Data",
                interactive=False,
            )
        # Connect buttons to their respective functions
        start_button.click(
            fn=start_scraping,
            inputs=[
                storage_location,
                urls,
                scrape_interval,
                content_type,
                selector,
            ],
            outputs=status_output,
        )
        stop_button.click(fn=stop_scraping, outputs=status_output)
        csv_button.click(
            fn=display_csv,
            inputs=[storage_location, selected_url_csv],
            outputs=csv_content_output,
        )
        rss_button.click(
            fn=generate_rss_feed,
            inputs=[storage_location, selected_url_rss],
            outputs=rss_output,
        )
        historical_button.click(
            fn=display_historical_data,
            inputs=[storage_location, historical_view_url],
            outputs=historical_output,
        )

        # Connect message submission to the chat interface
        def update_chat(message_input, history, system_msg, max_toks, temp, top_p_val):
            """Run one chat turn and return (updated history, latest reply)."""
            # Defensive: treat a missing history or message as empty so the
            # handler never fails on None (presumably possible before the
            # first turn, depending on the Gradio version).
            history = history or []
            if not message_input or not message_input.strip():
                return history, "Please enter a message."
            response = respond(
                message_input,
                history,
                system_msg,
                max_toks,
                temp,
                top_p_val,
            )
            # Build a new list instead of mutating the component's value.
            return history + [(message_input, response)], response

        message.submit(
            update_chat,
            inputs=[
                message,
                chat_history,
                system_message,
                max_tokens,
                temperature,
                top_p,
            ],
            outputs=[chat_history, response_box],
        )
    return demo
# Function to display historical data | |
def display_historical_data(storage_location: str, url: str):
    """
    Retrieves and displays historical scraping data for a given URL.

    The ``scraped_data`` table is queried first. When no database
    connection is available, or the query raises a MySQL error, the
    function falls back to ``<hostname>_changes.csv`` inside
    ``storage_location``.

    Args:
        storage_location: Directory that holds the per-host CSV files.
        url: URL whose history is requested.

    Returns:
        A ``pandas.DataFrame`` with the matching rows; empty when nothing is
        found or when an unexpected error occurs (the error is logged).
    """
    try:
        connection = get_db_connection()
        if connection:
            cursor = None
            try:
                cursor = connection.cursor(dictionary=True)
                query = "SELECT * FROM scraped_data WHERE url = %s ORDER BY change_detected DESC"
                cursor.execute(query, (url,))
                results = cursor.fetchall()
                if not results:
                    return pd.DataFrame()
                return pd.DataFrame(results)
            except mysql.connector.Error as err:
                logging.error(f"Error fetching historical data from database: {err}")
                # Fall through to the CSV fallback below
            finally:
                # Release the cursor and connection on every path, including
                # the empty-result return and query failures.
                for resource in (cursor, connection):
                    if resource is None:
                        continue
                    try:
                        resource.close()
                    except mysql.connector.Error as close_err:
                        logging.warning(f"Error closing database resource: {close_err}")
        else:
            logging.info("No database connection. Fetching historical data from CSV.")

        # Fallback to CSV
        hostname = urlparse(url).hostname
        csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
        if os.path.exists(csv_path):
            return pd.read_csv(csv_path)
        return pd.DataFrame()
    except Exception as e:
        logging.error(f"Error fetching historical data for {url}: {e}")
        return pd.DataFrame()
# Function to load the "google/flan-t5-xl" model | |
def load_model():
    """
    Loads the FlanT5XL model and tokenizer once and returns the pipeline.

    Returns:
        A ``text2text-generation`` pipeline for ``google/flan-t5-xl``
        (on GPU 0 when CUDA is available, otherwise on CPU), or ``None``
        if loading fails; the failure is logged.
    """
    model_name = "google/flan-t5-xl"
    try:
        # AutoModelForSeq2SeqLM is not among the names imported from
        # transformers at the top of the file, so import it here.
        from transformers import AutoModelForSeq2SeqLM

        # Load tokenizer with warning suppression
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, clean_up_tokenization_spaces=True
        )
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        pipe = pipeline(
            # FLAN-T5 is an encoder-decoder model; it must be served through
            # the text2text task rather than causal "text-generation".
            "text2text-generation",
            model=model,
            tokenizer=tokenizer,
            truncation=True,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            device=0 if torch.cuda.is_available() else -1,
        )
        logging.info("Model loaded successfully.")
        return pipe
    except Exception as e:
        logging.error(f"Error loading google/flan-t5-xl model: {e}")
        return None
# Load the model once at the start (runs at import time).
# load_model() returns None on failure, so chat_pipeline may be None.
chat_pipeline = load_model()
# Automated Testing using unittest | |
class TestApp(unittest.TestCase):
    """Checks for the chat command parser and the database connection helper."""

    def test_parse_command_filter(self):
        """A filter command yields the word list and the target column."""
        name, params = parse_command("Filter apples, oranges in column Description")
        self.assertEqual(name, "filter")
        self.assertListEqual(params["words"], ["apples", "oranges"])
        self.assertEqual(params["column"], "Description")

    def test_parse_command_sort(self):
        """A sort command yields the column and the sort order."""
        name, params = parse_command("Sort Price ascending")
        self.assertEqual(name, "sort")
        self.assertEqual(params["column"], "Price")
        self.assertEqual(params["order"], "ascending")

    def test_parse_command_export(self):
        """An export command yields the destination filename."""
        name, params = parse_command("Export to CSV as filtered_data.csv")
        self.assertEqual(name, "export")
        self.assertEqual(params["filename"], "filtered_data.csv")

    def test_parse_command_log(self):
        """A log command yields the free-text action description."""
        name, params = parse_command("Log action Filtered data for specific fruits")
        self.assertEqual(name, "log")
        self.assertEqual(params["action"], "Filtered data for specific fruits")

    def test_database_connection(self):
        """A returned connection must be live; otherwise it must be None."""
        connection = get_db_connection()
        # Connection may be None if not configured; adjust the test accordingly
        if not connection:
            self.assertIsNone(connection)
            return
        self.assertTrue(connection.is_connected())
        connection.close()
def main():
    """
    Initialize and run the application.

    Ensures the FLAN-T5 pipeline is available, then builds the Gradio
    interface and serves it on 0.0.0.0:7860. If the model cannot be
    loaded, an error is logged and nothing is launched.
    """
    logging.info("Starting the application...")
    # Reuse the pipeline already loaded at import time; only attempt another
    # load if that first attempt failed, to avoid loading the model twice.
    model = chat_pipeline if chat_pipeline is not None else load_model()
    if not model:
        logging.error("Failed to start the application.")
        return

    logging.info("Application started successfully.")
    print("Main function executed")
    print("Creating interface...")
    demo = create_interface()
    print("Launching interface...")
    demo.launch(server_name="0.0.0.0", server_port=7860)
# Main execution | |
if __name__ == "__main__":
    # Script entry point: set up the database, serve the UI, then run the tests.
    # NOTE(review): main() defined above is not invoked here; the interface is
    # built and launched directly with default launch() arguments.
    # Initialize database
    initialize_database()
    # Create and launch Gradio interface
    demo = create_interface()
    demo.launch()
    # Run automated tests
    # NOTE(review): Gradio's launch() blocks the main thread by default, so
    # this presumably only runs after the server stops — confirm intent.
    # exit=False keeps unittest from calling sys.exit() when it finishes.
    unittest.main(argv=[''], exit=False)