import csv
import datetime
import hashlib
import os
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from inspector import Configuration, Inspector

# Application monitoring (Inspector APM)
config = Configuration('5713ec1deb658fd2e6c069ce313ddaa34e2feee3')
inspector = Inspector(config)
inspector.start_transaction('my python script')

# Module-level state shared by the handlers below
CURRENT_INPUT = None
SYSTEM_OUTPUT = ""
OUTPUT_HISTORY = []
ERROR_HISTORY = []


def handle_input(value):
    """Record the most recent input so outputs and errors can be tied to it."""
    global CURRENT_INPUT
    CURRENT_INPUT = value


def handle_output(output):
    """Log an output message together with the input that produced it."""
    OUTPUT_HISTORY.append((CURRENT_INPUT, output))


def handle_system(output=None):
    """Update and/or report the current system-level status message."""
    global SYSTEM_OUTPUT
    if output is not None:
        SYSTEM_OUTPUT = output
    handle_output(f"System: {SYSTEM_OUTPUT}")


def handle_error(error):
    """Log an error together with the input that was being processed."""
    ERROR_HISTORY.append((CURRENT_INPUT, error))


def start_scraping(storage_location, urls, scrape_interval, content_type):
    """Scrape the given URLs every `scrape_interval` minutes and record changes."""
    urls = [url for url in urls if url]
    handle_input(f"Start scraping {', '.join(urls)} every {scrape_interval} minutes.")

    csv_file_path = os.path.join(storage_location, "scraped_data.csv")
    csv_fieldnames = ["date", "time", "url", "change"]

    # Create the storage directory and CSV file if they do not exist yet
    os.makedirs(storage_location, exist_ok=True)
    if not os.path.exists(csv_file_path):
        with open(csv_file_path, 'w', newline='') as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
            csv_writer.writeheader()

    while True:
        # Wait for the configured interval between scraping passes
        time.sleep(scrape_interval * 60)

        for url in urls:
            # Start a headless Chrome session for this URL
            options = Options()
            options.add_argument('--headless')
            driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=options,
            )
            try:
                driver.set_window_size(1920, 1080)
                driver.implicitly_wait(10)
                driver.get(url)

                # Wait until the page body is present before reading the source
                wait = WebDriverWait(driver, 10)
                wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

                soup = BeautifulSoup(driver.page_source, 'html.parser')
            except Exception as exc:
                handle_error(exc)
                continue
            finally:
                driver.quit()

            # Extract the requested kind of content
            if content_type == 'text':
                content = soup.get_text()
            elif content_type == 'media':
                content = [img['src'] for img in soup.find_all('img') if img.get('src')]
            else:
                raise ValueError('Invalid content type')

            # Hash the content so changes can be detected between passes
            content_hash = hashlib.md5(str(content).encode('utf-8')).hexdigest()

            # Skip the URL if its content is unchanged since the last record
            with open(csv_file_path, 'r', newline='') as csvfile:
                rows = list(csv.DictReader(csvfile))
            if rows:
                last_row = rows[-1]
                if last_row['url'] == url and last_row['change'] == content_hash:
                    print(f"No changes detected on {url}")
                    continue

            # Append the change record to the CSV file
            now = datetime.datetime.now()
            with open(csv_file_path, 'a', newline='') as csvfile:
                csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
                csv_writer.writerow({
                    "date": now.strftime("%Y-%m-%d"),
                    "time": now.strftime("%H:%M:%S"),
                    "url": url,
                    "change": content_hash,
                })

            # Save the scraped content under a per-page directory
            page_dir = os.path.join(storage_location, url.rstrip('/').split('/')[-1])
            os.makedirs(page_dir, exist_ok=True)
            if content_type == 'text':
                scrape_path = os.path.join(page_dir, f"scrape.{content_type}")
                with open(scrape_path, 'w') as f:
                    f.write(content)
            else:  # content_type == 'media'
                scrape_path = os.path.join(page_dir, "scrape_media")
                os.makedirs(scrape_path, exist_ok=True)
                for img_url in content:
                    response = requests.get(img_url)
                    img_name = hashlib.md5(response.content).hexdigest()[:10]
                    with open(os.path.join(scrape_path, f"{img_name}.jpg"), 'wb') as img_file:
                        img_file.write(response.content)

            handle_output(f"Scraped {url} and saved data to {csv_file_path}")
            handle_output(f"Scraped {url} and saved data to {scrape_path}")

        inspector.end_transaction()

        # Report any errors collected so far
        for recorded_input, error in ERROR_HISTORY:
            handle_output(f"Error while processing '{recorded_input}': {error}")

        # Report scraping status
        handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")


def handle_ui():
    """Start scraping with the hard-coded URL list and settings."""
    urls = [
        'https://www.culver.org/',
        'https://www.culver.org/about-us/',
        'https://www.culver.org/academics/',
        'https://www.culver.org/athletics/',
        'https://www.culver.org/arts-and-humanities/',
        'https://www.culver.org/fine-and-performing-arts/',
        'https://www.culver.org/clubs/',
        'https://www.culver.org/community-education/',
        'https://www.culver.org/community-outreach/',
    ]
    scrape_interval = 5    # minutes between scraping passes
    content_type = 'text'  # 'text' or 'media'
    start_scraping('scrape_data', urls, scrape_interval, content_type)


if __name__ == '__main__':
    # Read input
    user_input = "Start scraping https://www.culver.org/ and save data to scrape_data directory."

    # Call functions
    handle_input(user_input)
    handle_system()

    # Run system
    handle_ui()