acecalisto3 committed on
Commit 35d8738 · verified · 1 Parent(s): 17ea76e

Update app.py

Files changed (1)
  1. app.py +39 -14
app.py CHANGED
@@ -36,6 +36,15 @@ def handle_error(error):
 def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
     urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
     handle_input(f"Start scraping {', '.join(urls)} every {scrape_interval} minutes.")
+    csv_file_path = f"{storage_location}/scraped_data.csv"
+    csv_fieldnames = ["date", "time", "url", "change"]
+
+    # Create the CSV file if it does not exist
+    if not os.path.exists(csv_file_path):
+        with open(csv_file_path, 'w', newline='') as csvfile:
+            csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+            csv_writer.writeheader()
+
     while True:
         # Check for scrape_interval
         time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
@@ -64,7 +73,30 @@ def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, u
             else:
                 raise Exception('Invalid content type')

-            # Save data
+            # Calculate hash of the content
+            content_hash = hashlib.md5(str(content).encode('utf-8')).hexdigest()
+
+            # Check if the content has changed
+            with open(csv_file_path, 'r', newline='') as csvfile:
+                csv_reader = csv.DictReader(csvfile)
+                rows = list(csv_reader)
+                if rows:
+                    last_row = rows[-1]
+                    if last_row['url'] == url and last_row['change'] == content_hash:
+                        print(f"No changes detected on {url}")
+                        continue
+
+            # Save data to CSV file
+            with open(csv_file_path, 'a', newline='') as csvfile:
+                csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+                csv_writer.writerow({
+                    "date": datetime.datetime.now().strftime("%Y-%m-%d"),
+                    "time": datetime.datetime.now().strftime("%H:%M:%S"),
+                    "url": url,
+                    "change": content_hash
+                })
+
+            # Save data to file
             with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}", 'w') as f:
                 if content_type == 'text':
                     f.write(content)
@@ -76,12 +108,9 @@ def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, u
                 else:
                     raise Exception('Invalid content type')

-            # End transaction
-            inspector.end_transaction()
+            handle_output(f"Scraped {url} and saved data to {csv_file_path}")
             handle_output(f"Scraped {url} and saved data to {storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}")
-
-            # End transaction
-            inspector.end_transaction()
+            inspector.end_transaction()

         # Handle errors
         for error in ERROR_HISTORY:
@@ -95,14 +124,10 @@ def handle_system():

 def handle_ui(ui):
     # Start scraping
-    start_scraping('scrape_data', 'https://www.culver.org/', 'https://www.culver.org/about-us/', 'https://www.culver.org/academics/', 'https://www.culver.org/athletics/', 'https://www.culver.org/arts-and-humanities/', 'https://www.culver.org/fine-and-performing-arts/', 'https://www.culver.org/clubs/', 'https://www.culver.org/community-education/', 'https://www.culver.org/community-outreach/')
-
-    # Handle errors
-    for error in ERROR_HISTORY:
-        handle_error(error)
-
-    # Return scraping status
-    handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")
+    urls = ['https://www.culver.org/', 'https://www.culver.org/about-us/', 'https://www.culver.org/academics/', 'https://www.culver.org/athletics/', 'https://www.culver.org/arts-and-humanities/', 'https://www.culver.org/fine-and-performing-arts/', 'https://www.culver.org/clubs/', 'https://www.culver.org/community-education/', 'https://www.culver.org/community-outreach/']
+    scrape_interval = 5  # Define the scrape interval
+    content_type = 'text'  # Define the content type
+    start_scraping('scrape_data', *urls, scrape_interval, content_type)

 if __name__ == '__main__':
     # Read input
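For reference, the added hunks implement a lightweight change log: each scrape result is hashed with MD5 and compared against the last row of scraped_data.csv before a new row is appended. A minimal standalone sketch of that pattern follows; the record_if_changed helper and the sample call are illustrative only and are not part of app.py.

import csv
import datetime
import hashlib
import os

def record_if_changed(csv_file_path, url, content):
    """Append a row to the change log only when the content hash differs
    from the most recently logged row (hypothetical helper, mirroring the
    logic added in this commit)."""
    fieldnames = ["date", "time", "url", "change"]

    # Create the CSV log with a header row on first use.
    if not os.path.exists(csv_file_path):
        with open(csv_file_path, 'w', newline='') as csvfile:
            csv.DictWriter(csvfile, fieldnames=fieldnames).writeheader()

    # Hash the scraped content so changes can be detected cheaply.
    content_hash = hashlib.md5(str(content).encode('utf-8')).hexdigest()

    # Compare against the last logged row; skip the write if nothing changed.
    with open(csv_file_path, 'r', newline='') as csvfile:
        rows = list(csv.DictReader(csvfile))
    if rows and rows[-1]['url'] == url and rows[-1]['change'] == content_hash:
        return False  # no change detected

    # Append a timestamped row recording the new hash.
    now = datetime.datetime.now()
    with open(csv_file_path, 'a', newline='') as csvfile:
        csv.DictWriter(csvfile, fieldnames=fieldnames).writerow({
            "date": now.strftime("%Y-%m-%d"),
            "time": now.strftime("%H:%M:%S"),
            "url": url,
            "change": content_hash
        })
    return True

if __name__ == '__main__':
    changed = record_if_changed("scraped_data.csv", "https://www.culver.org/", "<html>example</html>")
    print("change recorded" if changed else "no changes detected")

Note that the committed check compares only against the CSV's most recent row, so a page counts as unchanged only when it was also the last URL scraped; keying the lookup by URL would make the check independent of scrape order.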