acecalisto3 committed
Update app.py
app.py CHANGED
@@ -36,6 +36,15 @@ def handle_error(error):
 def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
     urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
     handle_input(f"Start scraping {', '.join(urls)} every {scrape_interval} minutes.")
+    csv_file_path = f"{storage_location}/scraped_data.csv"
+    csv_fieldnames = ["date", "time", "url", "change"]
+
+    # Create the CSV file if it does not exist
+    if not os.path.exists(csv_file_path):
+        with open(csv_file_path, 'w', newline='') as csvfile:
+            csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+            csv_writer.writeheader()
+
     while True:
         # Check for scrape_interval
         time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
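The hunk above only creates the per-run log file; a later hunk appends to it. Below is a minimal, self-contained sketch of the same csv.DictWriter / DictReader round trip; the file name, URL and digest are illustrative placeholders, not values taken from app.py.

import csv
import datetime
import os

csv_file_path = "scraped_data_demo.csv"              # illustrative path, not the app's storage_location
csv_fieldnames = ["date", "time", "url", "change"]

# Create the CSV file with a header row if it does not exist yet
if not os.path.exists(csv_file_path):
    with open(csv_file_path, 'w', newline='') as csvfile:
        csv.DictWriter(csvfile, fieldnames=csv_fieldnames).writeheader()

# Append one bookkeeping row (the values are made up for the demo)
now = datetime.datetime.now()
with open(csv_file_path, 'a', newline='') as csvfile:
    csv.DictWriter(csvfile, fieldnames=csv_fieldnames).writerow({
        "date": now.strftime("%Y-%m-%d"),
        "time": now.strftime("%H:%M:%S"),
        "url": "https://www.culver.org/",
        "change": "d41d8cd98f00b204e9800998ecf8427e",  # placeholder MD5 hex digest
    })

# Read the log back; DictReader yields one dict per row, keyed by the header
with open(csv_file_path, newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        print(row["url"], row["change"])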
@@ -64,7 +73,30 @@ def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, u
         else:
             raise Exception('Invalid content type')
 
-        #
+        # Calculate hash of the content
+        content_hash = hashlib.md5(str(content).encode('utf-8')).hexdigest()
+
+        # Check if the content has changed
+        with open(csv_file_path, 'r', newline='') as csvfile:
+            csv_reader = csv.DictReader(csvfile)
+            rows = list(csv_reader)
+            if rows:
+                last_row = rows[-1]
+                if last_row['url'] == url and last_row['change'] == content_hash:
+                    print(f"No changes detected on {url}")
+                    continue
+
+        # Save data to CSV file
+        with open(csv_file_path, 'a', newline='') as csvfile:
+            csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+            csv_writer.writerow({
+                "date": datetime.datetime.now().strftime("%Y-%m-%d"),
+                "time": datetime.datetime.now().strftime("%H:%M:%S"),
+                "url": url,
+                "change": content_hash
+            })
+
+        # Save data to file
         with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}", 'w') as f:
             if content_type == 'text':
                 f.write(content)
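The change check above compares the new digest only against the last row of the whole CSV, so when several URLs are scraped in one loop that row may belong to a different site. A hedged sketch of the same MD5 idea with a per-URL cache follows; the helper names and the in-memory dict are my own, not from app.py.

import hashlib

def content_digest(content):
    # Same digest the commit uses: MD5 over the stringified content
    return hashlib.md5(str(content).encode('utf-8')).hexdigest()

last_digest = {}  # url -> digest of the last content seen for that url

def has_changed(url, content):
    digest = content_digest(content)
    if last_digest.get(url) == digest:
        return False              # identical to the previous fetch of this url
    last_digest[url] = digest
    return True

# Example with made-up payloads
print(has_changed("https://www.culver.org/", "<html>v1</html>"))  # True: first sighting
print(has_changed("https://www.culver.org/", "<html>v1</html>"))  # False: unchanged
print(has_changed("https://www.culver.org/", "<html>v2</html>"))  # True: content changed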
@@ -76,12 +108,9 @@ def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, u
             else:
                 raise Exception('Invalid content type')
 
-
-        inspector.end_transaction()
+        handle_output(f"Scraped {url} and saved data to {csv_file_path}")
         handle_output(f"Scraped {url} and saved data to {storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}")
-
-        # End transaction
-        inspector.end_transaction()
+        inspector.end_transaction()
 
         # Handle errors
         for error in ERROR_HISTORY:
@@ -95,14 +124,10 @@ def handle_system():
 
 def handle_ui(ui):
     # Start scraping
-
-
-    #
-
-    handle_error(error)
-
-    # Return scraping status
-    handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")
+    urls = ['https://www.culver.org/', 'https://www.culver.org/about-us/', 'https://www.culver.org/academics/', 'https://www.culver.org/athletics/', 'https://www.culver.org/arts-and-humanities/', 'https://www.culver.org/fine-and-performing-arts/', 'https://www.culver.org/clubs/', 'https://www.culver.org/community-education/', 'https://www.culver.org/community-outreach/']
+    scrape_interval = 5  # Define the scrape interval
+    content_type = 'text'  # Define the content type
+    start_scraping('scrape_data', *urls, scrape_interval, content_type)
 
 if __name__ == '__main__':
     # Read input
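start_scraping takes ten positional url parameters, while the urls list in this hunk holds nine entries, so *urls fills only url1 through url9 and the trailing scrape_interval and content_type arguments shift one slot to the left. One way to keep the positional arguments aligned is to pad the list to ten entries before unpacking; the sketch below uses a stand-in with the same signature, and its body is illustrative rather than app.py's.

def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7,
                   url8, url9, url10, scrape_interval, content_type):
    # Stand-in with the same signature, just to show the call shape
    urls = [u for u in (url1, url2, url3, url4, url5, url6, url7, url8, url9, url10) if u]
    print(f"Scraping {len(urls)} URLs every {scrape_interval} minutes as {content_type}")

urls = ['https://www.culver.org/', 'https://www.culver.org/about-us/']  # shortened list for the demo
padded = (urls + [None] * 10)[:10]   # pad out to the ten url parameters; None entries are filtered
start_scraping('scrape_data', *padded, scrape_interval=5, content_type='text')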