acecalisto3 committed
Update app.py
app.py CHANGED
@@ -36,6 +36,15 @@ def handle_error(error):
 def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
     urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
     handle_input(f"Start scraping {', '.join(urls)} every {scrape_interval} minutes.")
+    csv_file_path = f"{storage_location}/scraped_data.csv"
+    csv_fieldnames = ["date", "time", "url", "change"]
+
+    # Create the CSV file if it does not exist
+    if not os.path.exists(csv_file_path):
+        with open(csv_file_path, 'w', newline='') as csvfile:
+            csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+            csv_writer.writeheader()
+
     while True:
         # Check for scrape_interval
         time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
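The hunk above only creates the per-run log file; a later hunk appends to it. Below is a minimal, self-contained sketch of the same csv.DictWriter / DictReader round trip; the file name, URL and digest are illustrative placeholders, not values taken from app.py.

import csv
import datetime
import os

csv_file_path = "scraped_data_demo.csv"              # illustrative path, not the app's storage_location
csv_fieldnames = ["date", "time", "url", "change"]

# Create the CSV file with a header row if it does not exist yet
if not os.path.exists(csv_file_path):
    with open(csv_file_path, 'w', newline='') as csvfile:
        csv.DictWriter(csvfile, fieldnames=csv_fieldnames).writeheader()

# Append one bookkeeping row (the values are made up for the demo)
now = datetime.datetime.now()
with open(csv_file_path, 'a', newline='') as csvfile:
    csv.DictWriter(csvfile, fieldnames=csv_fieldnames).writerow({
        "date": now.strftime("%Y-%m-%d"),
        "time": now.strftime("%H:%M:%S"),
        "url": "https://www.culver.org/",
        "change": "d41d8cd98f00b204e9800998ecf8427e",  # placeholder MD5 hex digest
    })

# Read the log back; DictReader yields one dict per row, keyed by the header
with open(csv_file_path, newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        print(row["url"], row["change"])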
@@ -64,7 +73,30 @@ def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, u
         else:
             raise Exception('Invalid content type')
 
-        #
+        # Calculate hash of the content
+        content_hash = hashlib.md5(str(content).encode('utf-8')).hexdigest()
+
+        # Check if the content has changed
+        with open(csv_file_path, 'r', newline='') as csvfile:
+            csv_reader = csv.DictReader(csvfile)
+            rows = list(csv_reader)
+            if rows:
+                last_row = rows[-1]
+                if last_row['url'] == url and last_row['change'] == content_hash:
+                    print(f"No changes detected on {url}")
+                    continue
+
+        # Save data to CSV file
+        with open(csv_file_path, 'a', newline='') as csvfile:
+            csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+            csv_writer.writerow({
+                "date": datetime.datetime.now().strftime("%Y-%m-%d"),
+                "time": datetime.datetime.now().strftime("%H:%M:%S"),
+                "url": url,
+                "change": content_hash
+            })
+
+        # Save data to file
         with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}", 'w') as f:
             if content_type == 'text':
                 f.write(content)
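The change check above compares the new digest only against the last row of the whole CSV, so when several URLs are scraped in one loop that row may belong to a different site. A hedged sketch of the same MD5 idea with a per-URL cache follows; the helper names and the in-memory dict are my own, not from app.py.

import hashlib

def content_digest(content):
    # Same digest the commit uses: MD5 over the stringified content
    return hashlib.md5(str(content).encode('utf-8')).hexdigest()

last_digest = {}  # url -> digest of the last content seen for that url

def has_changed(url, content):
    digest = content_digest(content)
    if last_digest.get(url) == digest:
        return False              # identical to the previous fetch of this url
    last_digest[url] = digest
    return True

# Example with made-up payloads
print(has_changed("https://www.culver.org/", "<html>v1</html>"))  # True: first sighting
print(has_changed("https://www.culver.org/", "<html>v1</html>"))  # False: unchanged
print(has_changed("https://www.culver.org/", "<html>v2</html>"))  # True: content changed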
@@ -76,12 +108,9 @@ def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, u
             else:
                 raise Exception('Invalid content type')
 
-
-        inspector.end_transaction()
+        handle_output(f"Scraped {url} and saved data to {csv_file_path}")
         handle_output(f"Scraped {url} and saved data to {storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}")
-
-        # End transaction
-        inspector.end_transaction()
+        inspector.end_transaction()
 
         # Handle errors
         for error in ERROR_HISTORY:
@@ -95,14 +124,10 @@ def handle_system():
 
 def handle_ui(ui):
     # Start scraping
-
-
-    #
-
-    handle_error(error)
-
-    # Return scraping status
-    handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")
+    urls = ['https://www.culver.org/', 'https://www.culver.org/about-us/', 'https://www.culver.org/academics/', 'https://www.culver.org/athletics/', 'https://www.culver.org/arts-and-humanities/', 'https://www.culver.org/fine-and-performing-arts/', 'https://www.culver.org/clubs/', 'https://www.culver.org/community-education/', 'https://www.culver.org/community-outreach/']
+    scrape_interval = 5  # Define the scrape interval
+    content_type = 'text'  # Define the content type
+    start_scraping('scrape_data', *urls, scrape_interval, content_type)
 
 if __name__ == '__main__':
     # Read input
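start_scraping takes ten positional url parameters, while the urls list in this hunk holds nine entries, so *urls fills only url1 through url9 and the trailing scrape_interval and content_type arguments shift one slot to the left. One way to keep the positional arguments aligned is to pad the list to ten entries before unpacking; the sketch below uses a stand-in with the same signature, and its body is illustrative rather than app.py's.

def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7,
                   url8, url9, url10, scrape_interval, content_type):
    # Stand-in with the same signature, just to show the call shape
    urls = [u for u in (url1, url2, url3, url4, url5, url6, url7, url8, url9, url10) if u]
    print(f"Scraping {len(urls)} URLs every {scrape_interval} minutes as {content_type}")

urls = ['https://www.culver.org/', 'https://www.culver.org/about-us/']  # shortened list for the demo
padded = (urls + [None] * 10)[:10]   # pad out to the ten url parameters; None entries are filtered
start_scraping('scrape_data', *padded, scrape_interval=5, content_type='text')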