acecalisto3 committed
Commit b9f24c9 · verified · 1 Parent(s): ca646d2

Update app.py

Files changed (1)
  1. app.py +17 -76
app.py CHANGED
@@ -1,4 +1,3 @@
-```python
 import os
 import time
 import hashlib
@@ -30,17 +29,14 @@ HISTORY = []
 CURRENT_TASK = None
 STOP_THREADS = False
 
-# Define the list of URLs to monitor (you can add more URLs here)
-URLS_TO_MONITOR = ["https://twitter.com/wlcscrdp", "https://www.facebook.com/aurorareddevils/", "https://www.facebook.com/brightpanthers/", "https://www.facebook.com/carrollcountychamberin/", "https://www.facebook.com/Culver.Cavs.MHS", "https://www.facebook.com/culver.elementary.school", "https://www.facebook.com/CulverCommunitySchools", "https://www.facebook.com/DillsboroBulldogs/", "https://www.facebook.com/ECMSTROJANS", "https://www.facebook.com/enjoywhitecountyIN/", "https://www.facebook.com/farmersvilleelementary", "https://www.facebook.com/groups/SDMSparents", "https://www.facebook.com/jghsart/", "https://www.facebook.com/jgmusicdept", "https://www.facebook.com/John-Glenn-Education-Foundation-208326199636364/", "https://www.facebook.com/John-Glenn-High-School-1102148953201006/", "https://www.facebook.com/John-Glenn-Theatre-Company-383638295064502/", "https://www.facebook.com/JohnGlennFalconsAthletics", "https://www.facebook.com/KIRPC-Head-Start-1485812354989001", "https://www.facebook.com/KIRPC1", "https://www.facebook.com/LHNEeagles", "https://www.facebook.com/LuceElementarySchool/", "https://www.facebook.com/marrselementary", "https://www.facebook.com/messhiners/", "https://www.facebook.com/monticellocitypool", "https://www.facebook.com/monticelloinwastewater/", "https://www.facebook.com/MooresHillBobcats/", "https://www.facebook.com/msdmv", "https://www.facebook.com/msdnorthposey", "https://www.facebook.com/MUTPL/", "https://www.facebook.com/MVJHS/", "https://www.facebook.com/mvshs", "https://www.facebook.com/njspjrsrhighschool?mibextid=b06tZ0", "https://www.facebook.com/NorthElementaryStars/", "https://www.facebook.com/NorthLibertyElementary/", "https://www.facebook.com/northposey/", "https://www.facebook.com/northposeyhs/", "https://www.facebook.com/NPJuniorHigh", "https://www.facebook.com/Prairie-Heights-Elementary-659322230934707/", "https://www.facebook.com/Prairie-Heights-High-School-2027713067459043/", "https://www.facebook.com/PrairieHeightsPanthers/", "https://www.facebook.com/profile.php?id=100057030237096", "https://www.facebook.com/profile.php?id=100057451179651", "https://www.facebook.com/profile.php?id=100063463513451", "https://www.facebook.com/profile.php?id=100063612319256", "https://www.facebook.com/profile.php?id=100064532596422", "https://www.facebook.com/profile.php?id=100067180226810", "https://www.facebook.com/profile.php?id=61563484312348", "https://www.facebook.com/PTOSWES/", "https://www.facebook.com/RandolphSouthern/", "https://www.facebook.com/RochesterMiddleSchool", "https://www.facebook.com/RochesterZebraNewTechHigh", "https://www.facebook.com/rockportelementarysouthspencer/", "https://www.facebook.com/satellitesathletics/", "https://www.facebook.com/seymourcommunityschools/", "https://www.facebook.com/SeymourHighSchool/", "https://www.facebook.com/SouthDearbornHighSchool/", "https://www.facebook.com/southarbornschools/", "https://www.facebook.com/SouthDearbornSquires/", "https://www.facebook.com/southspencerhighschool", "https://www.facebook.com/southspencermiddleschool/", "https://www.facebook.com/SouthSpencerSchools", "https://www.facebook.com/SouthTerracePanthers/", "https://www.facebook.com/sunmantigers/", "https://www.facebook.com/SWShelbySpartan/", "https://www.facebook.com/TallTimbersMarina", "https://www.facebook.com/WabashValleyESC/", "https://www.facebook.com/Walkerton-Elementary-School-283088605088622/", "https://www.facebook.com/westcentralcte/", "https://www.facebook.com/westelementary", "https://www.facebook.com/wlcscrdp", "https://www.instagram.com/mutpl/", "https://www.instagram.com/northposeyhsathletics", "https://www.instagram.com/rchsprincipalcook/", "https://www.instagram.com/southdearbornhighschool/", "https://www.instagram.com/southdearbornschools/", "https://www.instagram.com/westcentralcte/", "https://www.tiktok.com/@mutplteen"]
-
-# Function to monitor URLs for changes
-def monitor_urls(storage_location, urls, scrape_interval, content_type):
-    global HISTORY, STOP_THREADS
+# Define a function to monitor URLs for changes
+def monitor_urls(storage_location, urls, scrape_interval, content_type, stop_scraping_flag):
+    global HISTORY
     previous_hashes = {url: "" for url in urls}  # Use a dictionary for better organization
 
     try:
         with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
-            while not STOP_THREADS:
+            while not stop_scraping_flag[0]:
                 for url in urls:
                     try:
                         driver.get(url)
@@ -49,14 +45,9 @@ def monitor_urls(storage_location, urls, scrape_interval, content_type):
                         current_content = driver.page_source
                     elif content_type == "media":
                         current_content = driver.find_elements(By.TAG_NAME, "img")
-                    elif content_type == "both":
-                        current_content = driver.page_source + str(driver.find_elements(By.TAG_NAME, "img"))
                     else:
                         current_content = driver.page_source
-
-                    # Calculate hash based on selected content type
                     current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-
                     if current_hash != previous_hashes[url]:
                         previous_hashes[url] = current_hash
                         date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -65,17 +56,15 @@ def monitor_urls(storage_location, urls, scrape_interval, content_type):
                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
                             logging.info(f"Change detected at {url} on {date_time_str}")
-
                     except (NoSuchElementException, Exception) as e:
                         logging.error(f"Error accessing {url}: {e}")
                 time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
     except Exception as e:
         logging.error(f"Error starting ChromeDriver: {e}")
 
-# Function to start scraping
-def start_scraping(storage_location, urls, scrape_interval, content_type):
-    global CURRENT_TASK, HISTORY, STOP_THREADS
-    STOP_THREADS = False  # Reset stop flag
+# Define a function to start scraping
+def start_scraping(storage_location, urls, scrape_interval, content_type, stop_scraping_flag):
+    global CURRENT_TASK, HISTORY
 
     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
     HISTORY.append(f"Task started: {CURRENT_TASK}")
@@ -95,8 +84,6 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
                     initial_content = driver.page_source
                 elif content_type == "media":
                     initial_content = driver.find_elements(By.TAG_NAME, "img")
-                elif content_type == "both":
-                    initial_content = driver.page_source + str(driver.find_elements(By.TAG_NAME, "img"))
                 else:
                     initial_content = driver.page_source
                 initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
@@ -106,64 +93,17 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
         except (NoSuchElementException, Exception) as e:
             HISTORY.append(f"Error accessing {url}: {e}")
 
-    # Start a separate thread for monitoring
-    thread = threading.Thread(target=monitor_urls, args=(storage_location, urls, scrape_interval, content_type))
-    thread.start()
+    # Start a new thread for monitoring URLs
+    threading.Thread(target=monitor_urls, args=(storage_location, urls, scrape_interval, content_type, stop_scraping_flag)).start()
 
     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
 
-# Function to display CSV content
-def display_csv(url):
-    hostname = urlparse(url).hostname
-    folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
-    csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
-    if os.path.exists(csv_path):
-        with open(csv_path, "r") as file:
-            return file.read()
-    else:
-        return "No data available."
-
-# Define the chat response function using the Mistral model
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    model = AutoModelForSeq2SeqLM.from_pretrained_model("mistralai/Mixtral-8x7B-Instruct-v0.1")
-    tokenizer = AutoTokenizer.from_pretrained_model("mistralai/Mixtral-8x7B-Instruct-v0.1")
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-    response = pipe(f"User: {message}\nHistory: {history}\nSystem: {system_message}", max_length=max_tokens, temperature=temperature, top_p=top_p)[0]
-    return response
-
-# Function to generate RSS feed for a given URL
-def generate_rss_feed(url):
-    hostname = urlparse(url).hostname
-    folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
-    csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
-    if os.path.exists(csv_path):
-        with open(csv_path, "r") as file:
-            reader = csv.DictReader(file)
-            feed = feedparser.parse(f"rss.xml")  # Create a new feed object
-            feed.feed.title = f"Changes for {hostname}"
-            feed.feed.link = url
-            feed.feed.description = "Recent changes detected on the website."
-            feed.entries = []
-            for row in reader:
-                feed.entries.append({
-                    "title": f"Change detected at {row['url']}",
-                    "link": row['url'],
-                    "description": f"Content changed on {row['date']} at {row['time']}",
-                    "published": datetime.datetime.strptime(f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S").isoformat(),
-                })
-            return feed.entries
-    else:
-        return "No data available."
-
-# Function to handle user input and generate response
-def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type):
-    response = respond(message, history, system_message, max_tokens, temperature, top_p)
-    history.append((message, response))
-    return history, response
-
-# Gradio Interface
-import gradio as gr
+# Define a function to stop scraping
+def stop_scraping(stop_scraping_flag):
+    stop_scraping_flag[0] = True
+    return "Scraping stopped."
 
+# Define the Gradio interface
 def create_interface():
     with gr.Blocks() as demo:
         with gr.Row():
@@ -185,8 +125,9 @@ def create_interface():
         chat_history = gr.Chatbot(label="Chat History")
        response_box = gr.Textbox(label="Response")
 
-        start_button.click(start_scraping, inputs=[storage_location, urls, scrape_interval, content_type], outputs=csv_output)
-        stop_button.click(lambda: (STOP_THREADS, "Scraping stopped."), outputs=[stop_button, csv_output])
+        stop_scraping_flag = [False]
+        start_button.click(start_scraping, inputs=[storage_location, urls, scrape_interval, content_type, stop_scraping_flag], outputs=csv_output)
+        stop_button.click(stop_scraping, inputs=[stop_scraping_flag], outputs=[csv_output])
         message.submit(chat_interface, inputs=[message, chat_history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type], outputs=[chat_history, response_box])
 
         # Add a button to display the CSV content for a selected URL
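The heart of this commit is the shutdown mechanism: the STOP_THREADS global gives way to a stop_scraping_flag list shared between the Gradio callbacks and the monitoring thread. Below is a minimal, runnable sketch of that pattern; monitor and stop_flag are hypothetical stand-ins for the app's Selenium fetch-and-hash loop, not code from the commit:

```python
import threading
import time

def monitor(urls, stop_flag):
    # Poll until another thread flips the shared flag (mirrors monitor_urls).
    while not stop_flag[0]:
        for url in urls:
            print(f"checking {url}")  # stand-in for the fetch-and-hash step
        time.sleep(1)  # stand-in for scrape_interval * 60

stop_flag = [False]  # a one-element list is mutable, so the worker sees updates
worker = threading.Thread(target=monitor, args=(["https://example.com"], stop_flag))
worker.start()

time.sleep(3)
stop_flag[0] = True  # what stop_scraping() does in the commit
worker.join()
```

Because the list is shared by reference, mutating stop_flag[0] is visible to the worker without any global statement; a threading.Event with set()/is_set() would be the more idiomatic primitive for the same job.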
 
 
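The change-detection logic itself is untouched by the commit: each pass reduces a URL's content to an MD5 digest and compares it with the digest recorded on the previous pass. A small sketch of that comparison, with fetch_content as a hypothetical stand-in for the driver.page_source call:

```python
import hashlib

def content_hash(content) -> str:
    # Same approach as app.py: stringify, encode, MD5-digest.
    return hashlib.md5(str(content).encode("utf-8")).hexdigest()

previous_hashes = {"https://example.com": ""}

def check_for_change(url, fetch_content):
    # Return True when the fetched content differs from the last recorded hash.
    current_hash = content_hash(fetch_content(url))
    if current_hash != previous_hashes[url]:
        previous_hashes[url] = current_hash
        return True
    return False

# The first check always reports a change; an identical second fetch does not.
assert check_for_change("https://example.com", lambda u: "<html>v1</html>") is True
assert check_for_change("https://example.com", lambda u: "<html>v1</html>") is False
```

MD5 is adequate here because the digest only answers "did anything change?"; nothing security-sensitive depends on it.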