acecalisto3 committed
Commit ca646d2 · verified · 1 Parent(s): 827e96b

Update app.py
Files changed (1)
  1. app.py +206 -47
app.py CHANGED
@@ -1,53 +1,212 @@
 
 import os
- from agent import start_scraping, display_csv, generate_rss_feed, chat_interface
- import gradio as gr

- def chatbot_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type, selector):
-     history, response = chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type, selector)
     return history, response

- def generate_rss(storage_location, url):
-     feed_entries = generate_rss_feed(storage_location, url)
-     return feedparser.FeedParserDict(feed_entries)
-
- def main():
-     storage_location = "scraped_data"
-     urls = ["https://www.culvers.com/"]
-     scrape_interval = 5
-     content_type = "text"
-     selector = ""
-
-     chatbot_input = gr.inputs.Textbox(lines=5, label="Chatbot Input")
-     history_output = gr.outputs.Textbox(label="History")
-     response_output = gr.outputs.Textbox(label="Response")
-
-     csv_input = gr.inputs.Textbox(lines=1, label="CSV Input URL")
-     csv_output = gr.outputs.Textbox(label="CSV Output")
-
-     rss_input = gr.inputs.Textbox(lines=1, label="RSS Input URL")
-     rss_output = gr.outputs.HTML(label="RSS Output")
-
-     chatbot_interface = gr.Interface(
-         chatbot_interface,
-         [chatbot_input, history_output, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type, selector],
-         [history_output, response_output]
-     )
-
-     csv_interface = gr.Interface(
-         lambda url: display_csv(storage_location, url),
-         [csv_input],
-         [csv_output]
-     )
-
-     rss_interface = gr.Interface(
-         generate_rss,
-         [rss_input, storage_location],
-         [rss_output]
-     )
-
-     chatbot_interface.launch()
-     csv_interface.launch()
-     rss_interface.launch()

 if __name__ == "__main__":
-     main()

 import os
+ import time
+ import hashlib
+ import logging
+ import datetime
+ import csv
+ from urllib.parse import urlparse
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.common.keys import Keys
+ from selenium.common.exceptions import NoSuchElementException
+ from webdriver_manager.chrome import ChromeDriverManager
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from transformers import pipeline
+ import feedparser
+ from bs4 import BeautifulSoup
+ import threading
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ # Define constants
+ DEFAULT_FILE_PATH = "scraped_data"
+ PURPOSE = "You go to Culvers sites and continuously seek changes on them since your last observation. Anything new gets logged and dumped into CSV, stored in your log folder at user/app/scraped_data."
+ HISTORY = []
+ CURRENT_TASK = None
+ STOP_THREADS = False
+
+ # Define the list of URLs to monitor (you can add more URLs here)
+ URLS_TO_MONITOR = ["https://twitter.com/wlcscrdp", "https://www.facebook.com/aurorareddevils/", "https://www.facebook.com/brightpanthers/", "https://www.facebook.com/carrollcountychamberin/", "https://www.facebook.com/Culver.Cavs.MHS", "https://www.facebook.com/culver.elementary.school", "https://www.facebook.com/CulverCommunitySchools", "https://www.facebook.com/DillsboroBulldogs/", "https://www.facebook.com/ECMSTROJANS", "https://www.facebook.com/enjoywhitecountyIN/", "https://www.facebook.com/farmersvilleelementary", "https://www.facebook.com/groups/SDMSparents", "https://www.facebook.com/jghsart/", "https://www.facebook.com/jgmusicdept", "https://www.facebook.com/John-Glenn-Education-Foundation-208326199636364/", "https://www.facebook.com/John-Glenn-High-School-1102148953201006/", "https://www.facebook.com/John-Glenn-Theatre-Company-383638295064502/", "https://www.facebook.com/JohnGlennFalconsAthletics", "https://www.facebook.com/KIRPC-Head-Start-1485812354989001", "https://www.facebook.com/KIRPC1", "https://www.facebook.com/LHNEeagles", "https://www.facebook.com/LuceElementarySchool/", "https://www.facebook.com/marrselementary", "https://www.facebook.com/messhiners/", "https://www.facebook.com/monticellocitypool", "https://www.facebook.com/monticelloinwastewater/", "https://www.facebook.com/MooresHillBobcats/", "https://www.facebook.com/msdmv", "https://www.facebook.com/msdnorthposey", "https://www.facebook.com/MUTPL/", "https://www.facebook.com/MVJHS/", "https://www.facebook.com/mvshs", "https://www.facebook.com/njspjrsrhighschool?mibextid=b06tZ0", "https://www.facebook.com/NorthElementaryStars/", "https://www.facebook.com/NorthLibertyElementary/", "https://www.facebook.com/northposey/", "https://www.facebook.com/northposeyhs/", "https://www.facebook.com/NPJuniorHigh", "https://www.facebook.com/Prairie-Heights-Elementary-659322230934707/", "https://www.facebook.com/Prairie-Heights-High-School-2027713067459043/", "https://www.facebook.com/PrairieHeightsPanthers/", "https://www.facebook.com/profile.php?id=100057030237096", "https://www.facebook.com/profile.php?id=100057451179651", "https://www.facebook.com/profile.php?id=100063463513451", "https://www.facebook.com/profile.php?id=100063612319256", "https://www.facebook.com/profile.php?id=100064532596422", "https://www.facebook.com/profile.php?id=100067180226810", "https://www.facebook.com/profile.php?id=61563484312348", "https://www.facebook.com/PTOSWES/", "https://www.facebook.com/RandolphSouthern/", "https://www.facebook.com/RochesterMiddleSchool", "https://www.facebook.com/RochesterZebraNewTechHigh", "https://www.facebook.com/rockportelementarysouthspencer/", "https://www.facebook.com/satellitesathletics/", "https://www.facebook.com/seymourcommunityschools/", "https://www.facebook.com/SeymourHighSchool/", "https://www.facebook.com/SouthDearbornHighSchool/", "https://www.facebook.com/southarbornschools/", "https://www.facebook.com/SouthDearbornSquires/", "https://www.facebook.com/southspencerhighschool", "https://www.facebook.com/southspencermiddleschool/", "https://www.facebook.com/SouthSpencerSchools", "https://www.facebook.com/SouthTerracePanthers/", "https://www.facebook.com/sunmantigers/", "https://www.facebook.com/SWShelbySpartan/", "https://www.facebook.com/TallTimbersMarina", "https://www.facebook.com/WabashValleyESC/", "https://www.facebook.com/Walkerton-Elementary-School-283088605088622/", "https://www.facebook.com/westcentralcte/", "https://www.facebook.com/westelementary", "https://www.facebook.com/wlcscrdp", "https://www.instagram.com/mutpl/", "https://www.instagram.com/northposeyhsathletics", "https://www.instagram.com/rchsprincipalcook/", "https://www.instagram.com/southdearbornhighschool/", "https://www.instagram.com/southdearbornschools/", "https://www.instagram.com/westcentralcte/", "https://www.tiktok.com/@mutplteen"]
+
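+ # Illustrative check (assumption, not wired in): filter malformed entries out of
+ # a hand-typed URL list like the one above before handing it to the scraper.
+ def valid_urls(urls):
+     return [u for u in urls if urlparse(u).scheme in ("http", "https") and urlparse(u).hostname]
+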
+ # Function to monitor URLs for changes
+ def monitor_urls(storage_location, urls, scrape_interval, content_type):
+     global HISTORY, STOP_THREADS
+     previous_hashes = {url: "" for url in urls}  # Use a dictionary for better organization
+
+     try:
+         with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
+             while not STOP_THREADS:
+                 for url in urls:
+                     try:
+                         driver.get(url)
+                         time.sleep(2)  # Wait for the page to load
+                         if content_type == "text":
+                             current_content = driver.page_source
+                         elif content_type == "media":
+                             current_content = driver.find_elements(By.TAG_NAME, "img")
+                         elif content_type == "both":
+                             current_content = driver.page_source + str(driver.find_elements(By.TAG_NAME, "img"))
+                         else:
+                             current_content = driver.page_source
+
+                         # Calculate hash based on selected content type
+                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
+
+                         if current_hash != previous_hashes[url]:
+                             previous_hashes[url] = current_hash
+                             date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                             HISTORY.append(f"Change detected at {url} on {date_time_str}")
+                             hostname = urlparse(url).hostname
+                             # Write into the same per-host folder that display_csv reads from
+                             csv_path = os.path.join(storage_location, hostname, f"{hostname}_changes.csv")
+                             write_header = not os.path.exists(csv_path)
+                             with open(csv_path, "a", newline="") as csvfile:
+                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
+                                 if write_header:
+                                     csv_writer.writeheader()
+                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
+                             logging.info(f"Change detected at {url} on {date_time_str}")
+
+                     except Exception as e:
+                         logging.error(f"Error accessing {url}: {e}")
+                 time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+     except Exception as e:
+         logging.error(f"Error starting ChromeDriver: {e}")
+
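+ # Illustrative variant (assumption, not wired in): hashing only the visible text
+ # via BeautifulSoup (imported above but otherwise unused) ignores markup churn
+ # such as rotating tokens or ad tags, reducing false-positive change reports.
+ def text_only_hash(page_source):
+     visible_text = BeautifulSoup(page_source, "html.parser").get_text(separator=" ", strip=True)
+     return hashlib.md5(visible_text.encode("utf-8")).hexdigest()
+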
+ # Function to start scraping
+ def start_scraping(storage_location, urls, scrape_interval, content_type):
+     global CURRENT_TASK, HISTORY, STOP_THREADS
+     STOP_THREADS = False  # Reset stop flag
+
+     # The Gradio textbox delivers the URLs as a single comma-separated string
+     if isinstance(urls, str):
+         urls = [u.strip() for u in urls.split(",") if u.strip()]
+
+     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
+     HISTORY.append(f"Task started: {CURRENT_TASK}")
+
+     for url in urls:
+         # Create a folder for the URL
+         hostname = urlparse(url).hostname
+         folder_path = os.path.join(storage_location, hostname)
+         os.makedirs(folder_path, exist_ok=True)
+
+         # Log the initial observation
+         try:
+             with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
+                 driver.get(url)
+                 time.sleep(2)  # Wait for the page to load
+                 if content_type == "text":
+                     initial_content = driver.page_source
+                 elif content_type == "media":
+                     initial_content = driver.find_elements(By.TAG_NAME, "img")
+                 elif content_type == "both":
+                     initial_content = driver.page_source + str(driver.find_elements(By.TAG_NAME, "img"))
+                 else:
+                     initial_content = driver.page_source
+                 initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
+                 HISTORY.append(f"Initial observation at {url}: {initial_hash}")
+                 with open(os.path.join(folder_path, f"{hostname}_initial_observation.txt"), "w") as file:
+                     file.write(f"Initial observation at {url}: {initial_hash}")
+         except Exception as e:
+             HISTORY.append(f"Error accessing {url}: {e}")
+
+     # Start a separate daemon thread for monitoring so it cannot block interpreter exit
+     thread = threading.Thread(target=monitor_urls, args=(storage_location, urls, scrape_interval, content_type), daemon=True)
+     thread.start()

+     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
+
+ # Function to display CSV content
+ def display_csv(url):
+     hostname = urlparse(url).hostname
+     folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
+     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
+     if os.path.exists(csv_path):
+         with open(csv_path, "r") as file:
+             return file.read()
+     else:
+         return "No data available."
+
+ # Define the chat response function using the Mixtral model
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
+     # Note: loading the model on every call is very slow; a cached variant is sketched below
+     model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
+     tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
+     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+     response = pipe(f"User: {message}\nHistory: {history}\nSystem: {system_message}", max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_p=top_p)
+     return response[0]["generated_text"]
+
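+ # Illustrative variant (assumption, not wired in): build the pipeline once and
+ # reuse it, so each chat message does not re-initialize the Mixtral weights.
+ _CHAT_PIPE = None
+
+ def get_chat_pipe():
+     global _CHAT_PIPE
+     if _CHAT_PIPE is None:
+         _CHAT_PIPE = pipeline("text-generation", model="mistralai/Mixtral-8x7B-Instruct-v0.1")
+     return _CHAT_PIPE
+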
+ # Function to generate RSS feed entries for a given URL
+ def generate_rss_feed(url):
+     hostname = urlparse(url).hostname
+     folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
+     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
+     if os.path.exists(csv_path):
+         with open(csv_path, "r") as file:
+             reader = csv.DictReader(file)
+             # feedparser only parses existing feeds, so build the feed as a plain dict
+             feed = {
+                 "title": f"Changes for {hostname}",
+                 "link": url,
+                 "description": "Recent changes detected on the website.",
+                 "entries": [],
+             }
+             for row in reader:
+                 feed["entries"].append({
+                     "title": f"Change detected at {row['url']}",
+                     "link": row['url'],
+                     "description": f"Content changed on {row['date']} at {row['time']}",
+                     "published": datetime.datetime.strptime(f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S").isoformat(),
+                 })
+             return feed["entries"]
+     else:
+         return "No data available."
+
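+ # Illustrative sketch (assumption, not part of the app's wiring): render the
+ # entries as minimal RSS 2.0 XML with plain string formatting, since the Gradio
+ # textbox otherwise displays a raw Python list. XML escaping omitted for brevity.
+ def entries_to_rss_xml(title, link, entries):
+     items = "".join(
+         f"<item><title>{e['title']}</title><link>{e['link']}</link>"
+         f"<description>{e['description']}</description><pubDate>{e['published']}</pubDate></item>"
+         for e in entries
+     )
+     return ('<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"><channel>'
+             f'<title>{title}</title><link>{link}</link>{items}</channel></rss>')
+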
+ # Function to handle user input and generate response
+ def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type):
+     response = respond(message, history, system_message, max_tokens, temperature, top_p)
+     history.append((message, response))
     return history, response

+ # Gradio Interface
+ import gradio as gr
+
+ def create_interface():
+     with gr.Blocks() as demo:
+         with gr.Row():
+             with gr.Column():
+                 message = gr.Textbox(label="Message")
+                 system_message = gr.Textbox(value="You are a helpful assistant.", label="System message")
+                 max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
+                 temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+                 top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
+                 storage_location = gr.Textbox(value="scraped_data", label="Storage Location")
+                 # Pre-fill the textbox with the monitored URL list defined above
+                 urls = gr.Textbox(value=", ".join(URLS_TO_MONITOR), label="URLs (comma separated)")
+                 scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
+                 content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
+                 start_button = gr.Button("Start Scraping")
+                 stop_button = gr.Button("Stop Scraping")
+                 scrape_status = gr.Textbox(label="Scrape Status", interactive=False)
+
+             with gr.Column():
+                 chat_history = gr.Chatbot(label="Chat History")
+                 response_box = gr.Textbox(label="Response")
+
+         def stop_scraping():
+             # Signal the monitoring thread to exit on its next loop check
+             global STOP_THREADS
+             STOP_THREADS = True
+             return "Scraping stopped."
+
+         start_button.click(start_scraping, inputs=[storage_location, urls, scrape_interval, content_type], outputs=scrape_status)
+         stop_button.click(stop_scraping, outputs=scrape_status)
+         message.submit(chat_interface, inputs=[message, chat_history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type], outputs=[chat_history, response_box])
+
+         # Add a button to display the CSV content for a selected URL
+         with gr.Row():
+             csv_url = gr.Textbox(label="Select URL for CSV Content")
+             csv_button = gr.Button("Display CSV Content")
+             csv_output = gr.Textbox(label="CSV Content Output", interactive=False)
+
+         csv_button.click(display_csv, inputs=[csv_url], outputs=csv_output)
+
+         # Add a button to display the RSS feed for a selected URL
+         with gr.Row():
+             rss_url = gr.Textbox(label="Select URL for RSS Feed")
+             rss_button = gr.Button("Generate RSS Feed")
+             rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
+
+         rss_button.click(generate_rss_feed, inputs=[rss_url], outputs=rss_output)
+
+     return demo

 if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch()
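+
+ # Assumed runtime dependencies (not declared in this file):
+ #   pip install gradio selenium webdriver-manager transformers feedparser beautifulsoup4
+ #   (the transformers pipeline additionally needs a torch backend)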