acecalisto3 committed on
Commit d1685e2
1 Parent(s): b320b58

Update agent.py

Files changed (1)
  1. agent.py +40 -67
agent.py CHANGED
@@ -11,8 +11,7 @@ from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.keys import Keys
-from selenium.webdriver.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from transformers import pipeline
 import feedparser
@@ -26,18 +25,10 @@ PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since
 HISTORY = []
 CURRENT_TASK = None

-# Define the list of URLs to monitor (you can add more URLs here)
-URLS_TO_MONITOR = [
-    "https://www.example1.com/",
-    "https://www.example2.com/",
-    "https://www.example3.com/",
-    # Add as many URLs as needed
-]
-
 # Function to monitor URLs for changes
-def monitor_urls(storage_location, urls, scrape_interval, content_type):
+def monitor_urls(storage_location, urls, scrape_interval, content_type, selector=None):
     global HISTORY
-    previous_hashes = {url: "" for url in urls} # Use a dictionary for better organization
+    previous_hashes = {url: "" for url in urls}

     try:
         with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
@@ -45,15 +36,26 @@ def monitor_urls(storage_location, urls, scrape_interval, content_type):
             for url in urls:
                 try:
                     driver.get(url)
-                    time.sleep(2) # Wait for the page to load
+                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body'))) # Wait for basic page load
+                    time.sleep(2) # Additional wait for dynamic content
+
                     if content_type == "text":
                         current_content = driver.page_source
                     elif content_type == "media":
-                        current_content = driver.find_elements(By.TAG_NAME, "img")
+                        if selector:
+                            try:
+                                elements = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
+                                current_content = [element.get_attribute('src') for element in elements]
+                            except TimeoutException:
+                                logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
+                                current_content = []
+                        else:
+                            current_content = driver.find_elements(By.TAG_NAME, "img")
                     else:
                         current_content = driver.page_source
+
                     current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-                    if current_hash!= previous_hashes[url]:
+                    if current_hash != previous_hashes[url]:
                         previous_hashes[url] = current_hash
                         date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                         HISTORY.append(f"Change detected at {url} on {date_time_str}")
@@ -61,14 +63,14 @@ def monitor_urls(storage_location, urls, scrape_interval, content_type):
                             csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
                             csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
                         logging.info(f"Change detected at {url} on {date_time_str}")
-                except (NoSuchElementException, Exception) as e:
+                except (NoSuchElementException, StaleElementReferenceException, Exception) as e:
                     logging.error(f"Error accessing {url}: {e}")
             time.sleep(scrape_interval * 60) # Check every scrape_interval minutes
     except Exception as e:
         logging.error(f"Error starting ChromeDriver: {e}")

 # Function to start scraping
-def start_scraping(storage_location, urls, scrape_interval, content_type):
+def start_scraping(storage_location, urls, scrape_interval, content_type, selector=None):
     global CURRENT_TASK, HISTORY

     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
@@ -84,29 +86,40 @@ def start_scraping(storage_location, urls, scrape_interval, content_type):
         try:
             with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
                 driver.get(url)
-                time.sleep(2) # Wait for the page to load
+                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body'))) # Wait for basic page load
+                time.sleep(2) # Additional wait for dynamic content
+
                 if content_type == "text":
                     initial_content = driver.page_source
                 elif content_type == "media":
-                    initial_content = driver.find_elements(By.TAG_NAME, "img")
+                    if selector:
+                        try:
+                            elements = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
+                            initial_content = [element.get_attribute('src') for element in elements]
+                        except TimeoutException:
+                            logging.warning(f"Timeout waiting for media elements with selector '{selector}' on {url}")
+                            initial_content = []
+                    else:
+                        initial_content = driver.find_elements(By.TAG_NAME, "img")
                 else:
                     initial_content = driver.page_source
+
                 initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
                 HISTORY.append(f"Initial observation at {url}: {initial_hash}")
                 with open(os.path.join(folder_path, f"{hostname}_initial_observation.txt"), "w") as file:
                     file.write(f"Initial observation at {url}: {initial_hash}")
-        except (NoSuchElementException, Exception) as e:
+        except (NoSuchElementException, StaleElementReferenceException, Exception) as e:
            HISTORY.append(f"Error accessing {url}: {e}")

     # Monitor the URLs
-    monitor_urls(storage_location, urls, scrape_interval, content_type)
+    monitor_urls(storage_location, urls, scrape_interval, content_type, selector)

     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."

 # Function to display CSV content
-def display_csv(url):
+def display_csv(storage_location, url):
     hostname = urlparse(url).hostname
-    folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
+    folder_path = os.path.join(storage_location, hostname)
     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
     if os.path.exists(csv_path):
         with open(csv_path, "r") as file:
@@ -123,9 +136,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     return response

 # Function to generate RSS feed for a given URL
-def generate_rss_feed(url):
+def generate_rss_feed(storage_location, url):
     hostname = urlparse(url).hostname
-    folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
+    folder_path = os.path.join(storage_location, hostname)
     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
     if os.path.exists(csv_path):
         with open(csv_path, "r") as file:
@@ -147,47 +160,7 @@ def generate_rss_feed(url):
     return "No data available."

 # Function to handle user input and generate response
-def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type):
+def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type, selector):
     response = respond(message, history, system_message, max_tokens, temperature, top_p)
     history.append((message, response))
-    return history, response
-
-if __name__ == "__main__":
-    # Define the Gradio interface
-    import gradio as gr
-
-    def create_interface():
-        with gr.Blocks() as demo:
-            with gr.Row():
-                with gr.Column():
-                    message = gr.Textbox(label="Message")
-                    system_message = gr.Textbox(value="You are a helpful assistant.", label="System message")
-                    max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
-                    temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
-                    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-                    storage_location = gr.Textbox(value="scraped_data", label="Storage Location")
-                    urls = gr.Textbox(label="URLs (comma separated)")
-                    scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
-                    content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
-                    start_button = gr.Button("Start Scraping")
-                    csv_output = gr.Textbox(label="CSV Output", interactive=False)
-
-                with gr.Column():
-                    chat_history = gr.Chatbot(label="Chat History")
-                    response_box = gr.Textbox(label="Response")
-
-            start_button.click(start_scraping, inputs=[storage_location, urls, scrape_interval, content_type], outputs=csv_output)
-            message.submit(chat_interface, inputs=[message, chat_history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type], outputs=[chat_history, response_box])
-
-            # Add a button to display the RSS feed for a selected URL
-            with gr.Row():
-                selected_url = gr.Textbox(label="Select URL for RSS Feed")
-                rss_button = gr.Button("Generate RSS Feed")
-                rss_output = gr.Textbox(label="RSS Feed Output", interactive=False)
-
-            rss_button.click(generate_rss_feed, inputs=[selected_url], outputs=rss_output)
-
-            return demo
-
-    demo = create_interface()
-    demo.launch()
+    return history, response
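
For reference, a minimal usage sketch of the signatures introduced by this commit. It is not part of agent.py: the storage path, URL, and CSS selector below are placeholder values, and a local Chrome/chromedriver installation is assumed.

# sketch.py - hypothetical caller for the updated agent.py API (illustrative only)
from agent import start_scraping, display_csv, generate_rss_feed

storage_location = "scraped_data"        # placeholder storage directory
urls = ["https://www.culvers.com/"]      # placeholder target URL
scrape_interval = 5                      # minutes between checks
content_type = "media"
selector = "img.product-image"           # hypothetical CSS selector for the new `selector` parameter

# Records an initial observation per URL, then hands off to monitor_urls(..., selector).
# Note: monitor_urls sleeps scrape_interval * 60 seconds between checks, so this call blocks for a while.
print(start_scraping(storage_location, urls, scrape_interval, content_type, selector))

# Both helpers now take storage_location explicitly instead of the module-level DEFAULT_FILE_PATH.
print(display_csv(storage_location, urls[0]))
print(generate_rss_feed(storage_location, urls[0]))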