acecalisto3 committed
Commit a1d1b22 · verified · 1 Parent(s): 8bb876e

Delete agent.py

Files changed (1)
  1. agent.py +0 -466
agent.py DELETED
@@ -1,466 +0,0 @@
import os
import time
import hashlib
import logging
import datetime
import csv
import threading
from urllib.parse import urlparse

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    StaleElementReferenceException,
)
from webdriver_manager.chrome import ChromeDriverManager

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
import xml.etree.ElementTree as ET

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Define constants
DEFAULT_FILE_PATH = "scraped_data"
PURPOSE = (
    "You visit Culver's sites and continuously check them for changes since your last observation. "
    "Anything new is logged and dumped into CSV files stored in your log folder at user/app/scraped_data."
)
HISTORY = []
CURRENT_TASK = None
STOP_THREADS = False  # Flag to signal scraping threads to stop

# Function to monitor URLs for changes
def monitor_urls(storage_location, urls, scrape_interval, content_type, selector=None):
    global HISTORY, STOP_THREADS
    previous_hashes = {url: "" for url in urls}

    options = Options()
    options.add_argument("--headless")  # Run Chrome in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )
    except Exception as e:
        logging.error(f"Error initializing ChromeDriver: {e}")
        return

    try:
        while not STOP_THREADS:
            for url in urls:
                try:
                    driver.get(url)
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "body"))
                    )  # Wait for basic page load
                    time.sleep(2)  # Additional wait for dynamic content

                    if content_type == "text":
                        current_content = driver.page_source
                    elif content_type == "media":
                        if selector:
                            try:
                                elements = WebDriverWait(driver, 5).until(
                                    EC.presence_of_all_elements_located(
                                        (By.CSS_SELECTOR, selector)
                                    )
                                )
                                current_content = [
                                    element.get_attribute("src") for element in elements
                                ]
                            except TimeoutException:
                                logging.warning(
                                    f"Timeout waiting for media elements with selector '{selector}' on {url}"
                                )
                                current_content = []
                        else:
                            elements = driver.find_elements(By.TAG_NAME, "img")
                            current_content = [
                                element.get_attribute("src") for element in elements
                            ]
                    else:
                        current_content = driver.page_source

                    current_hash = hashlib.md5(
                        str(current_content).encode("utf-8")
                    ).hexdigest()
                    if current_hash != previous_hashes[url]:
                        previous_hashes[url] = current_hash
                        date_time_str = datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S"
                        )
                        HISTORY.append(f"Change detected at {url} on {date_time_str}")
                        csv_file_path = os.path.join(
                            storage_location, f"{urlparse(url).hostname}_changes.csv"
                        )
                        os.makedirs(storage_location, exist_ok=True)
                        file_exists = os.path.isfile(csv_file_path)
                        with open(
                            csv_file_path, "a", newline="", encoding="utf-8"
                        ) as csvfile:
                            fieldnames = ["date", "time", "url", "change"]
                            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                            if not file_exists:
                                writer.writeheader()
                            writer.writerow(
                                {
                                    "date": date_time_str.split()[0],
                                    "time": date_time_str.split()[1],
                                    "url": url,
                                    "change": "Content changed",
                                }
                            )
                        logging.info(f"Change detected at {url} on {date_time_str}")
                except (
                    NoSuchElementException,
                    StaleElementReferenceException,
                    TimeoutException,
                    Exception,
                ) as e:
                    logging.error(f"Error accessing {url}: {e}")
            time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
    finally:
        driver.quit()
        logging.info("ChromeDriver session ended.")

# Function to start scraping
def start_scraping(storage_location, urls, scrape_interval, content_type, selector=None):
    global CURRENT_TASK, HISTORY, STOP_THREADS

    if STOP_THREADS:
        STOP_THREADS = False  # Reset the flag if previously stopped

    # The Gradio textbox delivers a single comma-separated string; normalize it to a list
    if isinstance(urls, str):
        urls = [u.strip() for u in urls.split(",") if u.strip()]

    CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
    HISTORY.append(f"Task started: {CURRENT_TASK}")

    for url in urls:
        # Create a folder for the URL
        hostname = urlparse(url).hostname
        folder_path = os.path.join(storage_location, hostname)
        os.makedirs(folder_path, exist_ok=True)

        # Log the initial observation
        driver = None
        try:
            options = Options()
            options.add_argument("--headless")  # Run Chrome in headless mode
            options.add_argument("--no-sandbox")
            options.add_argument("--disable-dev-shm-usage")

            driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()), options=options
            )
            driver.get(url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )  # Wait for basic page load
            time.sleep(2)  # Additional wait for dynamic content

            if content_type == "text":
                initial_content = driver.page_source
            elif content_type == "media":
                if selector:
                    try:
                        elements = WebDriverWait(driver, 5).until(
                            EC.presence_of_all_elements_located(
                                (By.CSS_SELECTOR, selector)
                            )
                        )
                        initial_content = [
                            element.get_attribute("src") for element in elements
                        ]
                    except TimeoutException:
                        logging.warning(
                            f"Timeout waiting for media elements with selector '{selector}' on {url}"
                        )
                        initial_content = []
                else:
                    elements = driver.find_elements(By.TAG_NAME, "img")
                    initial_content = [
                        element.get_attribute("src") for element in elements
                    ]
            else:
                initial_content = driver.page_source

            initial_hash = hashlib.md5(
                str(initial_content).encode("utf-8")
            ).hexdigest()
            HISTORY.append(f"Initial observation at {url}: {initial_hash}")
            initial_observation_path = os.path.join(
                folder_path, f"{hostname}_initial_observation.txt"
            )
            with open(initial_observation_path, "w", encoding="utf-8") as file:
                file.write(f"Initial observation at {url}: {initial_hash}")
            logging.info(f"Initial observation logged for {url}")
        except (
            NoSuchElementException,
            StaleElementReferenceException,
            TimeoutException,
            Exception,
        ) as e:
            HISTORY.append(f"Error accessing {url}: {e}")
            logging.error(f"Error accessing {url}: {e}")
        finally:
            if driver is not None:
                driver.quit()

    # Start a new thread for monitoring URLs
    monitor_thread = threading.Thread(
        target=monitor_urls,
        args=(storage_location, urls, scrape_interval, content_type, selector),
        daemon=True,
    )
    monitor_thread.start()
    logging.info("Started scraping thread.")

    return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."

# Function to stop scraping
def stop_scraping():
    global STOP_THREADS
    STOP_THREADS = True
    HISTORY.append("Scraping stopped by user.")
    logging.info("Scraping stop signal sent.")
    return "Scraping has been stopped."

# Function to display CSV content
def display_csv(storage_location, url):
    hostname = urlparse(url).hostname
    csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
    if os.path.exists(csv_path):
        try:
            with open(csv_path, "r", encoding="utf-8") as file:
                content = file.read()
            return content
        except Exception as e:
            logging.error(f"Error reading CSV file for {url}: {e}")
            return f"Error reading CSV file for {url}: {e}"
    else:
        return "No data available."

# Function to generate RSS feed for a given URL
def generate_rss_feed(storage_location, url):
    hostname = urlparse(url).hostname
    csv_path = os.path.join(storage_location, f"{hostname}_changes.csv")
    if os.path.exists(csv_path):
        try:
            # Parse the CSV file
            with open(csv_path, "r", encoding="utf-8") as file:
                reader = csv.DictReader(file)
                changes = list(reader)

            # Create the root RSS element
            rss = ET.Element("rss", version="2.0")
            channel = ET.SubElement(rss, "channel")

            # Add channel elements
            title = ET.SubElement(channel, "title")
            title.text = f"RSS Feed for {hostname}"

            link = ET.SubElement(channel, "link")
            link.text = url

            description = ET.SubElement(channel, "description")
            description.text = "Recent changes detected on the website."

            # Add items to the feed
            for change in changes[-10:]:  # Last 10 changes
                item = ET.SubElement(channel, "item")

                item_title = ET.SubElement(item, "title")
                item_title.text = f"Change detected at {change['url']}"

                item_link = ET.SubElement(item, "link")
                item_link.text = change["url"]

                item_description = ET.SubElement(item, "description")
                item_description.text = (
                    f"Content changed on {change['date']} at {change['time']}"
                )

                pub_date = ET.SubElement(item, "pubDate")
                pub_date.text = datetime.datetime.strptime(
                    f"{change['date']} {change['time']}", "%Y-%m-%d %H:%M:%S"
                ).strftime("%a, %d %b %Y %H:%M:%S +0000")

            # Generate the XML string
            rss_feed = ET.tostring(rss, encoding="utf-8")
            return rss_feed.decode("utf-8")
        except Exception as e:
            logging.error(f"Error generating RSS feed for {url}: {e}")
            return f"Error generating RSS feed for {url}: {e}"
    else:
        return "No data available."

# Chat response function backed by the Mixtral instruct model
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Load the model and tokenizer once, then cache them on the function object
    if not hasattr(respond, "pipe"):
        try:
            model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
            respond.tokenizer = AutoTokenizer.from_pretrained(model_name)
            respond.model = AutoModelForCausalLM.from_pretrained(model_name)
            respond.pipe = pipeline(
                "text-generation",
                model=respond.model,
                tokenizer=respond.tokenizer,
                device=0 if torch.cuda.is_available() else -1,
            )
            logging.info("Model loaded successfully.")
        except Exception as e:
            logging.error(f"Error loading model: {e}")
            return "Error loading the response model."

    try:
        prompt = (
            f"System: {system_message}\n"
            f"History: {history}\n"
            f"User: {message}\n"
            f"Assistant:"
        )
        response = respond.pipe(
            prompt, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p
        )[0]["generated_text"]
        return response
    except Exception as e:
        logging.error(f"Error generating response: {e}")
        return "Error generating response."

# Define the Gradio interface
def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# All-in-One Scraper, Database, and RSS Feeder")

        with gr.Row():
            with gr.Column():
                storage_location = gr.Textbox(
                    value=DEFAULT_FILE_PATH, label="Storage Location"
                )
                urls = gr.Textbox(
                    label="URLs (comma separated)",
                    placeholder="https://example.com, https://anotherexample.com",
                )
                scrape_interval = gr.Slider(
                    minimum=1,
                    maximum=60,
                    value=5,
                    step=1,
                    label="Scrape Interval (minutes)",
                )
                content_type = gr.Radio(
                    choices=["text", "media", "both"],
                    value="text",
                    label="Content Type",
                )
                selector = gr.Textbox(
                    label="CSS Selector for Media (Optional)",
                    placeholder="e.g., img.main-image",
                )
                start_button = gr.Button("Start Scraping")
                stop_button = gr.Button("Stop Scraping")
                csv_output = gr.Textbox(label="CSV Output", interactive=False, lines=2)

            with gr.Column():
                chat_history = gr.Chatbot(label="Chat History")
                with gr.Row():
                    message = gr.Textbox(
                        label="Message", placeholder="Type your message here..."
                    )
                    system_message = gr.Textbox(
                        value="You are a helpful assistant.", label="System message"
                    )
                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=2048,
                        value=512,
                        step=1,
                        label="Max new tokens",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="Top-p (nucleus sampling)",
                    )
                response_box = gr.Textbox(label="Response", interactive=False, lines=2)

        with gr.Row():
            selected_url_csv = gr.Textbox(
                label="Select URL for CSV Content",
                placeholder="https://example.com",
            )
            csv_button = gr.Button("Display CSV Content")
            csv_content_output = gr.Textbox(
                label="CSV Content Output", interactive=False, lines=10
            )

        with gr.Row():
            selected_url_rss = gr.Textbox(
                label="Select URL for RSS Feed",
                placeholder="https://example.com",
            )
            rss_button = gr.Button("Generate RSS Feed")
            rss_output = gr.Textbox(
                label="RSS Feed Output", interactive=False, lines=20
            )

        # Connect buttons to their respective functions
        start_button.click(
            fn=start_scraping,
            inputs=[
                storage_location,
                urls,  # pass the URLs textbox component, not gr.Textbox.value
                scrape_interval,
                content_type,
                selector,
            ],
            outputs=csv_output,
        )

        stop_button.click(fn=stop_scraping, outputs=csv_output)

        csv_button.click(
            fn=display_csv,
            inputs=[storage_location, selected_url_csv],
            outputs=csv_content_output,
        )

        rss_button.click(
            fn=generate_rss_feed,
            inputs=[storage_location, selected_url_rss],
            outputs=rss_output,
        )

        # Connect message submission to the chat interface
        def update_chat(message, history, system_message, max_tokens, temperature, top_p):
            response = respond(message, history, system_message, max_tokens, temperature, top_p)
            history.append((message, response))
            return history, response

        message.submit(
            update_chat,
            inputs=[
                message,
                chat_history,
                system_message,
                max_tokens,
                temperature,
                top_p,
            ],
            outputs=[chat_history, response_box],
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
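
The core of the deleted script is hash-based change detection: fetch a page, hash its content, and compare the hash against the previous observation. A minimal standalone sketch of that idea, assuming the `requests` package instead of the Selenium/ChromeDriver stack used above and a hypothetical example URL:

import hashlib
import requests  # assumption: requests is installed; the deleted agent used Selenium instead

previous_hashes = {}

def has_changed(url):
    """Return True when the page content hash differs from the last observation."""
    html = requests.get(url, timeout=10).text
    current_hash = hashlib.md5(html.encode("utf-8")).hexdigest()
    changed = previous_hashes.get(url, "") != current_hash
    previous_hashes[url] = current_hash
    return changed

if __name__ == "__main__":
    # Hypothetical URL; the first call always reports a change because no prior hash exists
    print(has_changed("https://example.com"))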