acecalisto3 committed on
Commit
465494c
1 Parent(s): d8076e1

Create agent.py

Files changed (1)
  1. agent.py +153 -0
agent.py ADDED
@@ -0,0 +1,153 @@
+ import os
+ import time
+ import hashlib
+ import logging
+ import datetime
+ import csv
+ from urllib.parse import urlparse
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.common.keys import Keys
+ from selenium.common.exceptions import NoSuchElementException
+ from webdriver_manager.chrome import ChromeDriverManager  # supplies a matching ChromeDriver binary
+ from transformers import AutoTokenizer, AutoModelForCausalLM  # Mixtral is a decoder-only (causal) model
+ from transformers import pipeline
+ import feedparser
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ # Define constants
+ DEFAULT_FILE_PATH = "scraped_data"
+ PURPOSE = "You go to Culver's sites and continuously check them for changes since your last observation. Anything new gets logged and dumped into a CSV stored in your log folder at user/app/scraped_data."
+ HISTORY = []
+ CURRENT_TASK = None
+
+ # Define the list of URLs to monitor (you can add more URLs here)
+ URLS_TO_MONITOR = [
+     "https://www.example1.com/",
+     "https://www.example2.com/",
+     "https://www.example3.com/",
+     # Add as many URLs as needed
+ ]
+
+ # Function to monitor URLs for changes
+ def monitor_urls(storage_location, urls, scrape_interval, content_type):
+     global HISTORY
+     previous_hashes = {url: "" for url in urls}  # Use a dictionary for better organization
+
+     try:
+         with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
+             while True:
+                 for url in urls:
+                     try:
+                         driver.get(url)
+                         time.sleep(2)  # Wait for the page to load
+                         if content_type == "text":
+                             current_content = driver.page_source
+                         elif content_type == "media":
+                             # Hash the image sources rather than WebElement objects so the hash is stable across page loads
+                             current_content = [img.get_attribute("src") for img in driver.find_elements(By.TAG_NAME, "img")]
+                         else:
+                             current_content = driver.page_source
+                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
+                         if current_hash != previous_hashes[url]:
+                             previous_hashes[url] = current_hash
+                             date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                             HISTORY.append(f"Change detected at {url} on {date_time_str}")
+                             # Write into the per-host folder so display_csv() and generate_rss_feed() can find the file
+                             hostname = urlparse(url).hostname
+                             csv_path = os.path.join(storage_location, hostname, f"{hostname}_changes.csv")
+                             os.makedirs(os.path.dirname(csv_path), exist_ok=True)
+                             write_header = not os.path.exists(csv_path)
+                             with open(csv_path, "a", newline="") as csvfile:
+                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
+                                 if write_header:
+                                     csv_writer.writeheader()  # DictReader in generate_rss_feed() relies on this header row
+                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
+                             logging.info(f"Change detected at {url} on {date_time_str}")
+                     except (NoSuchElementException, Exception) as e:
+                         logging.error(f"Error accessing {url}: {e}")
+                 time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+     except Exception as e:
+         logging.error(f"Error starting ChromeDriver: {e}")
+
+ # Function to start scraping
+ def start_scraping(storage_location, urls, scrape_interval, content_type):
+     global CURRENT_TASK, HISTORY
+
+     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
+     HISTORY.append(f"Task started: {CURRENT_TASK}")
+
+     for url in urls:
+         # Create a folder for the URL
+         hostname = urlparse(url).hostname
+         folder_path = os.path.join(storage_location, hostname)
+         os.makedirs(folder_path, exist_ok=True)
+
+         # Log the initial observation
+         try:
+             with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
+                 driver.get(url)
+                 time.sleep(2)  # Wait for the page to load
+                 if content_type == "text":
+                     initial_content = driver.page_source
+                 elif content_type == "media":
+                     initial_content = [img.get_attribute("src") for img in driver.find_elements(By.TAG_NAME, "img")]
+                 else:
+                     initial_content = driver.page_source
+                 initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
+                 HISTORY.append(f"Initial observation at {url}: {initial_hash}")
+                 with open(os.path.join(folder_path, f"{hostname}_initial_observation.txt"), "w") as file:
+                     file.write(f"Initial observation at {url}: {initial_hash}")
+         except (NoSuchElementException, Exception) as e:
+             HISTORY.append(f"Error accessing {url}: {e}")
+
+     # Monitor the URLs (this call blocks, since monitor_urls runs an infinite loop)
+     monitor_urls(storage_location, urls, scrape_interval, content_type)
+
+     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
+
+ # Function to display CSV content
+ def display_csv(url):
+     hostname = urlparse(url).hostname
+     folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
+     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
+     if os.path.exists(csv_path):
+         with open(csv_path, "r") as file:
+             return file.read()
+     else:
+         return "No data available."
+
+ # Define the chat response function using the Mistral model
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
+     # Note: loading the model on every call is expensive; in practice it should be loaded once and reused
+     model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
+     tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
+     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+     response = pipe(f"User: {message}\nHistory: {history}\nSystem: {system_message}", max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)[0]["generated_text"]
+     return response
+
+ # Function to generate RSS-style feed entries for a given URL
+ def generate_rss_feed(url):
+     hostname = urlparse(url).hostname
+     folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
+     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
+     if os.path.exists(csv_path):
+         with open(csv_path, "r") as file:
+             reader = csv.DictReader(file)
+             # Build a simple feed structure in memory (feedparser only parses existing feeds; it cannot create them)
+             feed = {
+                 "title": f"Changes for {hostname}",
+                 "link": url,
+                 "description": "Recent changes detected on the website.",
+                 "entries": [],
+             }
+             for row in reader:
+                 feed["entries"].append({
+                     "title": f"Change detected at {row['url']}",
+                     "link": row['url'],
+                     "description": f"Content changed on {row['date']} at {row['time']}",
+                     "published": datetime.datetime.strptime(f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S").isoformat(),
+                 })
+             return feed["entries"]
+     else:
+         return "No data available."
+
+ # Function to handle user input and generate response
+ def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type):
+     response = respond(message, history, system_message, max_tokens, temperature, top_p)
+     history.append((message, response))
+     return history, response
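
For context, a minimal sketch of how these functions might be driven follows. The interval and content type are placeholder assumptions, not part of the commit, and start_scraping() blocks because monitor_urls() loops indefinitely.

if __name__ == "__main__":
    # Placeholder configuration (assumed values, not part of the original file)
    os.makedirs(DEFAULT_FILE_PATH, exist_ok=True)

    # Blocks here: monitor_urls() inside start_scraping() runs an infinite loop
    start_scraping(DEFAULT_FILE_PATH, URLS_TO_MONITOR, scrape_interval=5, content_type="text")

    # On a later run (or from another process), logged changes can be inspected:
    # print(display_csv(URLS_TO_MONITOR[0]))
    # print(generate_rss_feed(URLS_TO_MONITOR[0]))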