acecalisto3 committed
Commit da20049
1 Parent(s): 4e5b9ff

Update app.py

Files changed (1)
  1. app.py +51 -147
app.py CHANGED
@@ -17,169 +17,73 @@ import yaml
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 # Define constants
-PREFIX = "Task started at {date_time_str}. Purpose: {purpose}"
-TASK_PROMPT = "Current task: {task}. History:\n{history}"
-
-# Define current date/time
-date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-# Define purpose
-purpose = """
-You go to Culvers sites, you continuously seek changes on them since your last observation.
-Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data.
-"""
-
-# Define history
-history = []
-
-# Define current task
-current_task = None
-
-# Default file path
-default_file_path = "user/app/scraped_data/culver/culvers_changes.csv"
+DATE_TIME_STR = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since your last observation. Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
+HISTORY = []
+CURRENT_TASK = None
+DEFAULT_FILE_PATH = "user/app/scraped_data/culver/culvers_changes.csv"
 
 # Ensure the directory exists
-os.makedirs(os.path.dirname(default_file_path), exist_ok=True)
+os.makedirs(os.path.dirname(DEFAULT_FILE_PATH), exist_ok=True)
 
 # Function to monitor URLs for changes
 def monitor_urls(storage_location, urls, scrape_interval, content_type):
-    global history
+    global HISTORY
     previous_hashes = [""] * len(urls)
-
-    # Ensure the directory exists
-    os.makedirs(os.path.dirname(storage_location), exist_ok=True)
-
-    with open(storage_location, "w", newline='') as csvfile:
-        csv_toolkit = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
-        csv_toolkit.writeheader()
-
-        options = Options()
-        options.headless = True
-        options.add_argument("--disable-gpu")
-        options.add_argument("--no-sandbox")
-        options.add_argument("--disable-dev-shm-usage")
-
-        try:
-            with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
-                while True:
-                    for i, url in enumerate(urls):
-                        try:
-                            driver.get(url)
-                            time.sleep(2)  # Wait for the page to load
-                            if content_type == "text":
-                                current_content = driver.page_source
-                            elif content_type == "media":
-                                current_content = driver.find_elements_by_tag_name("img")
-                            else:
-                                current_content = driver.page_source
-
-                            current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-
-                            if current_hash != previous_hashes[i]:
-                                previous_hashes[i] = current_hash
-                                date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                                history.append(f"Change detected at {url} on {date_time_str}")
-                                csv_toolkit.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
+
+    try:
+        with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
+            while True:
+                for i, url in enumerate(urls):
+                    try:
+                        driver.get(url)
+                        time.sleep(2)  # Wait for the page to load
+                        if content_type == "text":
+                            current_content = driver.page_source
+                        elif content_type == "media":
+                            current_content = driver.find_elements_by_tag_name("img")
+                        else:
+                            current_content = driver.page_source
+                        current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
+                        if current_hash != previous_hashes[i]:
+                            previous_hashes[i] = current_hash
+                            date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                            HISTORY.append(f"Change detected at {url} on {date_time_str}")
+                            with open(storage_location, "a", newline="") as csvfile:
+                                csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
+                                csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
                             logging.info(f"Change detected at {url} on {date_time_str}")
-                        except Exception as e:
-                            logging.error(f"Error accessing {url}: {e}")
-
-                    time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
-        except Exception as e:
-            logging.error(f"Error starting ChromeDriver: {e}")
+                    except Exception as e:
+                        logging.error(f"Error accessing {url}: {e}")
+                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+    except Exception as e:
+        logging.error(f"Error starting ChromeDriver: {e}")
 
 # Define main function to handle user input
 def handle_input(storage_location, urls, scrape_interval, content_type):
-    global current_task, history
+    global CURRENT_TASK, HISTORY
 
-    current_task = f"Monitoring URLs: {', '.join(urls)}"
-    history.append(f"Task started: {current_task}")
+    CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
+    HISTORY.append(f"Task started: {CURRENT_TASK}")
     monitor_urls(storage_location, urls, scrape_interval, content_type)
-    return TASK_PROMPT.format(task=current_task, history="\n".join(map(str, history)))
+    return TASK_PROMPT.format(task=CURRENT_TASK, history="\n".join(map(str, HISTORY)))
 
 # Load custom prompts
 try:
-    with open('custom_prompts.yaml', 'r') as fp:
+    with open("custom_prompts.yaml", "r") as fp:
         custom_prompts = yaml.safe_load(fp)
 except FileNotFoundError:
-    custom_prompts = {
-        "WEB_DEV": "",
-        "AI_SYSTEM_PROMPT": "",
-        "PYTHON_CODE_DEV": "",
-        "CODE_GENERATION": "",
-        "CODE_INTERPRETATION": "",
-        "CODE_TRANSLATION": "",
-        "CODE_IMPLEMENTATION": ""
-    }
+    custom_prompts = {"WEB_DEV": "", "AI_SYSTEM_PROMPT": "", "PYTHON_CODE_DEV": "", "CODE_GENERATION": "", "CODE_INTERPRETATION": "", "CODE_TRANSLATION": "", "CODE_IMPLEMENTATION": ""}
+
+# Define agents
+AGENTS = ["WEB_DEV", "AI_SYSTEM_PROMPT", "PYTHON_CODE_DEV", "CODE_GENERATION", "CODE_INTERPRETATION", "CODE_TRANSLATION", "CODE_IMPLEMENTATION"]
 
 # Define the Mistral inference client
 client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 
-VERBOSE = True
-MAX_HISTORY = 125
-
-def format_prompt(message, history):
-    prompt = "<s>"
-    for entry in history:
-        if isinstance(entry, tuple) and len(entry) == 2:
-            user_prompt, bot_response = entry
-            prompt += f"[INST] {user_prompt} [/INST]"
-            prompt += f" {bot_response}</s> "
-    prompt += f"[INST] {message} [/INST]"
-    return prompt
-
-agents = [
-    "WEB_DEV",
-    "AI_SYSTEM_PROMPT",
-    "PYTHON_CODE_DEV",
-    "CODE_GENERATION",
-    "CODE_INTERPRETATION",
-    "CODE_TRANSLATION",
-    "CODE_IMPLEMENTATION"
-]
-
-def generate(
-    prompt, history, agent_name=agents[0], sys_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.7,
-):
-    seed = random.randint(1, 1111111111111111)
-    agent = custom_prompts[agent_name]
-
-    system_prompt = agent if sys_prompt == "" else sys_prompt
-    temperature = max(float(temperature), 1e-2)
-    top_p = float(top_p)
-
-    generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=max_new_tokens,
-        top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        do_sample=True,
-        seed=seed,
-    )
-
-    formatted_prompt = format_prompt(f"{system_prompt}\n\n{prompt}", history)
-    output = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, return_full_text=False)
-
-    return output
-
 # Define the chat response function
-def respond(
-    message,
-    history,
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    response = generate(
-        prompt=message,
-        history=history,
-        sys_prompt=system_message,
-        temperature=temperature,
-        max_new_tokens=max_tokens,
-        top_p=top_p
-    )
-    return response
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    return generate(message, history, system_message, max_tokens, temperature, top_p)
 
 # Function to start scraping
 def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
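For reference, here is the [INST] prompt convention that the deleted format_prompt helper implemented, as a minimal self-contained sketch. It assumes huggingface_hub is installed and that history entries are (user_prompt, bot_response) tuples, as in the old code; the sampling values mirror the old generate() defaults, and the example message is illustrative.

import random
from huggingface_hub import InferenceClient

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

def format_prompt(message, history):
    # Mixtral-instruct convention: each past user turn wrapped in [INST] ... [/INST],
    # each bot reply closed with </s>, then the new user message appended.
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST] {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt

output = client.text_generation(
    format_prompt("Summarize today's site changes.", []),
    temperature=0.9,
    max_new_tokens=256,
    top_p=0.95,
    repetition_penalty=1.7,
    do_sample=True,
    seed=random.randint(1, 1111111111111111),  # the old generate() reseeded each call this way
)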
@@ -197,10 +101,10 @@ def display_csv(storage_location):
 
 # Create Gradio interface
 def chat_interface(message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
-    global history
-    response = respond(message, history, system_message, max_tokens, temperature, top_p)
-    history.append((message, response))
-    return history, ""
+    global HISTORY
+    response = respond(message, HISTORY, system_message, max_tokens, temperature, top_p)
+    HISTORY.append((message, response))
+    return HISTORY, ""
 
 demo = gr.Blocks()
 
@@ -212,8 +116,8 @@ with demo:
     max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
     temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
     top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-    storage_location = gr.Textbox(value=default_file_path, label="Storage Location")
-    url1 = gr.Textbox(value="https://www.culver.k12.in.us/", label="URL 1")
+    storage_location = gr.Textbox(value=DEFAULT_FILE_PATH, label="Storage Location")
+    url1 = gr.Textbox(value="https://www.culver.k12.in/", label="URL 1")
     url2 = gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2")
     url3 = gr.Textbox(label="URL 3")
     url4 = gr.Textbox(label="URL 4")
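The last two hunks configure the Gradio components and the chat_interface handler, but the wiring between them falls outside the diff. Below is a generic gr.Blocks wiring sketch of that pattern, with an illustrative echo handler and a reduced component set standing in for the app's own.

import gradio as gr

def chat_interface(message, max_tokens, temperature, top_p):
    # Stand-in for app.py's handler, which routes these values into respond().
    return f"echo: {message} (max_tokens={max_tokens}, temperature={temperature}, top_p={top_p})"

with gr.Blocks() as demo:
    message = gr.Textbox(label="Message")
    max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
    temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
    response = gr.Textbox(label="Response")
    send = gr.Button("Send")
    send.click(chat_interface, inputs=[message, max_tokens, temperature, top_p], outputs=response)

demo.launch()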
 
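Finally, a self-contained sketch of one pass of the hash-based change detection that monitor_urls loops over, using current Selenium 4 APIs: find_elements_by_tag_name, as called in app.py, was removed in Selenium 4 in favor of find_elements(By.TAG_NAME, ...). The check_once name is hypothetical; hashing the img src attributes (rather than the WebElement list, whose repr differs on every load) and writing a CSV header into an empty file are assumptions of this sketch, not app.py behavior.

import csv
import datetime
import hashlib

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

def check_once(driver, urls, previous_hashes, csv_path, content_type="text"):
    # One pass: fetch each URL, hash its content, and append a CSV row when the hash moves.
    for i, url in enumerate(urls):
        driver.get(url)
        if content_type == "media":
            content = [img.get_attribute("src") for img in driver.find_elements(By.TAG_NAME, "img")]
        else:
            content = driver.page_source
        current_hash = hashlib.md5(str(content).encode("utf-8")).hexdigest()
        if current_hash != previous_hashes[i]:
            previous_hashes[i] = current_hash
            date_str, time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S").split()
            with open(csv_path, "a", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=["date", "time", "url", "change"])
                if f.tell() == 0:  # append mode never writes a header, so add one to a fresh file
                    writer.writeheader()
                writer.writerow({"date": date_str, "time": time_str, "url": url, "change": "Content changed"})

options = Options()
options.add_argument("--headless=new")  # a bare Options(), as in the new code, runs a visible browser
with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
    hashes = [""]
    check_once(driver, ["https://www.culver.k12.in.us/"], hashes, "culvers_changes.csv")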