acecalisto3 committed on
Commit 6ec50c7 · verified · 1 Parent(s): 35d8738

Update app.py

Files changed (1)
  1. app.py +137 -133
app.py CHANGED
@@ -1,139 +1,143 @@
-import time
-import csv
 import os
 import hashlib
-import re
-import requests
-from bs4 import BeautifulSoup
 from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.keys import Keys
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
 from webdriver_manager.chrome import ChromeDriverManager
-from inspector import Configuration, Inspector
-
-config = Configuration('5713ec1deb658fd2e6c069ce313ddaa34e2feee3')
-inspector = Inspector(config)
-inspector.start_transaction('my python script')
-
-def handle_input(input):
-    global CURRENT_INPUT
-    CURRENT_INPUT = input
-
-def handle_output(output):
-    global OUTPUT_HISTORY
-    OUTPUT_HISTORY.append((CURRENT_INPUT, output))
-
-def handle_system(output):
-    global SYSTEM_OUTPUT
-    SYSTEM_OUTPUT = output
-
-def handle_error(error):
-    global ERROR_HISTORY
-    ERROR_HISTORY.append((CURRENT_INPUT, error))
-
 def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
     urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
-    handle_input(f"Start scraping {', '.join(urls)} every {scrape_interval} minutes.")
-    csv_file_path = f"{storage_location}/scraped_data.csv"
-    csv_fieldnames = ["date", "time", "url", "change"]
-
-    # Create the CSV file if it does not exist
-    if not os.path.exists(csv_file_path):
-        with open(csv_file_path, 'w', newline='') as csvfile:
-            csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
-            csv_writer.writeheader()
-
-    while True:
-        # Check for scrape_interval
-        time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
-        # Scrape data
-        for url in urls:
-            # Initialize Chrome webdriver
-            options = Options()
-            options.headless = True
-            driver = webdriver.Chrome(executable_path='/path/to/chromedriver', options=options)
-            driver.maximize_window()
-            driver.set_window_size(1920, 1080)
-            driver.implicitly_wait(10)
-            driver.get(url)
-
-            # Wait for page to load
-            wait = WebDriverWait(driver, 10)
-            wait.until(EC.title_is('Culver Community Schools'))
-
-            # Scrape data
-            soup = BeautifulSoup(driver.page_source, 'html.parser')
-            content = None
-            if content_type == 'text':
-                content = soup.get_text()
-            elif content_type == 'media':
-                content = [img['src'] for img in soup.find_all('img')]
-            else:
-                raise Exception('Invalid content type')
-
-            # Calculate hash of the content
-            content_hash = hashlib.md5(str(content).encode('utf-8')).hexdigest()
-
-            # Check if the content has changed
-            with open(csv_file_path, 'r', newline='') as csvfile:
-                csv_reader = csv.DictReader(csvfile)
-                rows = list(csv_reader)
-                if rows:
-                    last_row = rows[-1]
-                    if last_row['url'] == url and last_row['change'] == content_hash:
-                        print(f"No changes detected on {url}")
-                        continue
-
-            # Save data to CSV file
-            with open(csv_file_path, 'a', newline='') as csvfile:
-                csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
-                csv_writer.writerow({
-                    "date": datetime.datetime.now().strftime("%Y-%m-%d"),
-                    "time": datetime.datetime.now().strftime("%H:%M:%S"),
-                    "url": url,
-                    "change": content_hash
-                })
-
-            # Save data to file
-            with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}", 'w') as f:
-                if content_type == 'text':
-                    f.write(content)
-                elif content_type == 'media':
-                    for img in content:
-                        response = requests.get(img)
-                        with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape/{hashlib.md5(response.content).hexdigest()[:10]}.jpg", 'wb') as f:
-                            f.write(response.content)
-                else:
-                    raise Exception('Invalid content type')
-
-            handle_output(f"Scraped {url} and saved data to {csv_file_path}")
-            handle_output(f"Scraped {url} and saved data to {storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}")
-            inspector.end_transaction()
-
-    # Handle errors
-    for error in ERROR_HISTORY:
-        handle_error(error)
-
-    # Return scraping status
-    handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")
-
-def handle_system():
-    handle_output(f"System: {SYSTEM_OUTPUT}")
-
-def handle_ui(ui):
-    # Start scraping
-    urls = ['https://www.culver.org/', 'https://www.culver.org/about-us/', 'https://www.culver.org/academics/', 'https://www.culver.org/athletics/', 'https://www.culver.org/arts-and-humanities/', 'https://www.culver.org/fine-and-performing-arts/', 'https://www.culver.org/clubs/', 'https://www.culver.org/community-education/', 'https://www.culver.org/community-outreach/']
-    scrape_interval = 5  # Define the scrape interval
-    content_type = 'text'  # Define the content type
-    start_scraping('scrape_data', *urls, scrape_interval, content_type)
-
-if __name__ == '__main__':
-    # Read input
-    input = "Start scraping https://www.culver.org/ and save data to scrape_data directory."
-    # Call functions
-    handle_input(input)
-    handle_system()
-    # Run system
-    handle_ui()
+import datetime
 import os
+import csv
+import time
 import hashlib
+import logging
+import gradio as gr
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from webdriver_manager.chrome import ChromeDriverManager
+from huggingface_hub import InferenceClient
+import random
+import yaml
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# Define constants
+DATE_TIME_STR = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since your last observation. Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
+HISTORY = []
+CURRENT_TASK = None
+DEFAULT_FILE_PATH = "user/app/scraped_data/culver/culvers_changes.csv"
+
+# Ensure the directory exists
+os.makedirs(os.path.dirname(DEFAULT_FILE_PATH), exist_ok=True)
+
+# Function to monitor URLs for changes
+def monitor_urls(storage_location, urls, scrape_interval, content_type):
+    global HISTORY
+    previous_hashes = [""] * len(urls)
+
+    try:
+        with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
+            while True:
+                for i, url in enumerate(urls):
+                    try:
+                        driver.get(url)
+                        time.sleep(2)  # Wait for the page to load
+                        if content_type == "text":
+                            current_content = driver.page_source
+                        elif content_type == "media":
+                            current_content = driver.find_elements_by_tag_name("img")
+                        else:
+                            current_content = driver.page_source
+                        current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
+                        if current_hash != previous_hashes[i]:
+                            previous_hashes[i] = current_hash
+                            date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                            HISTORY.append(f"Change detected at {url} on {date_time_str}")
+                            with open(storage_location, "a", newline="") as csvfile:
+                                csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
+                                csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
+                            logging.info(f"Change detected at {url} on {date_time_str}")
+                    except Exception as e:
+                        logging.error(f"Error accessing {url}: {e}")
+                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+    except Exception as e:
+        logging.error(f"Error starting ChromeDriver: {e}")
+
+# Define main function to handle user input
+def handle_input(storage_location, urls, scrape_interval, content_type):
+    global CURRENT_TASK, HISTORY
+
+    CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
+    HISTORY.append(f"Task started: {CURRENT_TASK}")
+    monitor_urls(storage_location, urls, scrape_interval, content_type)
+    return TASK_PROMPT.format(task=CURRENT_TASK, history="\n".join(map(str, HISTORY)))
+
+# Load custom prompts
+try:
+    with open("custom_prompts.yaml", "r") as fp:
+        custom_prompts = yaml.safe_load(fp)
+except FileNotFoundError:
+    custom_prompts = {"WEB_DEV": "", "AI_SYSTEM_PROMPT": "", "PYTHON_CODE_DEV": "", "CODE_GENERATION": "", "CODE_INTERPRETATION": "", "CODE_TRANSLATION": "", "CODE_IMPLEMENTATION": ""}
+
+# Define agents
+AGENTS = ["WEB_DEV", "AI_SYSTEM_PROMPT", "PYTHON_CODE_DEV", "CODE_GENERATION", "CODE_INTERPRETATION", "CODE_TRANSLATION", "CODE_IMPLEMENTATION"]
+
+# Define the Mistral inference client
+client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
+
+# Define the chat response function
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    return generate(message, history, system_message, max_tokens, temperature, top_p)
+
+# Function to start scraping
 def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
     urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
+    handle_input(storage_location, urls, scrape_interval, content_type)
+    return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
+
+# Function to display CSV content
+def display_csv(storage_location):
+    if os.path.exists(storage_location):
+        with open(storage_location, "r") as file:
+            return file.read()
+    else:
+        return "No data available."
+
+# Create Gradio interface
+def chat_interface(message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
+    global HISTORY
+    response = respond(message, HISTORY, system_message, max_tokens, temperature, top_p)
+    HISTORY.append((message, response))
+    return HISTORY, ""
+
+demo = gr.Blocks()
+
+with demo:
+    with gr.Row():
+        with gr.Column():
+            message = gr.Textbox(label="Message")
+            system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
+            max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
+            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
+            storage_location = gr.Textbox(value=DEFAULT_FILE_PATH, label="Storage Location")
+            url1 = gr.Textbox(value="https://www.culver.k12.in/", label="URL 1")
+            url2 = gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2")
+            url3 = gr.Textbox(label="URL 3")
+            url4 = gr.Textbox(label="URL 4")
+            url5 = gr.Textbox(label="URL 5")
+            url6 = gr.Textbox(label="URL 6")
+            url7 = gr.Textbox(label="URL 7")
+            url8 = gr.Textbox(label="URL 8")
+            url9 = gr.Textbox(label="URL 9")
+            url10 = gr.Textbox(label="URL 10")
+            scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
+            content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
+            start_button = gr.Button("Start Scraping")
+            csv_output = gr.Textbox(label="CSV Output", interactive=False)
+
+        with gr.Column():
+            chat_history = gr.Chatbot(label="Chat History")
+            response_box = gr.Textbox(label="Response")
+
+    start_button.click(start_scraping, inputs=[storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type], outputs=csv_output)
+    message.submit(chat_interface, inputs=[message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type], outputs=[chat_history, response_box])
+
+if __name__ == "__main__":
+    demo.launch()
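
Note that the added `respond` function delegates to a `generate` helper, and `handle_input` formats a `TASK_PROMPT`, neither of which is defined anywhere in this version of app.py. Purely as an illustration, a minimal sketch of what such a `generate` helper might look like against the `InferenceClient` created above; the helper name, message layout, and the assumption that `history` holds (user, assistant) pairs are hypothetical and not part of this commit:

# Hypothetical sketch only -- not part of this commit.
# Assumes `client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")` as in app.py
# and that `history` is a list of (user_message, assistant_message) tuples.
def generate(message, history, system_message, max_tokens, temperature, top_p):
    # Build a chat transcript from the system prompt, prior turns, and the new message
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    # Call the hosted model through huggingface_hub's chat_completion API
    result = client.chat_completion(
        messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return result.choices[0].message.content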