Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,150 +1,114 @@
-import datetime
-import os
-import csv
 import time
 import hashlib
-import …
-import …
-from …
-from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from webdriver_manager.chrome import ChromeDriverManager
-from …
-
-[… removed lines not captured in this view …]
-def …
-    global …
-
-
-    try:
-        with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
-            while True:
-                for i, url in enumerate(urls):
-                    try:
-                        driver.get(url)
-                        time.sleep(2)  # Wait for the page to load
-                        if content_type == "text":
-                            current_content = driver.page_source
-                        elif content_type == "media":
-                            current_content = driver.find_elements_by_tag_name("img")
-                        else:
-                            current_content = driver.page_source
-                        current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-                        if current_hash != previous_hashes[i]:
-                            previous_hashes[i] = current_hash
-                            date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                            HISTORY.append(f"Change detected at {url} on {date_time_str}")
-                            with open(storage_location, "a", newline="") as csvfile:
-                                csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
-                                csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
-                            logging.info(f"Change detected at {url} on {date_time_str}")
-                    except Exception as e:
-                        logging.error(f"Error accessing {url}: {e}")
-                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
-    except Exception as e:
-        logging.error(f"Error starting ChromeDriver: {e}")
-
-# Define main function to handle user input
-def handle_input(storage_location, urls, scrape_interval, content_type):
-    global CURRENT_TASK, HISTORY
-
-    CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
-    HISTORY.append(f"Task started: {CURRENT_TASK}")
-    monitor_urls(storage_location, urls, scrape_interval, content_type)
-    return TASK_PROMPT.format(task=CURRENT_TASK, history="\n".join(map(str, HISTORY)))
-
-# Load custom prompts
-try:
-    with open("custom_prompts.yaml", "r") as fp:
-        custom_prompts = yaml.safe_load(fp)
-except FileNotFoundError:
-    custom_prompts = {"WEB_DEV": "", "AI_SYSTEM_PROMPT": "", "PYTHON_CODE_DEV": "", "CODE_GENERATION": "", "CODE_INTERPRETATION": "", "CODE_TRANSLATION": "", "CODE_IMPLEMENTATION": ""}
-
-# Define agents
-AGENTS = ["WEB_DEV", "AI_SYSTEM_PROMPT", "PYTHON_CODE_DEV", "CODE_GENERATION", "CODE_INTERPRETATION", "CODE_TRANSLATION", "CODE_IMPLEMENTATION"]
-
-# Define the Mistral inference client
-client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
-
-# Define the chat response function
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    return generate(message, history, system_message, max_tokens, temperature, top_p)

 def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
     urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
-    handle_input(
-    # Start transaction
-    inspector.start_transaction('start_scraping')
-    # Scrape data
     while True:
         # Check for scrape_interval
         time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
         # End transaction
         inspector.end_transaction()
-
-[… removed lines not captured in this view …]
-def …
-[… removed lines not captured in this view …]
-        url3 = gr.Textbox(label="URL 3")
-        url4 = gr.Textbox(label="URL 4")
-        url5 = gr.Textbox(label="URL 5")
-        url6 = gr.Textbox(label="URL 6")
-        url7 = gr.Textbox(label="URL 7")
-        url8 = gr.Textbox(label="URL 8")
-        url9 = gr.Textbox(label="URL 9")
-        url10 = gr.Textbox(label="URL 10")
-        scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
-        content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
-        start_button = gr.Button("Start Scraping")
-        csv_output = gr.Textbox(label="CSV Output", interactive=False)
-
-    with gr.Column():
-        chat_history = gr.Chatbot(label="Chat History")
-        response_box = gr.Textbox(label="Response")
-
-    start_button.click(start_scraping, inputs=[storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type], outputs=csv_output)
-    message.submit(chat_interface, inputs=[message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type], outputs=[chat_history, response_box])
-
-if __name__ == "__main__":
-    demo.launch()
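The removed block above deleted the Selenium-based change-detection loop. Its def line and global declaration were not captured in this view; judging from the call to monitor_urls(storage_location, urls, scrape_interval, content_type) in the removed handle_input(), that loop was the monitor_urls helper. Below is a minimal runnable sketch of the same loop, with the uncaptured pieces (function header, HISTORY list, previous_hashes initialisation) filled in as assumptions, and the Selenium-3-only find_elements_by_tag_name call replaced by the Selenium 4 By API.

# Hypothetical reconstruction; only the body and the call site survive in the diff.
import csv
import datetime
import hashlib
import logging
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

HISTORY = []  # assumed module-level log, mirroring HISTORY.append() in the removed code


def monitor_urls(storage_location, urls, scrape_interval, content_type):
    previous_hashes = [""] * len(urls)  # assumed initialisation of the per-URL hash cache
    try:
        with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
            while True:
                for i, url in enumerate(urls):
                    try:
                        driver.get(url)
                        time.sleep(2)  # wait for the page to load
                        if content_type == "media":
                            current_content = driver.find_elements(By.TAG_NAME, "img")
                        else:
                            current_content = driver.page_source
                        current_hash = hashlib.md5(str(current_content).encode("utf-8")).hexdigest()
                        if current_hash != previous_hashes[i]:
                            previous_hashes[i] = current_hash
                            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                            HISTORY.append(f"Change detected at {url} on {now}")
                            with open(storage_location, "a", newline="") as csvfile:
                                writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
                                writer.writerow({"date": now.split()[0], "time": now.split()[1],
                                                 "url": url, "change": "Content changed"})
                            logging.info("Change detected at %s on %s", url, now)
                    except Exception as e:
                        logging.error("Error accessing %s: %s", url, e)
                time.sleep(scrape_interval * 60)  # check every scrape_interval minutes
    except Exception as e:
        logging.error("Error starting ChromeDriver: %s", e)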
 import time
+import csv
+import os
 import hashlib
+import re
+import requests
+from bs4 import BeautifulSoup
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 from webdriver_manager.chrome import ChromeDriverManager
+from inspector import Configuration, Inspector
+
+config = Configuration('5713ec1deb658fd2e6c069ce313ddaa34e2feee3')
+inspector = Inspector(config)
+inspector.start_transaction('my python script')
+
+def handle_input(input):
+    global CURRENT_INPUT
+    CURRENT_INPUT = input
+
+def handle_output(output):
+    global OUTPUT_HISTORY
+    OUTPUT_HISTORY.append((CURRENT_INPUT, output))
+
+def handle_system(output):
+    global SYSTEM_OUTPUT
+    SYSTEM_OUTPUT = output
+
+def handle_error(error):
+    global ERROR_HISTORY
+    ERROR_HISTORY.append((CURRENT_INPUT, error))
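The four handlers above assign to module-level names (CURRENT_INPUT, OUTPUT_HISTORY, SYSTEM_OUTPUT, ERROR_HISTORY) that the committed file never initialises, so the first handle_output() or handle_error() call raises a NameError; this alone could account for the Space's runtime-error status. A minimal sketch of the state they appear to expect (names from the code above, initial values assumed):

# Hypothetical module-level state assumed by the handlers; not present in the commit.
CURRENT_INPUT = None   # last input passed to handle_input()
OUTPUT_HISTORY = []    # (input, output) pairs appended by handle_output()
SYSTEM_OUTPUT = ""     # last system message stored by handle_system()
ERROR_HISTORY = []     # (input, error) pairs appended by handle_error()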

 def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
     urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
+    handle_input(f"Start scraping {', '.join(urls)} every {scrape_interval} minutes.")
     while True:
         # Check for scrape_interval
         time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
+        # Scrape data
+        for url in urls:
+            # Initialize Chrome webdriver
+            options = Options()
+            options.headless = True
+            driver = webdriver.Chrome(executable_path='/path/to/chromedriver', options=options)
+            driver.maximize_window()
+            driver.set_window_size(1920, 1080)
+            driver.implicitly_wait(10)
+            driver.get(url)
+
+            # Wait for page to load
+            wait = WebDriverWait(driver, 10)
+            wait.until(EC.title_is('Culver Community Schools'))
+
+            # Scrape data
+            soup = BeautifulSoup(driver.page_source, 'html.parser')
+            content = None
+            if content_type == 'text':
+                content = soup.get_text()
+            elif content_type == 'media':
+                content = [img['src'] for img in soup.find_all('img')]
+            else:
+                raise Exception('Invalid content type')
+
+            # Save data
+            with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}", 'w') as f:
+                if content_type == 'text':
+                    f.write(content)
+                elif content_type == 'media':
+                    for img in content:
+                        response = requests.get(img)
+                        with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape/{hashlib.md5(response.content).hexdigest()[:10]}.jpg", 'wb') as f:
+                            f.write(response.content)
+                else:
+                    raise Exception('Invalid content type')
+
+            # End transaction
+            inspector.end_transaction()
+            handle_output(f"Scraped {url} and saved data to {storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}")
+
         # End transaction
         inspector.end_transaction()
+
+        # Handle errors
+        for error in ERROR_HISTORY:
+            handle_error(error)
+
+        # Return scraping status
+        handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")
+
+def handle_system():
+    handle_output(f"System: {SYSTEM_OUTPUT}")
+
+def handle_ui(ui):
+    # Start scraping
+    start_scraping('scrape_data', 'https://www.culver.org/', 'https://www.culver.org/about-us/', 'https://www.culver.org/academics/', 'https://www.culver.org/athletics/', 'https://www.culver.org/arts-and-humanities/', 'https://www.culver.org/fine-and-performing-arts/', 'https://www.culver.org/clubs/', 'https://www.culver.org/community-education/', 'https://www.culver.org/community-outreach/')
+
+    # Handle errors
+    for error in ERROR_HISTORY:
+        handle_error(error)
+
+    # Return scraping status
+    handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")
+
+if __name__ == '__main__':
+    # Read input
+    input = "Start scraping https://www.culver.org/ and save data to scrape_data directory."
+    # Call functions
+    handle_input(input)
+    handle_system()
+    # Run system
+    handle_ui()
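As committed, the entry point trips over the function signatures above: handle_system() reads SYSTEM_OUTPUT before anything has set it, handle_ui() is called without the ui argument it declares, and handle_ui itself passes only the storage location plus nine URLs to start_scraping, which expects thirteen positional arguments. A sketch of an entry point consistent with those signatures (the URL list, interval and content type are illustrative, and start_scraping loops forever once it is called):

# Sketch only; argument values are assumptions, not taken from the commit.
if __name__ == '__main__':
    handle_input("Start scraping https://www.culver.org/ every 5 minutes.")
    start_scraping(
        'scrape_data',                        # storage_location
        'https://www.culver.org/',            # url1
        'https://www.culver.org/about-us/',   # url2
        'https://www.culver.org/academics/',  # url3
        None, None, None, None, None, None, None,  # url4-url10 left unused; filtered out by `if url`
        5,                                    # scrape_interval in minutes
        'text',                               # content_type
    )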