acecalisto3 committed (verified)
Commit 17ea76e · 1 Parent(s): 82cd09a

Update app.py

Files changed (1):
  1. app.py +102 -138
app.py CHANGED
@@ -1,150 +1,114 @@
- import datetime
- import os
- import csv
  import time
+ import csv
+ import os
  import hashlib
- import logging
- import gradio as gr
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
+ import re
+ import requests
+ from bs4 import BeautifulSoup
  from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.common.keys import Keys
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
  from webdriver_manager.chrome import ChromeDriverManager
- from huggingface_hub import InferenceClient
- import random
- import yaml
-
- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- # Define constants
- DATE_TIME_STR = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
- PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since your last observation. Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
- HISTORY = []
- CURRENT_TASK = None
- DEFAULT_FILE_PATH = "user/app/scraped_data/culver/culvers_changes.csv"
-
- # Ensure the directory exists
- os.makedirs(os.path.dirname(DEFAULT_FILE_PATH), exist_ok=True)
-
- # Function to monitor URLs for changes
- def monitor_urls(storage_location, urls, scrape_interval, content_type):
-     global HISTORY
-     previous_hashes = [""] * len(urls)
-
-     try:
-         with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
-             while True:
-                 for i, url in enumerate(urls):
-                     try:
-                         driver.get(url)
-                         time.sleep(2) # Wait for the page to load
-                         if content_type == "text":
-                             current_content = driver.page_source
-                         elif content_type == "media":
-                             current_content = driver.find_elements_by_tag_name("img")
-                         else:
-                             current_content = driver.page_source
-                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-                         if current_hash != previous_hashes[i]:
-                             previous_hashes[i] = current_hash
-                             date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                             HISTORY.append(f"Change detected at {url} on {date_time_str}")
-                             with open(storage_location, "a", newline="") as csvfile:
-                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
-                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
-                             logging.info(f"Change detected at {url} on {date_time_str}")
-                     except Exception as e:
-                         logging.error(f"Error accessing {url}: {e}")
-                 time.sleep(scrape_interval * 60) # Check every scrape_interval minutes
-     except Exception as e:
-         logging.error(f"Error starting ChromeDriver: {e}")
-
- # Define main function to handle user input
- def handle_input(storage_location, urls, scrape_interval, content_type):
-     global CURRENT_TASK, HISTORY
-
-     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
-     HISTORY.append(f"Task started: {CURRENT_TASK}")
-     monitor_urls(storage_location, urls, scrape_interval, content_type)
-     return TASK_PROMPT.format(task=CURRENT_TASK, history="\n".join(map(str, HISTORY)))
-
- # Load custom prompts
- try:
-     with open("custom_prompts.yaml", "r") as fp:
-         custom_prompts = yaml.safe_load(fp)
- except FileNotFoundError:
-     custom_prompts = {"WEB_DEV": "", "AI_SYSTEM_PROMPT": "", "PYTHON_CODE_DEV": "", "CODE_GENERATION": "", "CODE_INTERPRETATION": "", "CODE_TRANSLATION": "", "CODE_IMPLEMENTATION": ""}
-
- # Define agents
- AGENTS = ["WEB_DEV", "AI_SYSTEM_PROMPT", "PYTHON_CODE_DEV", "CODE_GENERATION", "CODE_INTERPRETATION", "CODE_TRANSLATION", "CODE_IMPLEMENTATION"]
-
- # Define the Mistral inference client
- client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
-
- # Define the chat response function
- def respond(message, history, system_message, max_tokens, temperature, top_p):
-     return generate(message, history, system_message, max_tokens, temperature, top_p)
+ from inspector import Configuration, Inspector
+
+ config = Configuration('5713ec1deb658fd2e6c069ce313ddaa34e2feee3')
+ inspector = Inspector(config)
+ inspector.start_transaction('my python script')
+
+ def handle_input(input):
+     global CURRENT_INPUT
+     CURRENT_INPUT = input
+
+ def handle_output(output):
+     global OUTPUT_HISTORY
+     OUTPUT_HISTORY.append((CURRENT_INPUT, output))
+
+ def handle_system(output):
+     global SYSTEM_OUTPUT
+     SYSTEM_OUTPUT = output
+
+ def handle_error(error):
+     global ERROR_HISTORY
+     ERROR_HISTORY.append((CURRENT_INPUT, error))

  def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
      urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
-     handle_input(storage_location, urls, scrape_interval, content_type)
-     # Start transaction
-     inspector.start_transaction('start_scraping')
-     # Scrape data
+     handle_input(f"Start scraping {', '.join(urls)} every {scrape_interval} minutes.")
      while True:
          # Check for scrape_interval
          time.sleep(scrape_interval * 60) # Check every scrape_interval minutes
+         # Scrape data
+         for url in urls:
+             # Initialize Chrome webdriver
+             options = Options()
+             options.headless = True
+             driver = webdriver.Chrome(executable_path='/path/to/chromedriver', options=options)
+             driver.maximize_window()
+             driver.set_window_size(1920, 1080)
+             driver.implicitly_wait(10)
+             driver.get(url)
+
+             # Wait for page to load
+             wait = WebDriverWait(driver, 10)
+             wait.until(EC.title_is('Culver Community Schools'))
+
+             # Scrape data
+             soup = BeautifulSoup(driver.page_source, 'html.parser')
+             content = None
+             if content_type == 'text':
+                 content = soup.get_text()
+             elif content_type == 'media':
+                 content = [img['src'] for img in soup.find_all('img')]
+             else:
+                 raise Exception('Invalid content type')
+
+             # Save data
+             with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}", 'w') as f:
+                 if content_type == 'text':
+                     f.write(content)
+                 elif content_type == 'media':
+                     for img in content:
+                         response = requests.get(img)
+                         with open(f"{storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape/{hashlib.md5(response.content).hexdigest()[:10]}.jpg", 'wb') as f:
+                             f.write(response.content)
+                 else:
+                     raise Exception('Invalid content type')
+
+             # End transaction
+             inspector.end_transaction()
+             handle_output(f"Scraped {url} and saved data to {storage_location}/{url.split('/')[-2]}/{url.split('/')[-1]}_scrape.{content_type}")
+
          # End transaction
          inspector.end_transaction()
-     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
-
- # Function to display CSV content
- def display_csv(storage_location):
-     if os.path.exists(storage_location):
-         with open(storage_location, "r") as file:
-             return file.read()
-     else:
-         return "No data available."
-
- # Create Gradio interface
- def chat_interface(message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
-     global HISTORY
-     response = respond(message, HISTORY, system_message, max_tokens, temperature, top_p)
-     HISTORY.append((message, response))
-     return HISTORY, ""
-
- demo = gr.Blocks()
-
- with demo:
-     with gr.Row():
-         with gr.Column():
-             message = gr.Textbox(label="Message")
-             system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
-             max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
-             temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
-             top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-             storage_location = gr.Textbox(value=DEFAULT_FILE_PATH, label="Storage Location")
-             url1 = gr.Textbox(value="https://www.culver.k12.in/", label="URL 1")
-             url2 = gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2")
-             url3 = gr.Textbox(label="URL 3")
-             url4 = gr.Textbox(label="URL 4")
-             url5 = gr.Textbox(label="URL 5")
-             url6 = gr.Textbox(label="URL 6")
-             url7 = gr.Textbox(label="URL 7")
-             url8 = gr.Textbox(label="URL 8")
-             url9 = gr.Textbox(label="URL 9")
-             url10 = gr.Textbox(label="URL 10")
-             scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
-             content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
-             start_button = gr.Button("Start Scraping")
-             csv_output = gr.Textbox(label="CSV Output", interactive=False)
-
-         with gr.Column():
-             chat_history = gr.Chatbot(label="Chat History")
-             response_box = gr.Textbox(label="Response")
-
-     start_button.click(start_scraping, inputs=[storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type], outputs=csv_output)
-     message.submit(chat_interface, inputs=[message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type], outputs=[chat_history, response_box])
-
- if __name__ == "__main__":
-     demo.launch()
+
+         # Handle errors
+         for error in ERROR_HISTORY:
+             handle_error(error)
+
+         # Return scraping status
+         handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")
+
+ def handle_system():
+     handle_output(f"System: {SYSTEM_OUTPUT}")
+
+ def handle_ui(ui):
+     # Start scraping
+     start_scraping('scrape_data', 'https://www.culver.org/', 'https://www.culver.org/about-us/', 'https://www.culver.org/academics/', 'https://www.culver.org/athletics/', 'https://www.culver.org/arts-and-humanities/', 'https://www.culver.org/fine-and-performing-arts/', 'https://www.culver.org/clubs/', 'https://www.culver.org/community-education/', 'https://www.culver.org/community-outreach/')
+
+     # Handle errors
+     for error in ERROR_HISTORY:
+         handle_error(error)
+
+     # Return scraping status
+     handle_output(f"Scraping {', '.join(urls)} every {scrape_interval} minutes.")
+
+ if __name__ == '__main__':
+     # Read input
+     input = "Start scraping https://www.culver.org/ and save data to scrape_data directory."
+     # Call functions
+     handle_input(input)
+     handle_system()
+     # Run system
+     handle_ui()
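
For reference, the added code calls webdriver.Chrome(...) and uses the globals CURRENT_INPUT, OUTPUT_HISTORY, SYSTEM_OUTPUT, and ERROR_HISTORY, none of which are defined anywhere in this diff. A minimal sketch of the module-level definitions the new app.py appears to assume is shown below; the names are taken from the call sites above, while the import placement and initial values are assumptions rather than part of the commit.

from selenium import webdriver  # assumed: needed for the webdriver.Chrome(...) call in start_scraping()

# Assumed module-level state for the handler functions; names match the
# `global` statements above, initial values are guesses.
CURRENT_INPUT = None    # last value passed to handle_input()
OUTPUT_HISTORY = []     # (input, output) pairs appended by handle_output()
SYSTEM_OUTPUT = None    # last value stored by handle_system(output)
ERROR_HISTORY = []      # (input, error) pairs appended by handle_error()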