acecalisto3 committed
Commit 885ce0d · verified · 1 Parent(s): d4febba

Update app.py

Files changed (1)
  1. app.py +143 -133
app.py CHANGED
@@ -1,150 +1,160 @@
  import datetime
  import os
- import csv
- import time
- import hashlib
  import logging
- import gradio as gr
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from webdriver_manager.chrome import ChromeDriverManager
- from huggingface_hub import InferenceClient
- import random
- import yaml

- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

- # Define constants
- DATE_TIME_STR = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
- PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since your last observation. Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
- HISTORY = []
- CURRENT_TASK = None
- DEFAULT_FILE_PATH = "user/app/scraped_data/culver/culvers_changes.csv"

  # Ensure the directory exists
- os.makedirs(os.path.dirname(DEFAULT_FILE_PATH), exist_ok=True)

- # Function to monitor URLs for changes
- def monitor_urls(storage_location, urls, scrape_interval, content_type):
-     global HISTORY
-     previous_hashes = [""] * len(urls)

      try:
-         with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options()) as driver:
-             while True:
-                 for i, url in enumerate(urls):
-                     try:
-                         driver.get(url)
-                         time.sleep(2)  # Wait for the page to load
-                         if content_type == "text":
-                             current_content = driver.page_source
-                         elif content_type == "media":
-                             current_content = driver.find_elements_by_tag_name("img")
-                         else:
-                             current_content = driver.page_source
-                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-                         if current_hash != previous_hashes[i]:
-                             previous_hashes[i] = current_hash
-                             date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                             HISTORY.append(f"Change detected at {url} on {date_time_str}")
-                             with open(storage_location, "a", newline="") as csvfile:
-                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
-                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
-                             logging.info(f"Change detected at {url} on {date_time_str}")
-                     except Exception as e:
-                         logging.error(f"Error accessing {url}: {e}")
-                 time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
      except Exception as e:
-         logging.error(f"Error starting ChromeDriver: {e}")

- # Define main function to handle user input
- def handle_input(storage_location, urls, scrape_interval, content_type):
-     global CURRENT_TASK, HISTORY

-     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
-     HISTORY.append(f"Task started: {CURRENT_TASK}")
-     monitor_urls(storage_location, urls, scrape_interval, content_type)
-     return TASK_PROMPT.format(task=CURRENT_TASK, history="\n".join(map(str, HISTORY)))

- # Load custom prompts
- try:
-     with open("custom_prompts.yaml", "r") as fp:
-         custom_prompts = yaml.safe_load(fp)
- except FileNotFoundError:
-     custom_prompts = {"WEB_DEV": "", "AI_SYSTEM_PROMPT": "", "PYTHON_CODE_DEV": "", "CODE_GENERATION": "", "CODE_INTERPRETATION": "", "CODE_TRANSLATION": "", "CODE_IMPLEMENTATION": ""}
-
- # Define agents
- AGENTS = ["WEB_DEV", "AI_SYSTEM_PROMPT", "PYTHON_CODE_DEV", "CODE_GENERATION", "CODE_INTERPRETATION", "CODE_TRANSLATION", "CODE_IMPLEMENTATION"]
-
- # Define the Mistral inference client
- client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
-
- # Define the chat response function
- def respond(message, history, system_message, max_tokens, temperature, top_p):
-     return generate(message, history, system_message, max_tokens, temperature, top_p)
-
- def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
-     urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
-     handle_input(storage_location, urls, scrape_interval, content_type)
-     # Start transaction
-     inspector.start_transaction('start_scraping')
-     # Scrape data
-     while True:
-         # Check for scrape_interval
-         time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
-     # End transaction
-     inspector.end_transaction()
-     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
-
- # Function to display CSV content
- def display_csv(storage_location):
-     if os.path.exists(storage_location):
-         with open(storage_location, "r") as file:
-             return file.read()
-     else:
-         return "No data available."
-
- # Create Gradio interface
- def chat_interface(message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
-     global HISTORY
-     response = respond(message, HISTORY, system_message, max_tokens, temperature, top_p)
-     HISTORY.append((message, response))
-     return HISTORY, ""
-
- demo = gr.Blocks()
-
- with demo:
-     with gr.Row():
-         with gr.Column():
-             message = gr.Textbox(label="Message")
-             system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
-             max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
-             temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
-             top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-             storage_location = gr.Textbox(value=DEFAULT_FILE_PATH, label="Storage Location")
-             url1 = gr.Textbox(value="https://www.culver.k12.in/", label="URL 1")
-             url2 = gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2")
-             url3 = gr.Textbox(label="URL 3")
-             url4 = gr.Textbox(label="URL 4")
-             url5 = gr.Textbox(label="URL 5")
-             url6 = gr.Textbox(label="URL 6")
-             url7 = gr.Textbox(label="URL 7")
-             url8 = gr.Textbox(label="URL 8")
-             url9 = gr.Textbox(label="URL 9")
-             url10 = gr.Textbox(label="URL 10")
-             scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
-             content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
-             start_button = gr.Button("Start Scraping")
-             csv_output = gr.Textbox(label="CSV Output", interactive=False)

-         with gr.Column():
-             chat_history = gr.Chatbot(label="Chat History")
-             response_box = gr.Textbox(label="Response")
-
-     start_button.click(start_scraping, inputs=[storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type], outputs=csv_output)
-     message.submit(chat_interface, inputs=[message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type], outputs=[chat_history, response_box])

  if __name__ == "__main__":
-     demo.launch()
+
+ import gradio as gr
+ import pandas as pd
+ import sqlite3
+ from feedgen.feed import FeedGenerator
  import datetime
  import os
  import logging
+ import sys
+ import csv
+ import traceback

+ sys.path.append('/home/user')
+ from app.background_tasks import start_background_monitoring, create_database

+ # Set up absolute paths
+ BASE_DIR = '/home/user/app/scraped_data/culver'
+ LOG_FILE = os.path.join(BASE_DIR, 'main.log')
+ CSV_FILE = os.path.join(BASE_DIR, 'culvers_changes.csv')
+ DB_FILE = os.path.join(BASE_DIR, 'culvers_changes.db')
+ XML_FILE = os.path.join(BASE_DIR, 'culvers_changes.xml')

  # Ensure the directory exists
+ try:
+     os.makedirs(BASE_DIR, exist_ok=True)
+     print(f"Directory created or already exists: {BASE_DIR}")
+ except Exception as e:
+     print(f"Error creating directory: {e}")
+     traceback.print_exc()

+ # Configure logging
+ try:
+     logging.basicConfig(filename=LOG_FILE, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+     print(f"Logging configured. Log file: {LOG_FILE}")
+ except Exception as e:
+     print(f"Error configuring logging: {e}")
+     traceback.print_exc()

+ # Write directly to log file
+ try:
+     with open(LOG_FILE, 'w') as log_file:
+         log_file.write(f"Log file created at {datetime.datetime.now()}\n")
+     print(f"Log file created: {LOG_FILE}")
+ except Exception as e:
+     print(f"Error writing to log file: {e}")
+     traceback.print_exc()
+
+ # Write directly to CSV file
+ try:
+     with open(CSV_FILE, 'w', newline='') as csv_file:
+         writer = csv.writer(csv_file)
+         writer.writerow(['date', 'time', 'url', 'change'])
+         writer.writerow([datetime.datetime.now().strftime("%Y-%m-%d"), datetime.datetime.now().strftime("%H:%M:%S"), 'Initial', 'CSV file created'])
+     print(f"CSV file created: {CSV_FILE}")
+ except Exception as e:
+     print(f"Error writing to CSV file: {e}")
+     traceback.print_exc()
+
+ # Start background monitoring
+ urls = ["https://www.culver.k12.in.us/", "https://www.facebook.com/CulverCommunitySchools"]
+ try:
+     start_background_monitoring(CSV_FILE, urls, 1, "text")  # Changed interval to 1 minute for testing
+     print("Background monitoring started")
+ except Exception as e:
+     print(f"Error starting background monitoring: {e}")
+     traceback.print_exc()
+
+ logging.info("Background monitoring initiated from main.py")
+
+ def view_scraped_data():
      try:
+         create_database()  # Ensure the database and table exist
+         conn = sqlite3.connect(DB_FILE)
+         df = pd.read_sql_query("SELECT * FROM changes ORDER BY date DESC, time DESC LIMIT 50", conn)
+         conn.close()
+         return df
      except Exception as e:
+         print(f"Error viewing scraped data: {e}")
+         traceback.print_exc()
+         return pd.DataFrame()

+ def view_rss_feed():
+     try:
+         with open(XML_FILE, 'r') as file:
+             return file.read()
+     except FileNotFoundError:
+         return "RSS feed not generated yet."
+     except Exception as e:
+         print(f"Error viewing RSS feed: {e}")
+         traceback.print_exc()
+         return "Error viewing RSS feed"

+ def generate_rss_feed():
+     try:
+         create_database()  # Ensure the database and table exist
+         fg = FeedGenerator()
+         fg.title('Culvers Site Changes')
+         fg.link(href='http://example.com', rel='alternate')
+         fg.description('Recent changes detected on Culvers websites')
+
+         conn = sqlite3.connect(DB_FILE)
+         c = conn.cursor()
+         c.execute("SELECT * FROM changes ORDER BY date DESC, time DESC LIMIT 20")
+         changes = c.fetchall()
+
+         for change in changes:
+             fe = fg.add_entry()
+             fe.id(str(change[0]))
+             fe.title(f'Change detected at {change[3]}')
+             fe.link(href=change[3])
+             fe.description(change[4])
+             fe.pubDate(datetime.datetime.strptime(f"{change[1]} {change[2]}", "%Y-%m-%d %H:%M:%S"))
+
+         conn.close()
+
+         fg.rss_file(XML_FILE)
+         return "RSS feed generated successfully."
+     except Exception as e:
+         print(f"Error generating RSS feed: {e}")
+         traceback.print_exc()
+         return "Error generating RSS feed"

+ def create_viewer():
+     with gr.Blocks() as demo:
+         gr.Markdown("# Culvers Site Monitor and Viewer")
+
+         with gr.Tab("Monitor Status"):
+             gr.Markdown("Continuous monitoring is active for the following URLs:")
+             for url in urls:
+                 gr.Markdown(f"- {url}")
+             gr.Markdown(f"Monitoring interval: 1 minute")
+             gr.Markdown(f"Data is being stored in: {CSV_FILE}")

+         with gr.Tab("View Scraped Data"):
+             gr.DataFrame(view_scraped_data, label="Recent Changes")
+
+         with gr.Tab("View RSS Feed"):
+             gr.TextArea(view_rss_feed, label="RSS Feed Content")
+             gr.Button("Generate RSS Feed").click(generate_rss_feed, outputs=gr.TextArea(label="Generation Status"))
+
+     return demo

  if __name__ == "__main__":
+     try:
+         # Create the database and table before launching the viewer
+         create_database()
+         print("Database created")
+
+         # Create and launch the viewer
+         viewer = create_viewer()
+         print("Viewer created")
+         viewer.launch()
+         print("Viewer launched")
+
+         logging.info("Web-based viewer created and launched with continuous monitoring.")
+     except Exception as e:
+         print(f"Error in main execution: {e}")
+         traceback.print_exc()
+
+ print("Main application file updated with error handling, console logging, and all necessary functions.")
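For context, the updated app.py imports create_database and start_background_monitoring from app.background_tasks, a module that is not included in this commit. Below is a minimal sketch of what that module might look like: only the two function names come from the imports above, the changes table schema (id, date, time, url, change) is inferred from the queries in app.py, and everything else (plain HTTP fetches in a daemon thread instead of the removed Selenium loop) is an assumption, not the author's actual implementation.

# Hypothetical sketch of app/background_tasks.py -- NOT part of this commit.
# Only create_database and start_background_monitoring are known from the
# imports in app.py; the table schema is inferred from its SQL queries.
import datetime
import hashlib
import logging
import sqlite3
import threading
import time

import requests  # assumption: simple HTTP fetches instead of the removed Selenium driver

DB_FILE = '/home/user/app/scraped_data/culver/culvers_changes.db'


def create_database():
    """Create the SQLite database and the `changes` table if they do not exist."""
    conn = sqlite3.connect(DB_FILE)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS changes ("
        "id INTEGER PRIMARY KEY AUTOINCREMENT, "
        "date TEXT, time TEXT, url TEXT, change TEXT)"
    )
    conn.commit()
    conn.close()


def _monitor(storage_location, urls, scrape_interval, content_type):
    """Poll each URL, hash its content, and record a row whenever the hash changes."""
    previous_hashes = {url: "" for url in urls}
    while True:
        for url in urls:
            try:
                content = requests.get(url, timeout=30).text
                current_hash = hashlib.md5(content.encode("utf-8")).hexdigest()
                if current_hash != previous_hashes[url]:
                    previous_hashes[url] = current_hash
                    now = datetime.datetime.now()
                    conn = sqlite3.connect(DB_FILE)
                    conn.execute(
                        "INSERT INTO changes (date, time, url, change) VALUES (?, ?, ?, ?)",
                        (now.strftime("%Y-%m-%d"), now.strftime("%H:%M:%S"), url, "Content changed"),
                    )
                    conn.commit()
                    conn.close()
            except Exception as e:
                logging.error(f"Error accessing {url}: {e}")
        time.sleep(scrape_interval * 60)  # wait scrape_interval minutes between passes


def start_background_monitoring(storage_location, urls, scrape_interval, content_type):
    """Run the monitoring loop in a daemon thread so the Gradio app stays responsive.

    storage_location and content_type are accepted to match the call in app.py,
    but this sketch only writes to the SQLite database.
    """
    thread = threading.Thread(
        target=_monitor,
        args=(storage_location, urls, scrape_interval, content_type),
        daemon=True,
    )
    thread.start()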