Spaces:
Runtime error
acecalisto3
committed on
Update app.py
app.py
CHANGED
@@ -1,150 +1,160 @@
 import datetime
 import os
-import csv
-import time
-import hashlib
 import logging
-import
-from selenium.webdriver.chrome.options import Options
-from webdriver_manager.chrome import ChromeDriverManager
-from huggingface_hub import InferenceClient
-import random
-import yaml

 # Ensure the directory exists

     try:
-            time.sleep(2)  # Wait for the page to load
-            if content_type == "text":
-                current_content = driver.page_source
-            elif content_type == "media":
-                current_content = driver.find_elements_by_tag_name("img")
-            else:
-                current_content = driver.page_source
-            current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-            if current_hash != previous_hashes[i]:
-                previous_hashes[i] = current_hash
-                date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                HISTORY.append(f"Change detected at {url} on {date_time_str}")
-                with open(storage_location, "a", newline="") as csvfile:
-                    csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
-                    csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
-                logging.info(f"Change detected at {url} on {date_time_str}")
-        except Exception as e:
-            logging.error(f"Error accessing {url}: {e}")
-        time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
     except Exception as e:

-# Define the Mistral inference client
-client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
-
-# Define the chat response function
-def respond(message, history, system_message, max_tokens, temperature, top_p):
-    return generate(message, history, system_message, max_tokens, temperature, top_p)
-
-def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
-    urls = [url for url in [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10] if url]
-    handle_input(storage_location, urls, scrape_interval, content_type)
-    # Start transaction
-    inspector.start_transaction('start_scraping')
-    # Scrape data
-    while True:
-        # Check for scrape_interval
-        time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
-    # End transaction
-    inspector.end_transaction()
-    return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
-
-# Function to display CSV content
-def display_csv(storage_location):
-    if os.path.exists(storage_location):
-        with open(storage_location, "r") as file:
-            return file.read()
-    else:
-        return "No data available."
-
-# Create Gradio interface
-def chat_interface(message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
-    global HISTORY
-    response = respond(message, HISTORY, system_message, max_tokens, temperature, top_p)
-    HISTORY.append((message, response))
-    return HISTORY, ""
-
-demo = gr.Blocks()
-
-with demo:
-    with gr.Row():
-        with gr.Column():
-            message = gr.Textbox(label="Message")
-            system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
-            max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
-            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
-            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
-            storage_location = gr.Textbox(value=DEFAULT_FILE_PATH, label="Storage Location")
-            url1 = gr.Textbox(value="https://www.culver.k12.in/", label="URL 1")
-            url2 = gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2")
-            url3 = gr.Textbox(label="URL 3")
-            url4 = gr.Textbox(label="URL 4")
-            url5 = gr.Textbox(label="URL 5")
-            url6 = gr.Textbox(label="URL 6")
-            url7 = gr.Textbox(label="URL 7")
-            url8 = gr.Textbox(label="URL 8")
-            url9 = gr.Textbox(label="URL 9")
-            url10 = gr.Textbox(label="URL 10")
-            scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
-            content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
-            start_button = gr.Button("Start Scraping")
-            csv_output = gr.Textbox(label="CSV Output", interactive=False)

-        with gr.

 if __name__ == "__main__":
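The deleted half of the diff is truncated in places (the bare "import" and "with gr." lines) and never shows the function or driver setup that wrapped the Selenium loop. As a hedged reconstruction, a runnable sketch of that monitoring loop under assumed names (monitor_urls, a previous_hashes list seeded per URL) could look like this; only the hash-and-log body is recovered from the deleted lines, the rest is inference:

# Hedged reconstruction of the deleted monitoring loop. The function name,
# driver setup, and URL loop are assumptions; only the body from
# time.sleep(2) onward is recovered from the deleted lines above.
import csv
import datetime
import hashlib
import logging
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

HISTORY = []

def monitor_urls(storage_location, urls, scrape_interval, content_type):
    options = Options()
    options.add_argument("--headless")
    # Selenium 3-style constructor, consistent with the
    # find_elements_by_tag_name call used in the original code.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    previous_hashes = [""] * len(urls)
    try:
        while True:
            for i, url in enumerate(urls):
                try:
                    driver.get(url)
                    time.sleep(2)  # Wait for the page to load
                    if content_type == "text":
                        current_content = driver.page_source
                    elif content_type == "media":
                        current_content = driver.find_elements_by_tag_name("img")
                    else:
                        current_content = driver.page_source
                    current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
                    if current_hash != previous_hashes[i]:
                        previous_hashes[i] = current_hash
                        date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        HISTORY.append(f"Change detected at {url} on {date_time_str}")
                        with open(storage_location, "a", newline="") as csvfile:
                            csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
                            csv_writer.writerow({"date": date_time_str.split()[0],
                                                 "time": date_time_str.split()[1],
                                                 "url": url,
                                                 "change": "Content changed"})
                        logging.info(f"Change detected at {url} on {date_time_str}")
                except Exception as e:
                    logging.error(f"Error accessing {url}: {e}")
            time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
    finally:
        driver.quit()

The rewritten app.py that replaces all of the above follows.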
+
+import gradio as gr
+import pandas as pd
+import sqlite3
+from feedgen.feed import FeedGenerator
 import datetime
 import os
 import logging
+import sys
+import csv
+import traceback

+sys.path.append('/home/user')
+from app.background_tasks import start_background_monitoring, create_database

+# Set up absolute paths
+BASE_DIR = '/home/user/app/scraped_data/culver'
+LOG_FILE = os.path.join(BASE_DIR, 'main.log')
+CSV_FILE = os.path.join(BASE_DIR, 'culvers_changes.csv')
+DB_FILE = os.path.join(BASE_DIR, 'culvers_changes.db')
+XML_FILE = os.path.join(BASE_DIR, 'culvers_changes.xml')

 # Ensure the directory exists
+try:
+    os.makedirs(BASE_DIR, exist_ok=True)
+    print(f"Directory created or already exists: {BASE_DIR}")
+except Exception as e:
+    print(f"Error creating directory: {e}")
+    traceback.print_exc()

+# Configure logging
+try:
+    logging.basicConfig(filename=LOG_FILE, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+    print(f"Logging configured. Log file: {LOG_FILE}")
+except Exception as e:
+    print(f"Error configuring logging: {e}")
+    traceback.print_exc()

+# Write directly to log file
+try:
+    with open(LOG_FILE, 'w') as log_file:
+        log_file.write(f"Log file created at {datetime.datetime.now()}\n")
+    print(f"Log file created: {LOG_FILE}")
+except Exception as e:
+    print(f"Error writing to log file: {e}")
+    traceback.print_exc()
+
+# Write directly to CSV file
+try:
+    with open(CSV_FILE, 'w', newline='') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(['date', 'time', 'url', 'change'])
+        writer.writerow([datetime.datetime.now().strftime("%Y-%m-%d"), datetime.datetime.now().strftime("%H:%M:%S"), 'Initial', 'CSV file created'])
+    print(f"CSV file created: {CSV_FILE}")
+except Exception as e:
+    print(f"Error writing to CSV file: {e}")
+    traceback.print_exc()
+
+# Start background monitoring
+urls = ["https://www.culver.k12.in.us/", "https://www.facebook.com/CulverCommunitySchools"]
+try:
+    start_background_monitoring(CSV_FILE, urls, 1, "text")  # Changed interval to 1 minute for testing
+    print("Background monitoring started")
+except Exception as e:
+    print(f"Error starting background monitoring: {e}")
+    traceback.print_exc()
+
+logging.info("Background monitoring initiated from main.py")
+
+def view_scraped_data():
     try:
+        create_database()  # Ensure the database and table exist
+        conn = sqlite3.connect(DB_FILE)
+        df = pd.read_sql_query("SELECT * FROM changes ORDER BY date DESC, time DESC LIMIT 50", conn)
+        conn.close()
+        return df
     except Exception as e:
+        print(f"Error viewing scraped data: {e}")
+        traceback.print_exc()
+        return pd.DataFrame()

+def view_rss_feed():
+    try:
+        with open(XML_FILE, 'r') as file:
+            return file.read()
+    except FileNotFoundError:
+        return "RSS feed not generated yet."
+    except Exception as e:
+        print(f"Error viewing RSS feed: {e}")
+        traceback.print_exc()
+        return "Error viewing RSS feed"

+def generate_rss_feed():
+    try:
+        create_database()  # Ensure the database and table exist
+        fg = FeedGenerator()
+        fg.title('Culvers Site Changes')
+        fg.link(href='http://example.com', rel='alternate')
+        fg.description('Recent changes detected on Culvers websites')
+
+        conn = sqlite3.connect(DB_FILE)
+        c = conn.cursor()
+        c.execute("SELECT * FROM changes ORDER BY date DESC, time DESC LIMIT 20")
+        changes = c.fetchall()
+
+        for change in changes:
+            fe = fg.add_entry()
+            fe.id(str(change[0]))
+            fe.title(f'Change detected at {change[3]}')
+            fe.link(href=change[3])
+            fe.description(change[4])
+            fe.pubDate(datetime.datetime.strptime(f"{change[1]} {change[2]}", "%Y-%m-%d %H:%M:%S"))
+
+        conn.close()
+
+        fg.rss_file(XML_FILE)
+        return "RSS feed generated successfully."
+    except Exception as e:
+        print(f"Error generating RSS feed: {e}")
+        traceback.print_exc()
+        return "Error generating RSS feed"

+def create_viewer():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Culvers Site Monitor and Viewer")
+
+        with gr.Tab("Monitor Status"):
+            gr.Markdown("Continuous monitoring is active for the following URLs:")
+            for url in urls:
+                gr.Markdown(f"- {url}")
+            gr.Markdown(f"Monitoring interval: 1 minute")
+            gr.Markdown(f"Data is being stored in: {CSV_FILE}")

+        with gr.Tab("View Scraped Data"):
+            gr.DataFrame(view_scraped_data, label="Recent Changes")
+
+        with gr.Tab("View RSS Feed"):
+            gr.TextArea(view_rss_feed, label="RSS Feed Content")
+            gr.Button("Generate RSS Feed").click(generate_rss_feed, outputs=gr.TextArea(label="Generation Status"))
+
+    return demo

 if __name__ == "__main__":
+    try:
+        # Create the database and table before launching the viewer
+        create_database()
+        print("Database created")
+
+        # Create and launch the viewer
+        viewer = create_viewer()
+        print("Viewer created")
+        viewer.launch()
+        print("Viewer launched")
+
+        logging.info("Web-based viewer created and launched with continuous monitoring.")
+    except Exception as e:
+        print(f"Error in main execution: {e}")
+        traceback.print_exc()
+
+print("Main application file updated with error handling, console logging, and all necessary functions.")
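The new app.py depends on app/background_tasks.py, which this commit does not include. A minimal sketch of the interface the code above assumes, with create_database() building a changes table whose columns (id, date, time, url, change) are implied by the SELECT queries and the change[0] through change[4] indexing, and start_background_monitoring() running the checker off the main thread, might look like the following; every name and behavior here is inferred, not the real module:

# Hypothetical sketch of app/background_tasks.py, inferred from how app.py
# calls it. The real module may differ in every detail.
import sqlite3
import threading
import time

DB_FILE = '/home/user/app/scraped_data/culver/culvers_changes.db'

def create_database():
    # Column set inferred from "SELECT * FROM changes" plus the
    # change[0]..change[4] indexing in generate_rss_feed().
    conn = sqlite3.connect(DB_FILE)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS changes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            date TEXT,
            time TEXT,
            url TEXT,
            change TEXT
        )
    """)
    conn.commit()
    conn.close()

def _monitor_loop(csv_file, urls, interval_minutes, content_type):
    # Placeholder worker: the real module presumably fetches each URL,
    # hashes its content, and appends a CSV row / inserts a `changes` row
    # whenever the hash differs from the previous run.
    while True:
        for url in urls:
            pass  # fetch, compare, and record changes here
        time.sleep(interval_minutes * 60)

def start_background_monitoring(csv_file, urls, interval_minutes, content_type):
    # Run the checker in a daemon thread so viewer.launch() is not blocked
    # and the thread dies with the main process.
    thread = threading.Thread(
        target=_monitor_loop,
        args=(csv_file, urls, interval_minutes, content_type),
        daemon=True,
    )
    thread.start()

One caveat in app.py itself: feedgen normally rejects naive datetimes in fe.pubDate(), so the strptime() result in generate_rss_feed() will likely need timezone info attached (for example via .replace(tzinfo=datetime.timezone.utc)) before fg.rss_file() succeeds.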