acecalisto3 committed
Commit f0acb7f · verified · 1 Parent(s): 465494c

Update app.py

Files changed (1)
  1. app.py +3 -153
app.py CHANGED
@@ -1,154 +1,5 @@
- import os
- import time
- import hashlib
- import logging
- import datetime
- import csv
- from urllib.parse import urlparse
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.common.keys import Keys
- from selenium.common.exceptions import NoSuchElementException
- import requests
- import feedparser
  import gradio as gr
+ from agent import start_scraping, display_csv, generate_rss_feed, chat_interface
-
- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- # Define constants
- DEFAULT_FILE_PATH = "scraped_data"
- PURPOSE = f"You go to Culvers sites, you continuously seek changes on them since your last observation. Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data."
- HISTORY = []
- CURRENT_TASK = None
-
- # Define the list of URLs to monitor (you can add more URLs here)
- URLS_TO_MONITOR = ["https://twitter.com/wlcscrdp", "https://www.facebook.com/aurorareddevils/", "https://www.facebook.com/brightpanthers/", "https://www.facebook.com/carrollcountychamberin/", "https://www.facebook.com/Culver.Cavs.MHS", "https://www.facebook.com/culver.elementary.school", "https://www.facebook.com/CulverCommunitySchools", "https://www.facebook.com/DillsboroBulldogs/", "https://www.facebook.com/ECMSTROJANS", "https://www.facebook.com/enjoywhitecountyIN/", "https://www.facebook.com/farmersvilleelementary", "https://www.facebook.com/groups/SDMSparents", "https://www.facebook.com/jghsart/", "https://www.facebook.com/jgmusicdept", "https://www.facebook.com/John-Glenn-Education-Foundation-208326199636364/", "https://www.facebook.com/John-Glenn-High-School-1102148953201006/", "https://www.facebook.com/John-Glenn-Theatre-Company-383638295064502/", "https://www.facebook.com/JohnGlennFalconsAthletics", "https://www.facebook.com/KIRPC-Head-Start-1485812354989001", "https://www.facebook.com/KIRPC1", "https://www.facebook.com/LHNEeagles", "https://www.facebook.com/LuceElementarySchool/", "https://www.facebook.com/marrselementary", "https://www.facebook.com/messhiners/", "https://www.facebook.com/monticellocitypool", "https://www.facebook.com/monticelloinwastewater/", "https://www.facebook.com/MooresHillBobcats/", "https://www.facebook.com/msdmv", "https://www.facebook.com/msdnorthposey", "https://www.facebook.com/MUTPL/", "https://www.facebook.com/MVJHS/", "https://www.facebook.com/mvshs", "https://www.facebook.com/njspjrsrhighschool?mibextid=b06tZ0", "https://www.facebook.com/NorthElementaryStars/", "https://www.facebook.com/NorthLibertyElementary/", "https://www.facebook.com/northposey/", "https://www.facebook.com/northposeyhs/", "https://www.facebook.com/NPJuniorHigh", "https://www.facebook.com/Prairie-Heights-Elementary-659322230934707/", "https://www.facebook.com/Prairie-Heights-High-School-2027713067459043/", "https://www.facebook.com/PrairieHeightsPanthers/", "https://www.facebook.com/profile.php?id=100057030237096", "https://www.facebook.com/profile.php?id=100057451179651", "https://www.facebook.com/profile.php?id=100063463513451", "https://www.facebook.com/profile.php?id=100063612319256", "https://www.facebook.com/profile.php?id=100064532596422", "https://www.facebook.com/profile.php?id=100067180226810", "https://www.facebook.com/profile.php?id=61563484312348", "https://www.facebook.com/PTOSWES/", "https://www.facebook.com/RandolphSouthern/", "https://www.facebook.com/RochesterMiddleSchool", "https://www.facebook.com/RochesterZebraNewTechHigh", "https://www.facebook.com/rockportelementarysouthspencer/", "https://www.facebook.com/satellitesathletics/", "https://www.facebook.com/seymourcommunityschools/", "https://www.facebook.com/SeymourHighSchool/", "https://www.facebook.com/SouthDearbornHighSchool/", "https://www.facebook.com/southarbornschools/", "https://www.facebook.com/SouthDearbornSquires/", "https://www.facebook.com/southspencerhighschool", "https://www.facebook.com/southspencermiddleschool/", "https://www.facebook.com/SouthSpencerSchools", "https://www.facebook.com/SouthTerracePanthers/", "https://www.facebook.com/sunmantigers/", "https://www.facebook.com/SWShelbySpartan/", "https://www.facebook.com/TallTimbersMarina", "https://www.facebook.com/WabashValleyESC/", "https://www.facebook.com/Walkerton-Elementary-School-283088605088622/", "https://www.facebook.com/westcentralcte/", "https://www.facebook.com/westelementary", "https://www.facebook.com/wlcscrdp", "https://www.instagram.com/mutpl/", "https://www.instagram.com/northposeyhsathletics", "https://www.instagram.com/rchsprincipalcook/", "https://www.instagram.com/southdearbornhighschool/", "https://www.instagram.com/southdearbornschools/", "https://www.instagram.com/westcentralcte/", "https://www.tiktok.com/@mutplteen"]
-
- # Function to monitor URLs for changes
- def monitor_urls(storage_location, urls, scrape_interval, content_type):
-     global HISTORY
-     previous_hashes = {url: "" for url in urls}  # Use a dictionary for better organization
-
-     try:
-         with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
-             while True:
-                 for url in urls:
-                     try:
-                         driver.get(url)
-                         time.sleep(2)  # Wait for the page to load
-                         if content_type == "text":
-                             current_content = driver.page_source
-                         elif content_type == "media":
-                             current_content = driver.find_elements(By.TAG_NAME, "img")
-                         else:
-                             current_content = driver.page_source
-                         current_hash = hashlib.md5(str(current_content).encode('utf-8')).hexdigest()
-                         if current_hash != previous_hashes[url]:
-                             previous_hashes[url] = current_hash
-                             date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                             HISTORY.append(f"Change detected at {url} on {date_time_str}")
-                             with open(os.path.join(storage_location, f"{urlparse(url).hostname}_changes.csv"), "a", newline="") as csvfile:
-                                 csv_writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
-                                 csv_writer.writerow({"date": date_time_str.split()[0], "time": date_time_str.split()[1], "url": url, "change": "Content changed"})
-                             logging.info(f"Change detected at {url} on {date_time_str}")
-                     except (NoSuchElementException, Exception) as e:
-                         logging.error(f"Error accessing {url}: {e}")
-                 time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
-     except Exception as e:
-         logging.error(f"Error starting ChromeDriver: {e}")
-
- # Function to start scraping
- def start_scraping(storage_location, urls, scrape_interval, content_type):
-     global CURRENT_TASK, HISTORY
-
-     CURRENT_TASK = f"Monitoring URLs: {', '.join(urls)}"
-     HISTORY.append(f"Task started: {CURRENT_TASK}")
-
-     for url in urls:
-         # Create a folder for the URL
-         hostname = urlparse(url).hostname
-         folder_path = os.path.join(storage_location, hostname)
-         os.makedirs(folder_path, exist_ok=True)
-
-         # Log the initial observation
-         try:
-             with webdriver.Chrome(service=Service(webdriver.ChromeDriverManager().install()), options=Options()) as driver:
-                 driver.get(url)
-                 time.sleep(2)  # Wait for the page to load
-                 if content_type == "text":
-                     initial_content = driver.page_source
-                 elif content_type == "media":
-                     initial_content = driver.find_elements(By.TAG_NAME, "img")
-                 else:
-                     initial_content = driver.page_source
-                 initial_hash = hashlib.md5(str(initial_content).encode('utf-8')).hexdigest()
-                 HISTORY.append(f"Initial observation at {url}: {initial_hash}")
-                 with open(os.path.join(folder_path, f"{hostname}_initial_observation.txt"), "w") as file:
-                     file.write(f"Initial observation at {url}: {initial_hash}")
-         except (NoSuchElementException, Exception) as e:
-             HISTORY.append(f"Error accessing {url}: {e}")
-
-     # Monitor the URLs
-     monitor_urls(storage_location, urls, scrape_interval, content_type)
-
-     return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
-
- # Function to display CSV content
- def display_csv(url):
-     hostname = urlparse(url).hostname
-     folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
-     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
-     if os.path.exists(csv_path):
-         with open(csv_path, "r") as file:
-             return file.read()
-     else:
-         return "No data available."
-
- # Define the chat response function using the Mistral model
- def respond(message, history, system_message, max_tokens, temperature, top_p):
-     API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
-     headers = {"Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}"}
-     payload = {
-         "inputs": f"User: {message}\nHistory: {history}\nSystem: {system_message}",
-         "parameters": {"max_length": max_tokens, "temperature": temperature, "top_p": top_p},
-     }
-     response = requests.post(API_URL, headers=headers, json=payload)
-     return response.json()[0]["generated_text"]
-
- # Function to generate RSS feed for a given URL
- def generate_rss_feed(url):
-     hostname = urlparse(url).hostname
-     folder_path = os.path.join(DEFAULT_FILE_PATH, hostname)
-     csv_path = os.path.join(folder_path, f"{hostname}_changes.csv")
-     if os.path.exists(csv_path):
-         with open(csv_path, "r") as file:
-             reader = csv.DictReader(file)
-             feed = feedparser.parse(f"rss.xml")  # Create a new feed object
-             feed.feed.title = f"Changes for {hostname}"
-             feed.feed.link = url
-             feed.feed.description = "Recent changes detected on the website."
-             feed.entries = []
-             for row in reader:
-                 feed.entries.append({
-                     "title": f"Change detected at {row['url']}",
-                     "link": row['url'],
-                     "description": f"Content changed on {row['date']} at {row['time']}",
-                     "published": datetime.datetime.strptime(f"{row['date']} {row['time']}", "%Y-%m-%d %H:%M:%S").isoformat(),
-                 })
-             return feed.entries
-     else:
-         return "No data available."
-
- # Function to handle user input and generate response
- def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type):
-     response = respond(message, history, system_message, max_tokens, temperature, top_p)
-     history.append((message, response))
-     return history, response

  # Create Gradio interface
  def create_interface():
@@ -184,7 +35,6 @@ def create_interface():

      return demo

-
  if __name__ == "__main__":
-     interface = gr.Interface(fn=create_interface, title="Web Scraper and Chatbot")
-     interface.launch()
+     demo = create_interface()
+     demo.launch()
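
After this commit, app.py only wires the Gradio UI to an agent module that supplies the four imported callables. That module is not part of this diff, so the skeleton below is a hypothetical sketch: the names and signatures are taken from the code removed above, and the bodies are placeholders.

# agent.py -- hypothetical skeleton, inferred from the code removed from app.py;
# not part of this commit.

def start_scraping(storage_location, urls, scrape_interval, content_type):
    """Create a folder per host, record an initial content hash, then poll for changes."""
    ...

def display_csv(url):
    """Return the contents of the host's <hostname>_changes.csv, if present."""
    ...

def generate_rss_feed(url):
    """Build feed entries from the host's change-log CSV."""
    ...

def chat_interface(message, history, system_message, max_tokens, temperature, top_p, storage_location, urls, scrape_interval, content_type):
    """Send the message to the model and append the reply to history."""
    ...

If the signatures did carry over unchanged, the new app.py keeps working without any call-site edits, since create_interface() can bind these functions to its widgets exactly as before.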