# Hugging Face Spaces status banner (page residue): Runtime error
import csv
import datetime
import hashlib
import logging
import os
import random
import threading
import time

import gradio as gr
import yaml
from huggingface_hub import InferenceClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Configure logging: timestamped INFO-level records on the root logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Prompt templates; filled in with str.format by their users.
PREFIX = "Task started at {date_time_str}. Purpose: {purpose}"
TASK_PROMPT = "Current task: {task}. History:\n{history}"

# Timestamp captured once, when this module is first executed.
date_time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Free-text description of what the monitoring agent is for.
purpose = """
You go to Culvers sites, you continuously seek changes on them since your last observation.
Anything new that gets logged and dumped into csv, stored in your log folder at user/app/scraped_data.
"""

# Shared, mutable log of everything that has happened in this process.
history = []

# Human-readable description of the task in progress (None until one starts).
current_task = None

# Default CSV destination; its parent directory is created eagerly so later
# writes to the default location do not fail on a missing folder.
default_file_path = "user/app/scraped_data/culver/culvers_changes.csv"
_default_dir = os.path.dirname(default_file_path)
os.makedirs(_default_dir, exist_ok=True)
# Function to monitor URLs for changes
def _content_fingerprint(driver, content_type):
    """Return an MD5 hex digest summarizing the page currently loaded in *driver*.

    For ``content_type == "media"`` the digest covers the ``src`` attribute of
    every ``<img>`` element; for any other value it covers the page source.
    """
    if content_type == "media":
        # Hash the image URLs rather than str() of the WebElement objects:
        # their repr embeds session-specific element ids, not page content.
        images = driver.find_elements(By.TAG_NAME, "img")
        content = "\n".join(img.get_attribute("src") or "" for img in images)
    else:
        content = driver.page_source
    # MD5 is used purely as a cheap change detector, not for security.
    return hashlib.md5(str(content).encode('utf-8')).hexdigest()


def monitor_urls(storage_location, urls, scrape_interval, content_type):
    """Poll *urls* forever and record every detected content change.

    Args:
        storage_location: Path of the CSV file to (re)create; it is opened in
            write mode, so any existing file is truncated.
        urls: Sequence of page URLs to visit on every cycle.
        scrape_interval: Minutes to sleep between full passes over *urls*.
        content_type: "media" compares image sources; any other value
            ("text", "both", ...) compares the full page source.

    The function only returns if the browser session fails; that failure is
    logged rather than raised. Errors for a single URL are logged and the loop
    moves on to the next URL. The first visit to each URL is always reported
    as a change because no previous fingerprint exists yet.
    """
    previous_hashes = [""] * len(urls)

    # os.makedirs("") raises, so only create a directory when the path has one.
    target_dir = os.path.dirname(storage_location)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(storage_location, "w", newline='', encoding="utf-8") as csvfile:
        csv_toolkit = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
        csv_toolkit.writeheader()
        # The file stays open for the lifetime of the loop; flush so that
        # other readers of the CSV see the data immediately.
        csvfile.flush()

        options = Options()
        # The ``options.headless`` setter is deprecated/removed in Selenium 4;
        # the command-line switch works across versions.
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        try:
            with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
                while True:
                    for i, url in enumerate(urls):
                        try:
                            driver.get(url)
                            time.sleep(2)  # Wait for the page to load
                            current_hash = _content_fingerprint(driver, content_type)
                            if current_hash == previous_hashes[i]:
                                continue
                            previous_hashes[i] = current_hash
                            now = datetime.datetime.now()
                            date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
                            history.append(f"Change detected at {url} on {date_time_str}")
                            csv_toolkit.writerow({
                                "date": now.strftime("%Y-%m-%d"),
                                "time": now.strftime("%H:%M:%S"),
                                "url": url,
                                "change": "Content changed",
                            })
                            csvfile.flush()
                            logging.info("Change detected at %s on %s", url, date_time_str)
                        except Exception as e:
                            # Best effort: one bad URL must not stop the others.
                            logging.error("Error accessing %s: %s", url, e)
                    time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
        except Exception as e:
            logging.error("Error starting ChromeDriver: %s", e)
# Entry point for a monitoring request coming from the user
def handle_input(storage_location, urls, scrape_interval, content_type):
    """Record the new monitoring task, run the monitor, and return a summary.

    Updates the module-level ``current_task`` and appends a start entry to the
    module-level ``history`` before delegating to ``monitor_urls``. The return
    value is ``TASK_PROMPT`` filled with the task description and the
    newline-joined history.
    """
    global current_task, history
    joined_urls = ', '.join(urls)
    current_task = f"Monitoring URLs: {joined_urls}"
    history.append(f"Task started: {current_task}")

    monitor_urls(storage_location, urls, scrape_interval, content_type)

    history_text = "\n".join(str(entry) for entry in history)
    return TASK_PROMPT.format(task=current_task, history=history_text)
# Load custom prompts
# Every known agent starts with an empty prompt; values found in
# custom_prompts.yaml override these defaults.
custom_prompts = {
    "WEB_DEV": "",
    "AI_SYSTEM_PROMPT": "",
    "PYTHON_CODE_DEV": "",
    "CODE_GENERATION": "",
    "CODE_INTERPRETATION": "",
    "CODE_TRANSLATION": "",
    "CODE_IMPLEMENTATION": "",
}
try:
    with open('custom_prompts.yaml', 'r', encoding="utf-8") as fp:
        _loaded_prompts = yaml.safe_load(fp)
except FileNotFoundError:
    _loaded_prompts = None
# yaml.safe_load returns None for an empty file and may return a non-mapping
# for malformed content; only merge when a real mapping was loaded so that
# later lookups by agent name cannot fail.
if isinstance(_loaded_prompts, dict):
    custom_prompts.update(_loaded_prompts)

# Define the Mistral inference client
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
VERBOSE = True
MAX_HISTORY = 125
def format_prompt(message, history):
    """Build an instruction-format prompt string from past turns and *message*.

    Only history entries that are 2-tuples ``(user_prompt, bot_response)`` are
    included; every other entry is skipped. The new *message* is appended as
    the final, unanswered instruction.
    """
    pieces = ["<s>"]
    for turn in history:
        if not (isinstance(turn, tuple) and len(turn) == 2):
            continue
        asked, answered = turn
        pieces.append(f"[INST] {asked} [/INST]")
        pieces.append(f" {answered}</s> ")
    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)
# Names of the selectable agents; the first entry is the default used by
# generate().
agents = [
    "WEB_DEV",
    "AI_SYSTEM_PROMPT",
    "PYTHON_CODE_DEV",
    "CODE_GENERATION",
    "CODE_INTERPRETATION",
    "CODE_TRANSLATION",
    "CODE_IMPLEMENTATION",
]
def generate(
    prompt, history, agent_name=agents[0], sys_prompt="", temperature=0.9,
    max_new_tokens=256, top_p=0.95, repetition_penalty=1.7,
):
    """Generate a completion for *prompt* using the module-level inference client.

    Args:
        prompt: The user's message.
        history: Past turns; passed to ``format_prompt`` unchanged.
        agent_name: Key into ``custom_prompts`` used when *sys_prompt* is empty.
            Unknown names fall back to an empty system prompt.
        sys_prompt: Explicit system prompt; when empty or None the agent's
            prompt is used instead.
        temperature: Sampling temperature, clamped to at least 0.01.
        max_new_tokens: Maximum number of tokens to generate.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        Whatever ``client.text_generation`` returns for a non-streaming call
        with ``return_full_text=False``.
    """
    seed = random.randint(1, 1111111111111111)

    # The prompt mapping may be missing keys (or be None if it was loaded from
    # an empty YAML file); never let that crash a chat request.
    agent_prompt = (custom_prompts or {}).get(agent_name, "")
    system_prompt = sys_prompt if sys_prompt else agent_prompt

    # UI sliders may deliver ints or floats; coerce to the types the API expects.
    generate_kwargs = dict(
        temperature=max(float(temperature), 1e-2),
        max_new_tokens=int(max_new_tokens),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=True,
        seed=seed,
    )

    formatted_prompt = format_prompt(f"{system_prompt}\n\n{prompt}", history)
    output = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, return_full_text=False)
    return output
# Chat response function: adapts the UI's parameter names to generate()
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Return the model's reply to *message*.

    Forwards the arguments to ``generate``, mapping ``system_message`` to
    ``sys_prompt`` and ``max_tokens`` to ``max_new_tokens``; the agent name
    and repetition penalty keep ``generate``'s defaults.
    """
    return generate(
        message,
        history,
        sys_prompt=system_message,
        temperature=temperature,
        max_new_tokens=max_tokens,
        top_p=top_p,
    )
# Function to start scraping
def start_scraping(storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
    """Start monitoring the non-empty URLs in the background and report status.

    The monitoring loop started via ``handle_input`` never returns on its own,
    so it is launched on a daemon thread; calling it inline would block this
    handler forever and the status message below would never be delivered.

    Returns:
        A status string describing what was started, or a notice when no URL
        was supplied.
    """
    candidates = [url1, url2, url3, url4, url5, url6, url7, url8, url9, url10]
    urls = [url.strip() for url in candidates if url and url.strip()]
    if not urls:
        return "No URLs provided; nothing to scrape."

    # NOTE: every call starts an additional monitor; callers that point two
    # monitors at the same storage_location will have them overwrite each other.
    worker = threading.Thread(
        target=handle_input,
        args=(storage_location, urls, scrape_interval, content_type),
        daemon=True,
    )
    worker.start()
    return f"Started scraping {', '.join(urls)} every {scrape_interval} minutes."
# Function to display CSV content
def display_csv(storage_location):
    """Return the raw text of the file at *storage_location*.

    When nothing exists at that path the placeholder string
    "No data available." is returned instead.
    """
    if not os.path.exists(storage_location):
        return "No data available."
    with open(storage_location, "r") as csv_handle:
        contents = csv_handle.read()
    return contents
# Chat handler used by the Gradio interface
def chat_interface(message, system_message, max_tokens, temperature, top_p, storage_location, url1, url2, url3, url4, url5, url6, url7, url8, url9, url10, scrape_interval, content_type):
    """Answer *message* and return the chat transcript plus an empty string.

    The storage/URL/interval/content-type parameters are accepted because the
    UI passes them, but they are not used here.

    Returns:
        A 2-tuple ``(chat_pairs, "")`` where ``chat_pairs`` lists the
        ``(user_message, bot_response)`` tuples recorded so far.
    """
    global history
    response = respond(message, history, system_message, max_tokens, temperature, top_p)
    history.append((message, response))
    # The shared history also holds plain-string monitor log entries; the
    # chat display only understands (user, bot) pairs, so filter those out.
    chat_pairs = [entry for entry in history if isinstance(entry, tuple) and len(entry) == 2]
    return chat_pairs, ""
# Create Gradio interface: controls in the left column, chat in the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Chat generation controls
            message = gr.Textbox(label="Message")
            system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
            max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")

            # Scraping controls
            storage_location = gr.Textbox(value=default_file_path, label="Storage Location")
            url1 = gr.Textbox(value="https://www.culver.k12.in.us/", label="URL 1")
            url2 = gr.Textbox(value="https://www.facebook.com/CulverCommunitySchools", label="URL 2")
            url3 = gr.Textbox(label="URL 3")
            url4 = gr.Textbox(label="URL 4")
            url5 = gr.Textbox(label="URL 5")
            url6 = gr.Textbox(label="URL 6")
            url7 = gr.Textbox(label="URL 7")
            url8 = gr.Textbox(label="URL 8")
            url9 = gr.Textbox(label="URL 9")
            url10 = gr.Textbox(label="URL 10")
            scrape_interval = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Scrape Interval (minutes)")
            content_type = gr.Radio(choices=["text", "media", "both"], value="text", label="Content Type")
            start_button = gr.Button("Start Scraping")
            csv_output = gr.Textbox(label="CSV Output", interactive=False)
        with gr.Column():
            chat_history = gr.Chatbot(label="Chat History")
            response_box = gr.Textbox(label="Response")

    # Both handlers take the scraping controls in the same order, so the
    # component list is built once and shared.
    scrape_inputs = [
        storage_location,
        url1, url2, url3, url4, url5, url6, url7, url8, url9, url10,
        scrape_interval,
        content_type,
    ]
    chat_inputs = [message, system_message, max_tokens, temperature, top_p] + scrape_inputs

    start_button.click(start_scraping, inputs=scrape_inputs, outputs=csv_output)
    message.submit(chat_interface, inputs=chat_inputs, outputs=[chat_history, response_box])

if __name__ == "__main__":
    demo.launch()