# WebDatasets / app.py
# (Header below is Hugging Face Spaces page chrome captured during export,
#  kept as a comment so the file remains valid Python:
#  awacke1 — "Update app.py", commit 2739000, raw/history/blame, 8.15 kB)
import streamlit as st
import requests
import os
import urllib
import base64
from bs4 import BeautifulSoup
import hashlib
import json
import mimetypes
# Infrastructure files that must never appear in the downloadable listing.
EXCLUDED_FILES = [
    'app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt',
    'README.md', '.gitattributes', "backup.py", "Dockerfile",
]

# Bootstrap an empty URL history on first launch so later json.load calls
# in main() always find a readable file.
if not os.path.exists("history.json"):
    with open("history.json", "w") as history_handle:
        json.dump({}, history_handle)
def download_file(url, local_filename):
    """Stream the resource at *url* into *local_filename*.

    Only http(s) URLs are handled; any other scheme is silently ignored.

    Args:
        url: Remote URL to fetch.
        local_filename: Destination path on disk.

    Returns:
        The local path on success, or None when the scheme is unsupported
        or the request fails.
    """
    if not (url.startswith('http://') or url.startswith('https://')):
        # Non-HTTP link (mailto:, javascript:, ...) — nothing to download.
        return None
    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                # 8 KiB chunks keep memory flat for large files.
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return local_filename
    except requests.exceptions.RequestException as err:
        # Fix: the original caught only HTTPError, so connection resets and
        # timeouts crashed the whole bulk-download loop. RequestException is
        # the base of all requests failures; report and keep going.
        print(f"HTTP error occurred: {err}")
        return None
def download_html_and_files(url, subdir):
    """Fetch *url*, download every linked file into *subdir*, and write a
    rewritten index.html whose anchors point at the local copies.

    Args:
        url: Page URL whose <a href> targets should be mirrored.
        subdir: Existing local directory that receives the files.
    """
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, 'html.parser')
    # Scheme + host only, so relative hrefs resolve against the site root.
    base_url = urllib.parse.urlunparse(
        urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
    for link in soup.find_all('a'):
        href = link.get('href')
        # Fix: anchors with no href made urljoin raise TypeError, and
        # fragment-only hrefs ('#top') yielded empty filenames — skip both.
        if not href or href.startswith('#'):
            continue
        file_url = urllib.parse.urljoin(base_url, href)
        local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
        # Skip if the local filename is a directory
        if not local_filename.endswith('/') and local_filename != subdir:
            link['href'] = local_filename
            download_file(file_url, local_filename)
    # Save the modified HTML; explicit UTF-8 avoids UnicodeEncodeError on
    # platforms whose default encoding cannot represent the page's text.
    with open(os.path.join(subdir, "index.html"), "w", encoding='utf-8') as file:
        file.write(str(soup))
def list_files(directory_path='.'):
    """Return regular files directly inside *directory_path*, minus the
    infrastructure files in EXCLUDED_FILES, in os.listdir order."""
    visible = []
    for entry in os.listdir(directory_path):
        if entry in EXCLUDED_FILES:
            continue
        if os.path.isfile(os.path.join(directory_path, entry)):
            visible.append(entry)
    return visible
def get_download_link(file):
    """Build an HTML anchor that downloads *file* via a base64 data URI."""
    with open(file, "rb") as handle:
        payload = handle.read()
    encoded = base64.b64encode(payload).decode()
    name = os.path.basename(file)
    href = f'<a href="data:file/octet-stream;base64,{encoded}" download=\'{name}\'>Click to download {name}</a>'
    return href
def is_binary(file_path):
    """Heuristically determine whether *file_path* holds binary data.

    Reads the first 1 KiB and reports binary if any byte falls outside the
    printable/whitespace range (the classic `file(1)` heuristic).

    Args:
        file_path: Path to inspect.

    Returns:
        True when the sample contains non-text bytes or the file cannot
        be read; False otherwise.
    """
    # Bytes considered "text": BEL/BS/TAB/LF/FF/CR/ESC plus 0x20-0xFF,
    # excluding DEL (0x7f).
    textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})
    try:
        with open(file_path, 'rb') as f:
            head = f.read(1024)
    except OSError:
        # Fix: the original bare `except:` also swallowed KeyboardInterrupt
        # and SystemExit. Only I/O failures should mean "treat as binary"
        # so the caller never tries to text-edit an unreadable file.
        return True
    # Any byte surviving the translate() strip is outside the text range.
    return bool(head.translate(None, textchars))
def is_text_file(file_path):
    """Return True when *file_path* carries one of the known text extensions."""
    # str.endswith accepts a tuple, checking all candidates in one call.
    return file_path.endswith(
        ('.txt', '.py', '.md', '.html', '.css', '.js', '.json', '.xml'))
def file_column(file_path):
    """Render one file as a three-column row: editor/info, download, delete.

    Files with a known text extension get an inline editor plus a save
    button; anything else shows a read-only notice. Widget keys embed the
    file path so rows for different files never collide.
    """
    col1, col2, col3 = st.columns([3, 1, 1])
    # Column 1: File content or info
    with col1:
        if is_text_file(file_path):
            file_content = ''
            # Prefer the unsaved buffer cached in session state for this
            # exact path; otherwise read the file fresh from disk.
            if 'edit_content' in st.session_state and st.session_state['edit_content'][0] == file_path:
                file_content = st.session_state['edit_content'][1]
            else:
                with open(file_path, "r", encoding='utf-8') as f:
                    file_content = f.read()
            edited_content = st.text_area(f"Edit {os.path.basename(file_path)}:", value=file_content, height=250, key=f"textarea_{file_path}")
            if st.button("💾 Save", key=f"save_{file_path}"):
                with open(file_path, "w", encoding='utf-8') as f:
                    f.write(edited_content)
                st.success(f"File {os.path.basename(file_path)} saved!")
                # Remember what was saved so the next rerun repopulates
                # the editor with the same content.
                st.session_state['edit_content'] = (file_path, edited_content)
        else:
            st.info(f"This is a binary file ({os.path.basename(file_path)}) and cannot be edited.")
    # Column 2: Download link
    with col2:
        st.markdown("Download")
        st.markdown(get_download_link(file_path), unsafe_allow_html=True)
    # Column 3: Delete button
    with col3:
        if st.button(f"🗑️ Delete", key=f"delete_{file_path}"):
            os.remove(file_path)
            st.success(f"File {os.path.basename(file_path)} deleted!")
            # Update the listing by removing the deleted file
            # NOTE(review): st.experimental_rerun was removed in recent
            # Streamlit releases in favor of st.rerun — confirm the pinned
            # Streamlit version supports it.
            st.experimental_rerun()
def show_download_links(subdir):
    """Render an editor/download/delete row for every listable file in *subdir*."""
    st.write(f'Files in {subdir}:')
    candidate_paths = (os.path.join(subdir, name) for name in list_files(subdir))
    for path in candidate_paths:
        if os.path.isfile(path):
            file_column(path)
def generate_hash_key(path, counter):
    """Generate a unique hash key for a given file path and counter."""
    key_material = "{}_{}".format(path, counter).encode()
    digest = hashlib.md5(key_material)
    return digest.hexdigest()
def show_file_operations(file_path):
    """Render edit / save / delete controls for *file_path*.

    Widget keys are derived from an MD5 of the path plus a per-path render
    counter kept in session state, so repeated renders of the same file do
    not produce duplicate Streamlit widget keys.
    """
    # Increment counter for each file path
    counter_key = f"counter_{file_path}"
    if counter_key in st.session_state:
        st.session_state[counter_key] += 1
    else:
        st.session_state[counter_key] = 1
    # Unique hash keys for each file and operation based on their path and counter
    # NOTE(review): because the counter changes every rerun, these keys
    # change too — state saved under a previous render's keys is not found
    # on the next rerun. Verify this is intended.
    counter = st.session_state[counter_key]
    edit_button_key = f"edit_button_{generate_hash_key(file_path, counter)}"
    save_button_key = f"save_button_{generate_hash_key(file_path, counter)}"
    delete_button_key = f"delete_button_{generate_hash_key(file_path, counter)}"
    content_key = f"content_{generate_hash_key(file_path, counter)}"
    # Start Edit operation
    if st.button(f"✏️ Edit {os.path.basename(file_path)}", key=edit_button_key):
        # Load the file into session state only once per key, then flip
        # into edit mode.
        if edit_button_key not in st.session_state:
            with open(file_path, "r") as f:
                st.session_state[content_key] = f.read()
        st.session_state[edit_button_key] = True
    # Display text area for editing if in edit mode
    if st.session_state.get(edit_button_key, False):
        # NOTE(review): content_key is both manually written to
        # session_state above and passed as the text_area widget key —
        # newer Streamlit versions reject setting a widget-owned key;
        # confirm against the pinned version.
        edited_content = st.text_area("Edit the file content:", value=st.session_state.get(content_key, ""), height=250, key=content_key)
        # Save button
        if st.button(f"💾 Save {os.path.basename(file_path)}", key=save_button_key):
            with open(file_path, "w") as f:
                f.write(edited_content)
            new_file_size = os.path.getsize(file_path)
            download_link = get_download_link(file_path)
            st.markdown(f"✅ File **{os.path.basename(file_path)}** saved! ([{download_link}]) - New size: {new_file_size} bytes", unsafe_allow_html=True)
            st.session_state[edit_button_key] = False # Exit edit mode
    # Delete button
    if st.button(f"🗑️ Delete {os.path.basename(file_path)}", key=delete_button_key):
        os.remove(file_path)
        st.markdown(f"🎉 File {os.path.basename(file_path)} deleted!")
        # Remove state variables related to the deleted file
        st.session_state.pop(edit_button_key, None)
        st.session_state.pop(content_key, None)
def main():
    """Streamlit entry point.

    Sidebar takes a URL; its MD5 names a per-URL download directory, and
    the URL→directory mapping is persisted in history.json. Buttons
    trigger the bulk download and re-list previously downloaded sets.
    """
    st.sidebar.title('Web Datasets Bulk Downloader')
    url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
    # Load history
    # history.json is guaranteed to exist: it is created at module import.
    with open("history.json", "r") as f:
        history = json.load(f)
    # Save the history of URL entered as a json file
    if url:
        # One directory per URL, keyed by the URL's MD5 so the name is
        # filesystem-safe and stable across runs.
        subdir = hashlib.md5(url.encode()).hexdigest()
        if not os.path.exists(subdir):
            os.makedirs(subdir)
        if url not in history:
            history[url] = subdir
            with open("history.json", "w") as f:
                json.dump(history, f)
    if st.sidebar.button('📥 Get All the Content'):
        # history[url] assumes a URL was entered; clicking with an empty
        # input raises KeyError — acceptable here since the button is
        # meaningless without a URL.
        download_html_and_files(url, history[url])
        show_download_links(history[url])
    if st.sidebar.button('📂 Show Download Links'):
        for subdir in history.values():
            show_download_links(subdir)
    # Display history as markdown
    with st.expander("URL History and Downloaded Files"):
        for url, subdir in history.items():
            st.markdown(f"#### {url}")
            show_download_links(subdir)

if __name__ == "__main__":
    main()