Update app.py
app.py CHANGED
@@ -30,11 +30,6 @@ if not os.path.exists("history.json"):
     with open("history.json", "w") as f:
         json.dump({}, f)
 
-import os
-import base64
-import zipfile
-import streamlit as st
-
 def zip_subdirs(start_dir):
     for subdir, dirs, files in os.walk(start_dir):
         if subdir != start_dir:  # Skip the root directory
@@ -55,7 +50,6 @@ def get_zip_download_link(zip_file):
     link_name = os.path.basename(zip_file)
     href = f'<a href="data:file/zip;base64,{b64}" download="{link_name}">Download: {link_name}</a>'
     return href
-
 
 @st.cache_resource
 def create_zip_of_files(files):
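
Note: the data-URI anchors built in get_zip_download_link (and get_download_link further down) work by inlining the file bytes directly into the href, so no server endpoint is needed. A minimal sketch of the mechanism — data_uri_link and sample.zip are illustrative names, not part of app.py:

import base64
import os

def data_uri_link(path: str) -> str:
    # Inline the file bytes as base64 inside the anchor's href.
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    name = os.path.basename(path)
    return f'<a href="data:application/zip;base64,{b64}" download="{name}">Download: {name}</a>'

# Hypothetical usage inside the Streamlit app:
# st.sidebar.markdown(data_uri_link("sample.zip"), unsafe_allow_html=True)

Large files produce correspondingly large base64 markup, which is why the code elsewhere prefers rendering plain paths over encapsulating every file into a download link.
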
@@ -73,8 +67,6 @@ def get_zip_download_link(zip_file):
     href = f'<a href="data:application/zip;base64,{b64}" download="{zip_file}">Download All</a>'
     return href
 
-
-
 def download_file(url, local_filename):
     if url.startswith('http://') or url.startswith('https://'):
         try:
@@ -91,15 +83,12 @@ def download_html_and_files(url, subdir):
     html_content = requests.get(url).text
     soup = BeautifulSoup(html_content, 'html.parser')
     base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
-
     for link in soup.find_all('a'):
         file_url = urllib.parse.urljoin(base_url, link.get('href'))
         local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
-
         if not local_filename.endswith('/') and local_filename != subdir:
             link['href'] = local_filename
             download_file(file_url, local_filename)
-
     with open(os.path.join(subdir, "index.html"), "w") as file:
         file.write(str(soup))
 
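
Note: the loop above rewrites every anchor to a local path after resolving it against a scheme-and-host-only base URL. A small sketch of that URL arithmetic with a hypothetical page:

import urllib.parse

url = "https://example.com/docs/index.html"  # hypothetical page URL
base_url = urllib.parse.urlunparse(
    urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment='')
)
print(base_url)                                             # https://example.com
file_url = urllib.parse.urljoin(base_url, "data/file1.csv")
print(file_url)                                             # https://example.com/data/file1.csv
print(urllib.parse.urlparse(file_url).path.split('/')[-1])  # file1.csv -> local filename
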
@@ -110,34 +99,27 @@ def list_files(directory_path='.'):
 def file_editor(file_path):
     st.write(f"Editing File: {os.path.basename(file_path)}")
     file_content = ""
-
     with open(file_path, "r") as f:
         file_content = f.read()
-
     file_content = st.text_area("Edit the file content:", value=file_content, height=250)
-
     if st.button("💾 Save"):
         with open(file_path, "w") as f:
             f.write(file_content)
         st.success(f"File '{os.path.basename(file_path)}' saved!")
 
-
 def show_file_operations(file_path, sequence_number):
-    #st.write(f"File: {os.path.basename(file_path)}")
     unique_key = hashlib.md5(file_path.encode()).hexdigest()
     file_content = ""
-
     col01, col02, col1, col2, col3 = st.columns(5)
     with col01:
         st.write(os.path.basename(file_path))
     with col1:
         edit_key = f"edit_{unique_key}_{sequence_number}"
-
-
-
-
-
-
+        if st.button(f"✏️ Edit", key=edit_key):
+            with open(file_path, "r") as f:
+                file_content = f.read()
+            text_area_key = f"text_area_{unique_key}_{sequence_number}"
+            file_content = st.text_area("Edit the file content:", value=file_content, height=250, key=text_area_key)
     with col2:
         save_key = f"save_{unique_key}_{sequence_number}"
         if st.button(f"💾 Save", key=save_key):
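
Note: the added Edit branch is the functional change in this commit; it relies on Streamlit's keyed widgets so per-file buttons and text areas rendered in a loop do not collide. A minimal self-contained sketch of the same pattern — inline_editor and its call site are hypothetical:

import hashlib

import streamlit as st

def inline_editor(file_path: str, sequence_number: int) -> None:
    # One MD5-derived key per file keeps repeated widgets distinct,
    # mirroring the keying used in show_file_operations.
    unique_key = hashlib.md5(file_path.encode()).hexdigest()
    if st.button("✏️ Edit", key=f"edit_{unique_key}_{sequence_number}"):
        with open(file_path, "r") as f:
            content = f.read()
        st.text_area("Edit the file content:", value=content, height=250,
                     key=f"text_area_{unique_key}_{sequence_number}")

# Hypothetical call site, one editor per listed file:
# inline_editor("downloads/page.html", 1)
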
@@ -145,21 +127,17 @@ def show_file_operations(file_path, sequence_number):
             with open(file_path, "w") as f:
                 f.write(file_content)
             st.success(f"File saved!")
-
     with col3:
         delete_key = f"delete_{unique_key}_{sequence_number}"
         if st.button(f"🗑️ Delete", key=delete_key):
             os.remove(file_path)
             st.markdown(f"File deleted!")
 
-
 file_sequence_numbers = {}
 
-
 def show_file_content(file_path):
     _, file_extension = os.path.splitext(file_path)
     try:
-
         if file_extension in ['.png', '.jpg', '.jpeg']:
             image_url = file_path.replace('File:','').replace('/','')
             st.write('Image URL:' + image_url)
@@ -176,15 +154,9 @@ def show_file_content(file_path):
         elif file_extension in ['.html', '.txt']:
             with open(file_path, "r") as file:
                 st.markdown(file.read(), unsafe_allow_html=True)
-        #elif file_extension in ['.pdf']:
-            #pdf_file = open(file_path, "rb")
-            #base64_pdf = base64.b64encode(pdf_file.read()).decode('utf-8')
-            #pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'
-            #st.markdown(pdf_display, unsafe_allow_html=True)
     except Exception as e:
         st.error(f"Error reading file {file_path}: {e}")
 
-
 def show_download_links(subdir):
     global file_sequence_numbers
     for file in list_files(subdir):
@@ -194,7 +166,6 @@ def show_download_links(subdir):
         else:
             file_sequence_numbers[file_path] += 1
         sequence_number = file_sequence_numbers[file_path]
-
         if os.path.isfile(file_path):
             st.markdown(file_path) # Display file path
             show_file_content(file_path) # Display file content based on type
@@ -210,9 +181,7 @@ def show_download_links_backup(subdir):
         else:
             file_sequence_numbers[file_path] += 1
         sequence_number = file_sequence_numbers[file_path]
-
         if os.path.isfile(file_path):
-            #st.markdown(get_download_link(file_path), unsafe_allow_html=True)
             st.markdown(file_path, unsafe_allow_html=True) # faster than encapsulating file into base64 download link
             show_file_operations(file_path, sequence_number)
         else:
@@ -224,26 +193,14 @@ def get_download_link(file):
     b64 = base64.b64encode(bytes).decode()
     href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{os.path.basename(file)}\'>Download: {os.path.basename(file)}</a>'
     return href
-
-
-
+
 def main():
     st.sidebar.title('🌐 Web Datasets Bulk Downloader')
-
-
-
-    # Check for query parameters for file editing
-    #query_params = st.query_params()
     query_params = st.experimental_get_query_params()
-
     file_to_edit = query_params.get('file_to_edit', [None])[0]
-
     if file_to_edit and os.path.exists(file_to_edit):
         file_editor(file_to_edit)
     else:
-        # Selecting URL input method
-
-        # Selecting URL input method
         url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"], index=1)
         url = ""
         if url_input_method == "Enter URL":
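
Note: the query-parameter branch in main() deep-links straight into file_editor. A minimal sketch of that routing, assuming a Streamlit version that still provides st.experimental_get_query_params:

import os

import streamlit as st

# experimental_get_query_params() maps each parameter to a *list* of values,
# hence the .get('file_to_edit', [None])[0] pattern in the diff.
params = st.experimental_get_query_params()
file_to_edit = params.get('file_to_edit', [None])[0]
if file_to_edit and os.path.exists(file_to_edit):
    st.write(f"Would open the editor for: {file_to_edit}")
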
@@ -252,15 +209,6 @@ def main():
             selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()), index=0)
             url = URLS[selected_site]
 
-
-        #url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"])
-        #url = ""
-        #if url_input_method == "Enter URL":
-        #    url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
-        #else:
-        #    selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()))
-        #    url = URLS[selected_site]
-
         # Reading or creating history.json
         if not os.path.exists("history.json"):
             with open("history.json", "w") as f:
@@ -282,7 +230,6 @@ def main():
             with open("history.json", "w") as f:
                 json.dump(history, f)
 
-
         if st.sidebar.button('📥 Get All the Content', help="Download content from the selected URL"):
             download_html_and_files(url, history[url])
             show_download_links(history[url])
@@ -291,23 +238,11 @@ def main():
             for subdir in history.values():
                 show_download_links(subdir)
 
-
-        # Button for downloading content
-        #if st.sidebar.button('📥 Get All the Content'):
-        #    download_html_and_files(url, history[url])
-        #    show_download_links(history[url])
-
-        # Button for showing download links
-        #if st.sidebar.button('📄 Show Download Links'):
-        #    for subdir in history.values():
-        #        show_download_links(subdir)
 
         if st.sidebar.button("🗑 Delete All", help="Delete all downloaded content"):
-        #if st.sidebar.button("🗑 Delete All"):
             # Clear history file
             with open("history.json", "w") as f:
                 json.dump({}, f)
-
             # Delete all files in subdirectories
             for subdir in glob.glob('*'):
                 if os.path.isdir(subdir) and subdir not in EXCLUDED_FILES:
@@ -316,29 +251,18 @@ def main():
                         os.remove(file_path)
                         st.write(f"Deleted: {file_path}")
                     os.rmdir(subdir) # Remove the empty directory
-
             st.experimental_rerun()
-
         if st.sidebar.button("⬇️ Download All", help="Download all files in a zip"):
             start_directory = '.' # Current directory
             for zip_file in zip_subdirs(start_directory):
                 st.sidebar.markdown(zip_file, unsafe_allow_html=True)
                 st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
-        #if st.sidebar.button("⬇️ Download All"):
-        #    start_directory = '.' # Current directory
-        #    for zip_file in zip_subdirs(start_directory):
-        #        st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
-
-        # Expander for showing URL history and download links
         with st.expander("URL History and Downloaded Files"):
             try:
                 for url, subdir in history.items():
                     st.markdown(f"#### {url}")
-                    # show_download_links(subdir)
             except:
                 print('url history is empty')
-
-        #for subdir in history.values():
-        #    show_download_links(subdir)
+
 if __name__ == "__main__":
     main()
|