awacke1 committed
Commit 654e3a2 · 1 Parent(s): 5a7ef97

Update app.py

Files changed (1): app.py +36 -21
app.py CHANGED
@@ -4,12 +4,15 @@ import os
 import urllib
 import base64
 from bs4 import BeautifulSoup
+import hashlib
+import json
 
 EXCLUDED_FILES = ['app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt', 'README.md','.gitattributes', "backup.py","Dockerfile"]
 
-# Create a history.txt file if it doesn't exist yet
-with open("history.txt", "a+") as f:
-    f.close()
+# Create a history.json file if it doesn't exist yet
+if not os.path.exists("history.json"):
+    with open("history.json", "w") as f:
+        json.dump({}, f)
 
 def download_file(url, local_filename):
     if url.startswith('http://') or url.startswith('https://'):
@@ -23,17 +26,17 @@ def download_file(url, local_filename):
     except requests.exceptions.HTTPError as err:
         print(f"HTTP error occurred: {err}")
 
-def download_html_and_files(url):
+def download_html_and_files(url, subdir):
     html_content = requests.get(url).text
     soup = BeautifulSoup(html_content, 'html.parser')
     base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
     for link in soup.find_all('a'):
         file_url = urllib.parse.urljoin(base_url, link.get('href'))
-        local_filename = urllib.parse.urlparse(file_url).path.split('/')[-1]
+        local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
         if local_filename:
             link['href'] = local_filename
             download_file(file_url, local_filename)
-    with open("index.html", "w") as file:
+    with open(os.path.join(subdir, "index.html"), "w") as file:
         file.write(str(soup))
 
 def list_files(directory_path='.'):
@@ -44,33 +47,45 @@ def get_download_link(file):
     with open(file, "rb") as f:
         bytes = f.read()
     b64 = base64.b64encode(bytes).decode()
-    href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{file}\'>Click to download {file}</a>'
+    href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{file.split("/")[-1]}\'>Click to download {file.split("/")[-1]}</a>'
     return href
 
-def show_download_links():
-    st.sidebar.write('Here are the files you can download:')
-    for file in list_files():
-        st.sidebar.markdown(get_download_link(file), unsafe_allow_html=True)
+def show_download_links(subdir):
+    st.write(f'Files for {subdir}:')
+    for file in list_files(subdir):
+        st.markdown(get_download_link(file), unsafe_allow_html=True)
 
 def main():
     st.sidebar.title('Web Datasets Bulk Downloader')
     url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
 
-    # Save the history of URL entered as a text file
+    # Load history
+    with open("history.json", "r") as f:
+        history = json.load(f)
+
+    # Save the history of URL entered as a json file
     if url:
-        with open("history.txt", "a") as f:
-            f.write(url + "\n")
+        subdir = hashlib.md5(url.encode()).hexdigest()
+        if not os.path.exists(subdir):
+            os.makedirs(subdir)
+        if url not in history:
+            history[url] = subdir
+            with open("history.json", "w") as f:
+                json.dump(history, f)
 
     if st.sidebar.button('📥 Get All the Content'):
-        download_html_and_files(url)
-        show_download_links()
+        download_html_and_files(url, history[url])
+        show_download_links(history[url])
+
     if st.sidebar.button('📂 Show Download Links'):
-        show_download_links()
+        for subdir in history.values():
+            show_download_links(subdir)
 
     # Display history as markdown
-    with open("history.txt", "r") as f:
-        history = f.read()
-        st.markdown(f"### History\n\n{history}")
+    with st.expander("URL History and Downloaded Files"):
+        for url, subdir in history.items():
+            st.markdown(f"#### {url}")
+            show_download_links(subdir)
 
 if __name__ == "__main__":
-    main()
+    main()
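
For reference, a minimal sketch (not part of the commit) of the bookkeeping the new code performs: each entered URL is hashed with MD5 to name a per-URL download folder, and history.json maps URLs to those folders. The sample URL below is an assumption.

import hashlib
import json
import os

url = "https://example.com/data/"               # hypothetical input URL
subdir = hashlib.md5(url.encode()).hexdigest()  # 32-char hex digest used as the folder name
os.makedirs(subdir, exist_ok=True)              # downloads for this URL land here

history = {url: subdir}                         # shape of history.json after one URL
with open("history.json", "w") as f:
    json.dump(history, f)

Because the digest is deterministic, re-entering the same URL reuses the same folder instead of creating a duplicate.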