euler314 committed
Commit 5641dea · verified · 1 Parent(s): 3b03ee1

Update app.py

Files changed (1): app.py (+158, -260)
app.py CHANGED
@@ -7,7 +7,7 @@ import subprocess
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
  import asyncio
  import logging
- from urllib.parse import urlparse
+ from urllib.parse import urlparse, urljoin, unquote
  import re
  from pathlib import Path
  from io import BytesIO
@@ -19,97 +19,21 @@ import tempfile
  import mimetypes
  import requests
  import datetime
- import spacy
- import spacy.cli
- from spacy.language import Language
- import google_auth_oauthlib.flow
- import googleapiclient.discovery
- import google.auth.transport.requests
- from async_timeout import timeout as async_timeout
- import pandas as pd
- from sentence_transformers import SentenceTransformer
- from transformers import pipeline
- import schedule
- import threading
- import time
- import hashlib
- from reportlab.lib.pagesizes import letter
- from reportlab.pdfgen import canvas
- from sklearn.cluster import KMeans
- import numpy as np
+ import traceback
  import base64
  import shutil
- from PIL import Image  # Make sure to pip install Pillow
+ from PIL import Image
+ from reportlab.lib.pagesizes import letter
  from reportlab.pdfgen import canvas

  # -------------------- Logging Setup --------------------
  logging.basicConfig(
-     filename='advanced_download_log.txt',
      level=logging.INFO,
      format='%(asctime)s - %(levelname)s - %(message)s'
  )
  logger = logging.getLogger(__name__)

- GOOGLE_OAUTH_CONFIG = {
-     "web": {
-         "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
-         "project_id": "huggingface-449214",
-         "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-         "token_uri": "https://oauth2.googleapis.com/token",
-         "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-         "client_secret": "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",
-         "redirect_uris": ["https://euler314-craw-web.hf.space/"]
-     }
- }
-
- # Playwright Setup
- def install_playwright_dependencies():
-     os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
-     subprocess.run(['apt-get', 'update', '-y'], check=True)
-     packages = [
-         'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
-         'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
-         'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
-     ]
-     subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
-     subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
-
- install_playwright_dependencies()
-
- # Model Loading
- @st.cache_resource
- def load_models():
-     try:
-         # Load spaCy model
-         try:
-             nlp = spacy.load("en_core_web_sm")
-         except OSError:
-             st.info("Downloading spaCy model...")
-             spacy.cli.download("en_core_web_sm")
-             nlp = spacy.load("en_core_web_sm")
-
-         # Load SentenceTransformer
-         try:
-             semantic_model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat')
-         except Exception as e:
-             st.error(f"Error loading SentenceTransformer: {e}")
-             semantic_model = None
-
-         # Load Transformers pipeline
-         try:
-             summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-         except Exception as e:
-             st.error(f"Error loading Transformers: {e}")
-             summarizer = None
-
-         return nlp, semantic_model, summarizer
-     except Exception as e:
-         st.error(f"Error loading models: {e}")
-         return None, None, None
-
- nlp_model, semantic_model, summarizer = load_models()
-
- # Utility Functions
+ # -------------------- Utility Functions --------------------
  def get_random_user_agent():
      USER_AGENTS = [
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
@@ -134,57 +58,25 @@ def create_zip_file(file_paths, output_dir):
          zipf.write(file_path, os.path.basename(file_path))
      return zip_path

- # Google Drive Functions
- def get_google_auth_url():
-     client_config = GOOGLE_OAUTH_CONFIG["web"]
-     flow = google_auth_oauthlib.flow.Flow.from_client_config(
-         {"web": client_config},
-         scopes=["https://www.googleapis.com/auth/drive.file"]
-     )
-     flow.redirect_uri = client_config["redirect_uris"][0]
-     authorization_url, _ = flow.authorization_url(
-         access_type="offline",
-         include_granted_scopes="true",
-         prompt="consent"
-     )
-     return authorization_url
-
- def exchange_code_for_credentials(auth_code):
-     if not auth_code.strip():
-         return None, "No code provided."
-     try:
-         client_config = GOOGLE_OAUTH_CONFIG["web"]
-         flow = google_auth_oauthlib.flow.Flow.from_client_config(
-             {"web": client_config},
-             scopes=["https://www.googleapis.com/auth/drive.file"]
-         )
-         flow.redirect_uri = client_config["redirect_uris"][0]
-         flow.fetch_token(code=auth_code.strip())
-         creds = flow.credentials
-         if not creds or not creds.valid:
-             return None, "Could not validate credentials. Check code and try again."
-         return creds, "Google Sign-In successful!"
-     except Exception as e:
-         return None, f"Error during token exchange: {e}"
-
- def google_drive_upload(file_path, credentials, folder_id=None):
+ # -------------------- Playwright Setup --------------------
+ def install_playwright_dependencies():
      try:
-         drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
-         file_metadata = {'name': os.path.basename(file_path)}
-         if folder_id:
-             file_metadata['parents'] = [folder_id]
-         media = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
-         created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
-         return created.get("id", "")
+         # Set environment variable for Playwright browsers path
+         os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
+
+         # Install system dependencies
+         subprocess.run(['pip', 'install', 'playwright'], check=True)
+         subprocess.run(['playwright', 'install', 'chromium'], check=True)
+         subprocess.run(['playwright', 'install-deps', 'chromium'], check=True)
+
+         st.success("Playwright dependencies installed successfully!")
      except Exception as e:
-         return f"Error uploading to Drive: {str(e)}"
+         st.error(f"Error installing Playwright dependencies: {e}")
+         st.info("You may need to manually install dependencies. Check console for details.")
+         logger.error(f"Playwright setup error: {e}")
+         traceback.print_exc()

- def create_drive_folder(drive_service, name):
-     folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
-     folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
-     return folder.get('id')
-
- # DownloadManager Class
+ # -------------------- Download Manager Class --------------------
  class DownloadManager:
      def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
          self.use_proxy = use_proxy
@@ -291,10 +183,6 @@ class DownloadManager:
          links = set()

          # Use requests for a faster initial scan
-         import requests
-         from bs4 import BeautifulSoup
-         from urllib.parse import urljoin, urlparse
-
          headers = {"User-Agent": get_random_user_agent()}
          response = requests.get(url, headers=headers, timeout=30)

@@ -398,7 +286,6 @@ class DownloadManager:
          # If filename is URL encoded (common with Chinese/international sites)
          if '%' in filename:
              try:
-                 from urllib.parse import unquote
                  filename = unquote(filename)
              except Exception:
                  pass
@@ -735,12 +622,9 @@ class DownloadManager:
              await page.screenshot(path=screenshot_path)

              # Convert to PDF
-             from PIL import Image
-             from reportlab.pdfgen import canvas as pdf_canvas
-
              img = Image.open(screenshot_path)
              width, height = img.size
-             c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
+             c = canvas.Canvas(save_path, pagesize=(width, height))
              c.drawImage(screenshot_path, 0, 0, width, height)
              c.save()

@@ -874,17 +758,13 @@ class DownloadManager:
              # Combine screenshots into PDF
              logger.info(f"Creating PDF from {len(screenshots)} captured pages")

-             from PIL import Image
-             from reportlab.lib.pagesizes import letter
-             from reportlab.pdfgen import canvas as pdf_canvas
-
              # Use the size of the first screenshot to set PDF dimensions
              if screenshots:
                  try:
                      img = Image.open(screenshots[0])
                      width, height = img.size

-                     c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
+                     c = canvas.Canvas(save_path, pagesize=(width, height))

                      for screenshot in screenshots:
                          try:
@@ -1000,20 +880,7 @@ class DownloadManager:

              # Try standard approaches for non-view-only files
              try:
-                 # Try with gdown first
-                 import gdown
-                 output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
-                 if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                     with open(save_path, 'rb') as f:
-                         content = f.read(100)  # Read first 100 bytes
-                         if b'<!DOCTYPE html>' not in content:  # Check not HTML error page
-                             logger.info(f"Successfully downloaded with gdown: {url}")
-                             return True
-             except Exception as e:
-                 logger.warning(f"gdown download failed: {e}")
-
-             # Try with requests and session cookies
-             try:
+                 # Try with requests and session cookies
                  session = requests.Session()
                  session.headers.update({'User-Agent': get_random_user_agent()})

@@ -1322,9 +1189,6 @@ class DownloadManager:
              screenshots.append(screenshot_path)

              # Combine screenshots into PDF
-             from PIL import Image
-             from reportlab.pdfgen import canvas
-
              c = canvas.Canvas(save_path)
              for screenshot in screenshots:
                  img = Image.open(screenshot)
@@ -1350,9 +1214,6 @@ class DownloadManager:

              # Convert to requested format if needed
              if file_type == 'pdf':
-                 from PIL import Image
-                 from reportlab.pdfgen import canvas
-
                  # Create PDF from screenshot
                  img = Image.open(screenshot_path)
                  width, height = img.size
@@ -1757,11 +1618,10 @@ class DownloadManager:
                  # Use a longer timeout for ASP.NET pages which can be slower
                  sub_timeout = timeout * 2 if is_aspnet else timeout

-                 # Extract files from sublink with appropriate timeout
-                 async with async_timeout(sub_timeout):
-                     sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
-                     all_files.extend(sub_files)
-                     file_count_text.text(f"Found {len(all_files)} total files")
+                 # Extract files from sublink
+                 sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                 all_files.extend(sub_files)
+                 file_count_text.text(f"Found {len(all_files)} total files")
              except Exception as e:
                  logger.warning(f"Error processing sublink {sublink}: {e}")

@@ -1789,54 +1649,34 @@ def main():
      if not st.session_state.get('keep_progress', False):
          progress_text.empty()
          progress_bar.empty()
- # Utility Functions for New Features
- def extract_keywords(text, n=5):
-     doc = nlp_model(text)
-     keywords = [token.text for token in doc if token.is_alpha and not token.is_stop][:n]
-     return keywords

- def analyze_sentiment(text):
-     sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
-     result = sentiment_analyzer(text[:512])[0]
-     return result['label'], result['score']
-
- def get_file_hash(file_path):
-     hasher = hashlib.md5()
-     with open(file_path, 'rb') as f:
-         hasher.update(f.read())
-     return hasher.hexdigest()
-
- # Main Function
+ # -------------------- Main App --------------------
  def main():
-     if 'initialized' not in st.session_state:
+     st.title("Advanced File Downloader")
+
+     # Initialize playwright if needed
+     if "playwright_installed" not in st.session_state:
+         with st.spinner("Setting up browser automation. This may take a minute..."):
+             install_playwright_dependencies()
+             st.session_state.playwright_installed = True
+
+     if "initialized" not in st.session_state:
          st.session_state.initialized = True
          st.session_state.discovered_files = []
          st.session_state.current_url = None
-         st.session_state.google_creds = None
          st.session_state.selected_files = []
          st.session_state.do_deep_search = False
         st.session_state.deep_search_url = None
         st.session_state.search_results = []

-     st.title("Advanced File Downloader")
-
      with st.sidebar:
-         mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
+         mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select")
          with st.expander("Advanced Options", expanded=True):
              custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
              max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
              sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
              use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
              proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
-         with st.expander("Google Drive Integration", expanded=False):
-             if st.button("Start Google Sign-In", key="google_signin_btn"):
-                 auth_url = get_google_auth_url()
-                 st.markdown(f"[Click here to authorize]({auth_url})")
-             auth_code = st.text_input("Enter authorization code", key="auth_code_input")
-             if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
-                 creds, msg = exchange_code_for_credentials(auth_code)
-                 st.session_state.google_creds = creds
-                 st.write(msg)

      if mode == "Manual URL":
          st.header("Manual URL Mode")
@@ -1849,11 +1689,19 @@ def main():
              valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
              if custom_ext_list != valid_ext_list:
                  st.warning("Invalid extensions ignored. Use format like '.csv'.")
-             async def run_deep_search():
-                 async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                     files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
-                     return files
-             files = asyncio.run(run_deep_search())
+
+             @st.cache_resource
+             def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
+                 async def _run():
+                     async with DownloadManager(use_proxy=use_proxy_val, proxy=proxy_val) as dm:
+                         files = await dm.deep_search(url, ext_list, max_links, timeout_val)
+                         return files
+                 return asyncio.run(_run())
+
+             with st.spinner("Searching for files..."):
+                 files = run_deep_search(url, valid_ext_list, max_sublinks,
+                                         sublink_timeout, use_proxy, proxy)
+
              if files:
                  st.session_state.discovered_files = files
                  st.session_state.current_url = url
@@ -1863,78 +1711,112 @@ def main():

      if st.session_state.discovered_files:
          files = st.session_state.discovered_files
-         st.success(f"Found {len(files)} files!")
          col1, col2 = st.columns([1, 4])
          with col1:
              if st.button("Select All", key="select_all_btn"):
                  st.session_state.selected_files = list(range(len(files)))
              if st.button("Clear Selection", key="clear_selection_btn"):
                  st.session_state.selected_files = []
-         selected_files = st.multiselect("Select files to download", options=list(range(len(files))), default=st.session_state.selected_files, format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})", key="file_multiselect")
-         st.session_state.selected_files = selected_files
-         if selected_files:
-             col1, col2, col3, col4 = st.columns(4)
-             with col1:
-                 download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
-             with col2:
-                 create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
-             with col3:
-                 delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
-             with col4:
-                 upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
+
+         # Create a formatted display of files with metadata
+         file_options = []
+         for i, file in enumerate(files):
+             filename = file['filename']
+             size = file['size']
+             meta = file.get('metadata', {})
+
+             # Format display string with relevant metadata
+             if meta and 'Pages' in meta:
+                 file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages"
+             else:
+                 file_info = f"{filename} ({size})"
+
+             file_options.append((i, file_info))
+
+         selected_indices = st.multiselect(
+             "Select files to download",
+             options=[i for i, _ in file_options],
+             default=st.session_state.selected_files,
+             format_func=lambda i: next(info for idx, info in file_options if idx == i),
+             key="file_multiselect"
+         )
+
+         st.session_state.selected_files = selected_indices
+
+         if selected_indices:
+             download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
+             create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
+
              if st.button("Download Selected", key="download_btn"):
                  if not os.path.exists(download_dir):
                      os.makedirs(download_dir)
+
                  async def download_files():
                      downloaded_paths = []
                      progress_bar = st.progress(0)
                      status_text = st.empty()
+
                      async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                         for i, idx in enumerate(selected_files):
-                             progress = (i + 1) / len(selected_files)
+                         for i, idx in enumerate(selected_indices):
+                             progress = (i + 1) / len(selected_indices)
                              file_info = files[idx]
-                             status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
+                             status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})")
                              progress_bar.progress(progress)
+
                              path = await dm.download_file(file_info, download_dir, url)
                              if path:
                                  downloaded_paths.append(path)
+
                      status_text.empty()
                      progress_bar.empty()
                      return downloaded_paths
-                 downloaded = asyncio.run(download_files())
+
+                 with st.spinner("Downloading files..."):
+                     downloaded = asyncio.run(download_files())
+
                  if downloaded:
                      st.success(f"Successfully downloaded {len(downloaded)} files")
-                     if create_zip:
+
+                     # Create file downloads
+                     if create_zip and len(downloaded) > 1:
                          zip_path = create_zip_file(downloaded, download_dir)
                          st.success(f"Created ZIP file: {zip_path}")
+
+                         # Provide download link for the zip file
                          with open(zip_path, "rb") as f:
                              zip_data = f.read()
-                         st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
-                         if upload_to_drive and st.session_state.google_creds:
-                             drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds)
-                             folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}")
-                             drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id)
-                             if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                 st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                             else:
-                                 st.error(drive_id)
-                         if delete_after:
-                             for path in downloaded:
-                                 try:
-                                     os.remove(path)
-                                 except Exception as e:
-                                     st.warning(f"Could not delete {path}: {e}")
-                             st.info("Deleted original files after ZIP creation")
+
+                         zip_filename = os.path.basename(zip_path)
+                         st.download_button(
+                             label="Download ZIP",
+                             data=zip_data,
+                             file_name=zip_filename,
+                             mime="application/zip",
+                             key="download_zip_btn"
+                         )
                      else:
+                         # Provide individual file downloads
+                         st.write("Download files individually:")
                          for path in downloaded:
                              with open(path, "rb") as f:
                                  file_data = f.read()
-                             st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
+
+                             file_name = os.path.basename(path)
+                             mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
+
+                             st.download_button(
+                                 label=f"Download {file_name}",
+                                 data=file_data,
+                                 file_name=file_name,
+                                 mime=mime_type,
+                                 key=f"download_file_{path}"
+                             )

      elif mode == "Bing Search":
          st.header("Bing Search Mode")
          query = st.text_input("Enter search query", key="search_query_input")
          num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
+
          if st.button("Search", key="search_btn"):
              if query:
                  async def run_search():
@@ -1944,6 +1826,8 @@ def main():
                      if urls:
                          st.session_state.search_results = urls
                          st.success(f"Found {len(urls)} results!")
+
+                         # Create expanders for each result
                          for i, url in enumerate(urls, 1):
                              with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
                                  if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
@@ -1951,29 +1835,43 @@ def main():
                                      st.session_state.do_deep_search = True
                      else:
                          st.warning("No search results found.")
+
                  asyncio.run(run_search())
+
+     # Handle deep search based on search results
+     if st.session_state.do_deep_search and st.session_state.deep_search_url:
+         url = st.session_state.deep_search_url
+         st.info(f"Deep searching: {url}")
+
+         # Reset the flag to avoid re-running
+         st.session_state.do_deep_search = False
+
+         # Set up custom extensions
+         custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+         valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
+
+         @st.cache_resource
+         def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
+             async def _run():
+                 async with DownloadManager(use_proxy=use_proxy_val, proxy=proxy_val) as dm:
+                     files = await dm.deep_search(url, ext_list, max_links, timeout_val)
+                     return files
+             return asyncio.run(_run())
+
+         with st.spinner("Searching for files..."):
+             files = run_deep_search(url, valid_ext_list, max_sublinks,
+                                     sublink_timeout, use_proxy, proxy)
+
+         if files:
+             st.session_state.discovered_files = files
+             st.session_state.current_url = url
+             st.success(f"Found {len(files)} files!")
+         else:
+             st.warning("No files found.")

-     else:  # PDF Summarizer mode
-         if summarizer is None:
-             st.error("PDF summarization is not available due to model loading errors.")
-         else:
-             st.header("PDF Summarizer")
-             pdf_url = st.text_input("Enter PDF URL", key="pdf_url_input")
-             if st.button("Summarize", key="summarize_btn"):
-                 if pdf_url:
-                     with st.spinner("Generating summary..."):
-                         try:
-                             response = requests.get(pdf_url, stream=True)
-                             temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
-                             with open(temp_pdf.name, "wb") as f:
-                                 f.write(response.content)
-                             reader = PdfReader(temp_pdf.name)
-                             text = " ".join([page.extract_text() or "" for page in reader.pages])
-                             os.remove(temp_pdf.name)
-                             summary = summarizer(text[:3000], max_length=200, min_length=50, do_sample=False)
-                             st.write("Summary:", summary[0]['summary_text'])
-                         except Exception as e:
-                             st.error(f"Error summarizing PDF: {e}")
+     # Add footer with attribution
+     st.markdown('---')
+     st.markdown('Created by [Euler314](https://github.com/euler314)')

  if __name__ == "__main__":
      main()
 