Update app.py
app.py (CHANGED)

Old version of the changed sections (lines removed by this commit are prefixed with "-"):
@@ -7,7 +7,7 @@ import subprocess
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 import asyncio
 import logging
-from urllib.parse import urlparse
 import re
 from pathlib import Path
 from io import BytesIO
@@ -19,97 +19,21 @@ import tempfile
 import mimetypes
 import requests
 import datetime
-import spacy
-import spacy.cli
-from spacy.language import Language
-import google_auth_oauthlib.flow
-import googleapiclient.discovery
-import google.auth.transport.requests
-from async_timeout import timeout as async_timeout
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-from transformers import pipeline
-import schedule
-import threading
-import time
-import hashlib
-from reportlab.lib.pagesizes import letter
-from reportlab.pdfgen import canvas
-from sklearn.cluster import KMeans
-import numpy as np
 import base64
 import shutil
-from PIL import Image
 from reportlab.pdfgen import canvas
 
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
-    filename='advanced_download_log.txt',
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 
-GOOGLE_OAUTH_CONFIG = {
-    "web": {
-        "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
-        "project_id": "huggingface-449214",
-        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-        "token_uri": "https://oauth2.googleapis.com/token",
-        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-        "client_secret": "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",
-        "redirect_uris": ["https://euler314-craw-web.hf.space/"]
-    }
-}
-
-# Playwright Setup
-def install_playwright_dependencies():
-    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
-    subprocess.run(['apt-get', 'update', '-y'], check=True)
-    packages = [
-        'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
-        'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
-        'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
-    ]
-    subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
-    subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
-
-install_playwright_dependencies()
-
-# Model Loading
-@st.cache_resource
-def load_models():
-    try:
-        # Load spaCy model
-        try:
-            nlp = spacy.load("en_core_web_sm")
-        except OSError:
-            st.info("Downloading spaCy model...")
-            spacy.cli.download("en_core_web_sm")
-            nlp = spacy.load("en_core_web_sm")
-
-        # Load SentenceTransformer
-        try:
-            semantic_model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat')
-        except Exception as e:
-            st.error(f"Error loading SentenceTransformer: {e}")
-            semantic_model = None
-
-        # Load Transformers pipeline
-        try:
-            summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-        except Exception as e:
-            st.error(f"Error loading Transformers: {e}")
-            summarizer = None
-
-        return nlp, semantic_model, summarizer
-    except Exception as e:
-        st.error(f"Error loading models: {e}")
-        return None, None, None
-
-nlp_model, semantic_model, summarizer = load_models()
-
-# Utility Functions
 def get_random_user_agent():
     USER_AGENTS = [
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
@@ -134,57 +58,25 @@ def create_zip_file(file_paths, output_dir):
         zipf.write(file_path, os.path.basename(file_path))
     return zip_path
 
-#
-def get_google_auth_url():
-    client_config = GOOGLE_OAUTH_CONFIG["web"]
-    flow = google_auth_oauthlib.flow.Flow.from_client_config(
-        {"web": client_config},
-        scopes=["https://www.googleapis.com/auth/drive.file"]
-    )
-    flow.redirect_uri = client_config["redirect_uris"][0]
-    authorization_url, _ = flow.authorization_url(
-        access_type="offline",
-        include_granted_scopes="true",
-        prompt="consent"
-    )
-    return authorization_url
-
-def exchange_code_for_credentials(auth_code):
-    if not auth_code.strip():
-        return None, "No code provided."
-    try:
-        client_config = GOOGLE_OAUTH_CONFIG["web"]
-        flow = google_auth_oauthlib.flow.Flow.from_client_config(
-            {"web": client_config},
-            scopes=["https://www.googleapis.com/auth/drive.file"]
-        )
-        flow.redirect_uri = client_config["redirect_uris"][0]
-        flow.fetch_token(code=auth_code.strip())
-        creds = flow.credentials
-        if not creds or not creds.valid:
-            return None, "Could not validate credentials. Check code and try again."
-        return creds, "Google Sign-In successful!"
-    except Exception as e:
-        return None, f"Error during token exchange: {e}"
-
-def google_drive_upload(file_path, credentials, folder_id=None):
     try:
-        … (7 removed lines not shown)
     except Exception as e:
-        … (1 removed line not shown)
 
-… (1 removed line not shown)
-    folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
-    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
-    return folder.get('id')
-
-# DownloadManager Class
 class DownloadManager:
     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
         self.use_proxy = use_proxy
@@ -291,10 +183,6 @@ class DownloadManager:
         links = set()
 
         # Use requests for a faster initial scan
-        import requests
-        from bs4 import BeautifulSoup
-        from urllib.parse import urljoin, urlparse
-
         headers = {"User-Agent": get_random_user_agent()}
         response = requests.get(url, headers=headers, timeout=30)
 
@@ -398,7 +286,6 @@ class DownloadManager:
         # If filename is URL encoded (common with Chinese/international sites)
         if '%' in filename:
             try:
-                from urllib.parse import unquote
                 filename = unquote(filename)
             except Exception:
                 pass
@@ -735,12 +622,9 @@ class DownloadManager:
             await page.screenshot(path=screenshot_path)
 
             # Convert to PDF
-            from PIL import Image
-            from reportlab.pdfgen import canvas as pdf_canvas
-
             img = Image.open(screenshot_path)
             width, height = img.size
-            c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
             c.drawImage(screenshot_path, 0, 0, width, height)
             c.save()
 
@@ -874,17 +758,13 @@ class DownloadManager:
             # Combine screenshots into PDF
             logger.info(f"Creating PDF from {len(screenshots)} captured pages")
 
-            from PIL import Image
-            from reportlab.lib.pagesizes import letter
-            from reportlab.pdfgen import canvas as pdf_canvas
-
             # Use the size of the first screenshot to set PDF dimensions
             if screenshots:
                 try:
                     img = Image.open(screenshots[0])
                     width, height = img.size
 
-                    c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
 
                     for screenshot in screenshots:
                         try:
@@ -1000,20 +880,7 @@ class DownloadManager:
 
         # Try standard approaches for non-view-only files
        try:
-            # Try with gdown
-            import gdown
-            output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
-            if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                with open(save_path, 'rb') as f:
-                    content = f.read(100)  # Read first 100 bytes
-                    if b'<!DOCTYPE html>' not in content:  # Check not HTML error page
-                        logger.info(f"Successfully downloaded with gdown: {url}")
-                        return True
-        except Exception as e:
-            logger.warning(f"gdown download failed: {e}")
-
-        # Try with requests and session cookies
-        try:
             session = requests.Session()
             session.headers.update({'User-Agent': get_random_user_agent()})
 
@@ -1322,9 +1189,6 @@ class DownloadManager:
                 screenshots.append(screenshot_path)
 
             # Combine screenshots into PDF
-            from PIL import Image
-            from reportlab.pdfgen import canvas
-
             c = canvas.Canvas(save_path)
             for screenshot in screenshots:
                 img = Image.open(screenshot)
@@ -1350,9 +1214,6 @@ class DownloadManager:
 
             # Convert to requested format if needed
             if file_type == 'pdf':
-                from PIL import Image
-                from reportlab.pdfgen import canvas
-
                 # Create PDF from screenshot
                 img = Image.open(screenshot_path)
                 width, height = img.size
@@ -1757,11 +1618,10 @@ class DownloadManager:
                     # Use a longer timeout for ASP.NET pages which can be slower
                     sub_timeout = timeout * 2 if is_aspnet else timeout
 
-                    # Extract files from sublink
-                    … (3 removed lines not shown)
-                    file_count_text.text(f"Found {len(all_files)} total files")
                 except Exception as e:
                     logger.warning(f"Error processing sublink {sublink}: {e}")
 
@@ -1789,54 +1649,34 @@ class DownloadManager:
         if not st.session_state.get('keep_progress', False):
             progress_text.empty()
             progress_bar.empty()
-# Utility Functions for New Features
-def extract_keywords(text, n=5):
-    doc = nlp_model(text)
-    keywords = [token.text for token in doc if token.is_alpha and not token.is_stop][:n]
-    return keywords
 
-… (1 removed line not shown)
-    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
-    result = sentiment_analyzer(text[:512])[0]
-    return result['label'], result['score']
-
-def get_file_hash(file_path):
-    hasher = hashlib.md5()
-    with open(file_path, 'rb') as f:
-        hasher.update(f.read())
-    return hasher.hexdigest()
-
-# Main Function
 def main():
-    … (1 removed line not shown)
         st.session_state.initialized = True
         st.session_state.discovered_files = []
         st.session_state.current_url = None
-        st.session_state.google_creds = None
         st.session_state.selected_files = []
         st.session_state.do_deep_search = False
         st.session_state.deep_search_url = None
         st.session_state.search_results = []
 
-    st.title("Advanced File Downloader")
-
     with st.sidebar:
-        mode = st.radio("Select Mode", ["Manual URL", "Bing Search"
         with st.expander("Advanced Options", expanded=True):
             custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
             max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
             sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
             use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
             proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
-        with st.expander("Google Drive Integration", expanded=False):
-            if st.button("Start Google Sign-In", key="google_signin_btn"):
-                auth_url = get_google_auth_url()
-                st.markdown(f"[Click here to authorize]({auth_url})")
-            auth_code = st.text_input("Enter authorization code", key="auth_code_input")
-            if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
-                creds, msg = exchange_code_for_credentials(auth_code)
-                st.session_state.google_creds = creds
-                st.write(msg)
 
     if mode == "Manual URL":
         st.header("Manual URL Mode")
@@ -1849,11 +1689,19 @@ def main():
             valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
             if custom_ext_list != valid_ext_list:
                 st.warning("Invalid extensions ignored. Use format like '.csv'.")
-            … (5 removed lines not shown)
             if files:
                 st.session_state.discovered_files = files
                 st.session_state.current_url = url
@@ -1863,78 +1711,112 @@ def main():
 
         if st.session_state.discovered_files:
             files = st.session_state.discovered_files
-            st.success(f"Found {len(files)} files!")
             col1, col2 = st.columns([1, 4])
             with col1:
                 if st.button("Select All", key="select_all_btn"):
                     st.session_state.selected_files = list(range(len(files)))
                 if st.button("Clear Selection", key="clear_selection_btn"):
                     st.session_state.selected_files = []
-            … (8 removed lines not shown)
-            with
-            … (3 removed lines not shown)
                 if st.button("Download Selected", key="download_btn"):
                     if not os.path.exists(download_dir):
                         os.makedirs(download_dir)
                     async def download_files():
                         downloaded_paths = []
                         progress_bar = st.progress(0)
                         status_text = st.empty()
                         async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                            for i, idx in enumerate(
-                                progress = (i + 1) / len(
                                 file_info = files[idx]
-                                status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(
                                 progress_bar.progress(progress)
                                 path = await dm.download_file(file_info, download_dir, url)
                                 if path:
                                     downloaded_paths.append(path)
                         status_text.empty()
                         progress_bar.empty()
                         return downloaded_paths
-                    … (1 removed line not shown)
                     if downloaded:
                         st.success(f"Successfully downloaded {len(downloaded)} files")
-                        … (1 removed line not shown)
                             zip_path = create_zip_file(downloaded, download_dir)
                             st.success(f"Created ZIP file: {zip_path}")
                             with open(zip_path, "rb") as f:
                                 zip_data = f.read()
-                            … (9 removed lines not shown)
-                        if delete_after:
-                            for path in downloaded:
-                                try:
-                                    os.remove(path)
-                                except Exception as e:
-                                    st.warning(f"Could not delete {path}: {e}")
-                            st.info("Deleted original files after ZIP creation")
                         else:
                             for path in downloaded:
                                 with open(path, "rb") as f:
                                     file_data = f.read()
-                            … (1 removed line not shown)
 
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
         query = st.text_input("Enter search query", key="search_query_input")
         num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
         if st.button("Search", key="search_btn"):
             if query:
                 async def run_search():
@@ -1944,6 +1826,8 @@ def main():
                     if urls:
                         st.session_state.search_results = urls
                         st.success(f"Found {len(urls)} results!")
                         for i, url in enumerate(urls, 1):
                             with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
                                 if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
@@ -1951,29 +1835,43 @@ def main():
                                     st.session_state.do_deep_search = True
                     else:
                         st.warning("No search results found.")
                 asyncio.run(run_search())
 
-        … (3 removed lines not shown)
-    else:
-        st.header("PDF Summarizer")
-        pdf_url = st.text_input("Enter PDF URL", key="pdf_url_input")
-        if st.button("Summarize", key="summarize_btn"):
-            if pdf_url:
-                with st.spinner("Generating summary..."):
-                    try:
-                        response = requests.get(pdf_url, stream=True)
-                        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
-                        with open(temp_pdf.name, "wb") as f:
-                            f.write(response.content)
-                        reader = PdfReader(temp_pdf.name)
-                        text = " ".join([page.extract_text() or "" for page in reader.pages])
-                        os.remove(temp_pdf.name)
-                        summary = summarizer(text[:3000], max_length=200, min_length=50, do_sample=False)
-                        st.write("Summary:", summary[0]['summary_text'])
-                    except Exception as e:
-                        st.error(f"Error summarizing PDF: {e}")
 
 if __name__ == "__main__":
     main()
New version of the changed sections (lines added by this commit are prefixed with "+"):

@@ -7,7 +7,7 @@ import subprocess
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 import asyncio
 import logging
+from urllib.parse import urlparse, urljoin, unquote
 import re
 from pathlib import Path
 from io import BytesIO
@@ -19,97 +19,21 @@ import tempfile
 import mimetypes
 import requests
 import datetime
+import traceback
 import base64
 import shutil
+from PIL import Image
+from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 
+# -------------------- Utility Functions --------------------
 def get_random_user_agent():
     USER_AGENTS = [
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
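
The consolidated import now pulls urlparse, urljoin, and unquote from urllib.parse at module level; the crawler uses them later to resolve relative links and decode percent-encoded filenames. As a quick illustration of those three standard-library helpers (the URLs below are made-up examples, not from the app):

from urllib.parse import urlparse, urljoin, unquote

base = "https://example.com/docs/index.html"
print(urlparse(base).netloc)                      # example.com
print(urljoin(base, "files/report%20final.pdf"))  # https://example.com/docs/files/report%20final.pdf
print(unquote("report%20final.pdf"))              # report final.pdf
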
@@ -134,57 +58,25 @@ def create_zip_file(file_paths, output_dir):
         zipf.write(file_path, os.path.basename(file_path))
     return zip_path
 
+# -------------------- Playwright Setup --------------------
+def install_playwright_dependencies():
+    try:
+        # Set environment variable for Playwright browsers path
+        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
+
+        # Install system dependencies
+        subprocess.run(['pip', 'install', 'playwright'], check=True)
+        subprocess.run(['playwright', 'install', 'chromium'], check=True)
+        subprocess.run(['playwright', 'install-deps', 'chromium'], check=True)
+
+        st.success("Playwright dependencies installed successfully!")
+    except Exception as e:
+        st.error(f"Error installing Playwright dependencies: {e}")
+        st.info("You may need to manually install dependencies. Check console for details.")
+        logger.error(f"Playwright setup error: {e}")
+        traceback.print_exc()
 
+# -------------------- Download Manager Class --------------------
 class DownloadManager:
     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
         self.use_proxy = use_proxy
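
install_playwright_dependencies() now shells out to pip and the playwright CLI instead of apt-get. A minimal way to confirm that the installed Chromium actually launches is to drive it once through the async API already imported at the top of app.py; this check is illustrative only and not part of the commit:

import asyncio
from playwright.async_api import async_playwright

async def check_chromium():
    # Launch the Chromium that `playwright install chromium` downloaded and load one page.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com")
        title = await page.title()
        await browser.close()
        return title

print(asyncio.run(check_chromium()))
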
@@ -291,10 +183,6 @@ class DownloadManager:
         links = set()
 
         # Use requests for a faster initial scan
         headers = {"User-Agent": get_random_user_agent()}
         response = requests.get(url, headers=headers, timeout=30)
 
@@ -398,7 +286,6 @@ class DownloadManager:
         # If filename is URL encoded (common with Chinese/international sites)
         if '%' in filename:
             try:
                 filename = unquote(filename)
             except Exception:
                 pass
@@ -735,12 +622,9 @@ class DownloadManager:
             await page.screenshot(path=screenshot_path)
 
             # Convert to PDF
             img = Image.open(screenshot_path)
             width, height = img.size
+            c = canvas.Canvas(save_path, pagesize=(width, height))
             c.drawImage(screenshot_path, 0, 0, width, height)
             c.save()
 
@@ -874,17 +758,13 @@ class DownloadManager:
             # Combine screenshots into PDF
             logger.info(f"Creating PDF from {len(screenshots)} captured pages")
 
             # Use the size of the first screenshot to set PDF dimensions
             if screenshots:
                 try:
                     img = Image.open(screenshots[0])
                     width, height = img.size
 
+                    c = canvas.Canvas(save_path, pagesize=(width, height))
 
                     for screenshot in screenshots:
                         try:
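
The screenshot-to-PDF hunks now rely on the module-level PIL and reportlab imports. The pattern is: read the image size, open a reportlab canvas whose page size matches it exactly, draw the image at the origin, and save. A standalone sketch of the same idea (the file names are placeholders):

from PIL import Image
from reportlab.pdfgen import canvas

def image_to_pdf(image_path: str, pdf_path: str) -> None:
    # One PDF page sized exactly to the source image, as in the hunks above.
    img = Image.open(image_path)
    width, height = img.size
    c = canvas.Canvas(pdf_path, pagesize=(width, height))
    c.drawImage(image_path, 0, 0, width, height)
    c.save()
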
@@ -1000,20 +880,7 @@ class DownloadManager:
 
         # Try standard approaches for non-view-only files
         try:
+            # Try with requests and session cookies
             session = requests.Session()
             session.headers.update({'User-Agent': get_random_user_agent()})
 
@@ -1322,9 +1189,6 @@ class DownloadManager:
                 screenshots.append(screenshot_path)
 
             # Combine screenshots into PDF
             c = canvas.Canvas(save_path)
             for screenshot in screenshots:
                 img = Image.open(screenshot)
@@ -1350,9 +1214,6 @@ class DownloadManager:
 
             # Convert to requested format if needed
             if file_type == 'pdf':
                 # Create PDF from screenshot
                 img = Image.open(screenshot_path)
                 width, height = img.size
@@ -1757,11 +1618,10 @@ class DownloadManager:
                     # Use a longer timeout for ASP.NET pages which can be slower
                     sub_timeout = timeout * 2 if is_aspnet else timeout
 
+                    # Extract files from sublink
+                    sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                    all_files.extend(sub_files)
+                    file_count_text.text(f"Found {len(all_files)} total files")
                 except Exception as e:
                     logger.warning(f"Error processing sublink {sublink}: {e}")
 
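
With the gdown path removed, the Google Drive download now goes straight to a plain requests.Session carrying a randomized User-Agent. A minimal sketch of that session pattern (the URL, header value, and output path are placeholders):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (placeholder UA)'})

# Stream the response to disk in chunks rather than loading it all into memory.
response = session.get("https://example.com/file.bin", timeout=30, stream=True)
response.raise_for_status()
with open("file.bin", "wb") as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)
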
@@ -1789,54 +1649,34 @@ class DownloadManager:
         if not st.session_state.get('keep_progress', False):
             progress_text.empty()
             progress_bar.empty()
 
+# -------------------- Main App --------------------
 def main():
+    st.title("Advanced File Downloader")
+
+    # Initialize playwright if needed
+    if "playwright_installed" not in st.session_state:
+        with st.spinner("Setting up browser automation. This may take a minute..."):
+            install_playwright_dependencies()
+            st.session_state.playwright_installed = True
+
+    if "initialized" not in st.session_state:
         st.session_state.initialized = True
         st.session_state.discovered_files = []
         st.session_state.current_url = None
         st.session_state.selected_files = []
         st.session_state.do_deep_search = False
         st.session_state.deep_search_url = None
         st.session_state.search_results = []
 
     with st.sidebar:
+        mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select")
         with st.expander("Advanced Options", expanded=True):
             custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
             max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
             sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
             use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
             proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
 
     if mode == "Manual URL":
         st.header("Manual URL Mode")
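
main() now gates both the Playwright setup and the state initialization on st.session_state, so each runs once per browser session rather than on every Streamlit rerun. A reduced sketch of that guard pattern (the state keys here are illustrative, not the app's full set):

import streamlit as st

if "initialized" not in st.session_state:
    # Runs only on the first script execution of this session.
    st.session_state.initialized = True
    st.session_state.discovered_files = []

st.write(f"{len(st.session_state.discovered_files)} files discovered so far")
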
@@ -1849,11 +1689,19 @@ def main():
             valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
             if custom_ext_list != valid_ext_list:
                 st.warning("Invalid extensions ignored. Use format like '.csv'.")
+
+            @st.cache_resource
+            def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
+                async def _run():
+                    async with DownloadManager(use_proxy=use_proxy_val, proxy=proxy_val) as dm:
+                        files = await dm.deep_search(url, ext_list, max_links, timeout_val)
+                        return files
+                return asyncio.run(_run())
+
+            with st.spinner("Searching for files..."):
+                files = run_deep_search(url, valid_ext_list, max_sublinks,
+                                        sublink_timeout, use_proxy, proxy)
+
             if files:
                 st.session_state.discovered_files = files
                 st.session_state.current_url = url
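
run_deep_search wraps the async DownloadManager crawl in asyncio.run and hands the whole function to st.cache_resource, so reruns with identical arguments reuse the earlier result instead of re-crawling. A minimal sketch of that sync-wrapper-around-async pattern (the coroutine body is a stand-in, not the real crawl):

import asyncio
import streamlit as st

@st.cache_resource
def cached_crawl(url: str):
    # Run an async task synchronously; Streamlit caches the return value per argument set.
    async def _run():
        await asyncio.sleep(0.1)  # stand-in for the real async crawl
        return [f"{url}/file1.pdf", f"{url}/file2.pdf"]
    return asyncio.run(_run())

st.write(cached_crawl("https://example.com"))

Because st.cache_resource keys the cache on the function arguments, changing the extension list, sublink limit, timeout, or proxy settings triggers a fresh crawl.
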
@@ -1863,78 +1711,112 @@ def main():
 
         if st.session_state.discovered_files:
             files = st.session_state.discovered_files
             col1, col2 = st.columns([1, 4])
             with col1:
                 if st.button("Select All", key="select_all_btn"):
                     st.session_state.selected_files = list(range(len(files)))
                 if st.button("Clear Selection", key="clear_selection_btn"):
                     st.session_state.selected_files = []
+
+            # Create a formatted display of files with metadata
+            file_options = []
+            for i, file in enumerate(files):
+                filename = file['filename']
+                size = file['size']
+                meta = file.get('metadata', {})
+
+                # Format display string with relevant metadata
+                if meta and 'Pages' in meta:
+                    file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages"
+                else:
+                    file_info = f"{filename} ({size})"
+
+                file_options.append((i, file_info))
+
+            selected_indices = st.multiselect(
+                "Select files to download",
+                options=[i for i, _ in file_options],
+                default=st.session_state.selected_files,
+                format_func=lambda i: next(info for idx, info in file_options if idx == i),
+                key="file_multiselect"
+            )
+
+            st.session_state.selected_files = selected_indices
+
+            if selected_indices:
+                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
+                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
+
                 if st.button("Download Selected", key="download_btn"):
                     if not os.path.exists(download_dir):
                         os.makedirs(download_dir)
+
                     async def download_files():
                         downloaded_paths = []
                         progress_bar = st.progress(0)
                         status_text = st.empty()
+
                         async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                            for i, idx in enumerate(selected_indices):
+                                progress = (i + 1) / len(selected_indices)
                                 file_info = files[idx]
+                                status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})")
                                 progress_bar.progress(progress)
+
                                 path = await dm.download_file(file_info, download_dir, url)
                                 if path:
                                     downloaded_paths.append(path)
+
                         status_text.empty()
                         progress_bar.empty()
                         return downloaded_paths
+
+                    with st.spinner("Downloading files..."):
+                        downloaded = asyncio.run(download_files())
+
                     if downloaded:
                         st.success(f"Successfully downloaded {len(downloaded)} files")
+
+                        # Create file downloads
+                        if create_zip and len(downloaded) > 1:
                             zip_path = create_zip_file(downloaded, download_dir)
                             st.success(f"Created ZIP file: {zip_path}")
+
+                            # Provide download link for the zip file
                             with open(zip_path, "rb") as f:
                                 zip_data = f.read()
+
+                            zip_filename = os.path.basename(zip_path)
+                            st.download_button(
+                                label="Download ZIP",
+                                data=zip_data,
+                                file_name=zip_filename,
+                                mime="application/zip",
+                                key="download_zip_btn"
+                            )
                         else:
+                            # Provide individual file downloads
+                            st.write("Download files individually:")
                             for path in downloaded:
                                 with open(path, "rb") as f:
                                     file_data = f.read()
+
+                                file_name = os.path.basename(path)
+                                mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
+
+                                st.download_button(
+                                    label=f"Download {file_name}",
+                                    data=file_data,
+                                    file_name=file_name,
+                                    mime=mime_type,
+                                    key=f"download_file_{path}"
+                                )
 
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
         query = st.text_input("Enter search query", key="search_query_input")
         num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
+
         if st.button("Search", key="search_btn"):
             if query:
                 async def run_search():
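
The new download UI reads each file (or the ZIP) into memory and serves it through st.download_button, which takes raw bytes, a file name, and a MIME type. A self-contained sketch of that call (the payload and labels are placeholders):

import streamlit as st

data = b"hello from the downloader"  # placeholder payload instead of a real file read
st.download_button(
    label="Download example.txt",
    data=data,
    file_name="example.txt",
    mime="text/plain",
    key="example_download_btn",
)
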
@@ -1944,6 +1826,8 @@ def main():
                     if urls:
                         st.session_state.search_results = urls
                         st.success(f"Found {len(urls)} results!")
+
+                        # Create expanders for each result
                         for i, url in enumerate(urls, 1):
                             with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
                                 if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
@@ -1951,29 +1835,43 @@ def main():
                                     st.session_state.do_deep_search = True
                     else:
                         st.warning("No search results found.")
+
                 asyncio.run(run_search())
+
+        # Handle deep search based on search results
+        if st.session_state.do_deep_search and st.session_state.deep_search_url:
+            url = st.session_state.deep_search_url
+            st.info(f"Deep searching: {url}")
+
+            # Reset the flag to avoid re-running
+            st.session_state.do_deep_search = False
+
+            # Set up custom extensions
+            custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+            valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
+
+            @st.cache_resource
+            def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
+                async def _run():
+                    async with DownloadManager(use_proxy=use_proxy_val, proxy=proxy_val) as dm:
+                        files = await dm.deep_search(url, ext_list, max_links, timeout_val)
+                        return files
+                return asyncio.run(_run())
+
+            with st.spinner("Searching for files..."):
+                files = run_deep_search(url, valid_ext_list, max_sublinks,
+                                        sublink_timeout, use_proxy, proxy)
+
+            if files:
+                st.session_state.discovered_files = files
+                st.session_state.current_url = url
+                st.success(f"Found {len(files)} files!")
+            else:
+                st.warning("No files found.")
 
+    # Add footer with attribution
+    st.markdown('---')
+    st.markdown('Created by [Euler314](https://github.com/euler314)')
 
 if __name__ == "__main__":
     main()