import streamlit as st

# st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Advanced File Downloader", layout="wide")

import os
import subprocess
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import asyncio
import logging
from urllib.parse import urlparse
import re
from pathlib import Path
from io import BytesIO
import random
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import zipfile
import tempfile
import mimetypes
import requests
import datetime
import spacy
import spacy.cli
from spacy.language import Language
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.http
import google.auth.transport.requests
import async_timeout

logging.basicConfig(
    filename='advanced_download_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

GOOGLE_OAUTH_CONFIG = {
    "web": {
        "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
        "project_id": "huggingface-449214",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_secret": "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",
        "redirect_uris": ["https://euler314-craw-web.hf.space/"]
    }
}


def install_playwright_dependencies():
    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
    os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
    try:
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        packages = [
            'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
            'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
        ]
        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)

        os.makedirs('/usr/lib/playwright', exist_ok=True)
        symlinks = {
            'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
            'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
            'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
            'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
            'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
            'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
            'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
            'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
            'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
            'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
        }
        for link_name, target in symlinks.items():
            link_path = os.path.join('/usr/lib/playwright', link_name)
            if not os.path.exists(link_path):
                os.symlink(target, link_path)

        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
        browser_path = os.path.expanduser("~/.cache/ms-playwright")
        os.makedirs(browser_path, exist_ok=True)
        subprocess.run(['chmod', '-R', '755', browser_path], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error installing dependencies: {e}")
    except Exception as e:
        print(f"Error: {e}")


install_playwright_dependencies()


@st.cache_resource
def load_models():
    try:
        # spaCy model
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
            st.info("Downloading spaCy model...")
            spacy.cli.download("en_core_web_sm")
            nlp = spacy.load("en_core_web_sm")

        # Sentence embedding model
        try:
            from sentence_transformers import SentenceTransformer
            model_name = 'all-MiniLM-L6-v2'
            cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
            if os.path.exists(os.path.join(cache_dir, model_name)):
                semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
            else:
                st.warning(f"Downloading SentenceTransformer model {model_name}...")
                semantic_model = SentenceTransformer(model_name)
        except Exception as e:
            st.error(f"Error loading SentenceTransformer: {e}")
            semantic_model = None

        # Summarization model
        try:
            from transformers import pipeline
            model_name = "facebook/bart-large-cnn"
            cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
            if os.path.exists(os.path.join(cache_dir, model_name)):
                summarizer = pipeline("summarization", model=model_name)
            else:
                st.warning(f"Downloading Transformer model {model_name}...")
                summarizer = pipeline("summarization", model=model_name)
        except Exception as e:
            st.error(f"Error loading Transformers: {e}")
            summarizer = None

        return nlp, semantic_model, summarizer

    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None, None


with st.spinner("Loading models..."):
    nlp_model, semantic_model, summarizer = load_models()


def get_random_user_agent():
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
    ]
    return random.choice(USER_AGENTS)


def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Y{suffix}"


def create_zip_file(file_paths, output_dir):
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file_path in file_paths:
            zipf.write(file_path, os.path.basename(file_path))
    return zip_path


def get_google_auth_url():
    client_config = GOOGLE_OAUTH_CONFIG["web"]
    flow = google_auth_oauthlib.flow.Flow.from_client_config(
        {"web": client_config},
        scopes=["https://www.googleapis.com/auth/drive.file"]
    )
    flow.redirect_uri = client_config["redirect_uris"][0]
    authorization_url, _ = flow.authorization_url(
        access_type="offline",
        include_granted_scopes="true",
        prompt="consent"
    )
    return authorization_url


def exchange_code_for_credentials(auth_code):
    if not auth_code.strip():
        return None, "No code provided."
    try:
        client_config = GOOGLE_OAUTH_CONFIG["web"]
        flow = google_auth_oauthlib.flow.Flow.from_client_config(
            {"web": client_config},
            scopes=["https://www.googleapis.com/auth/drive.file"]
        )
        flow.redirect_uri = client_config["redirect_uris"][0]
        flow.fetch_token(code=auth_code.strip())
        creds = flow.credentials
        if not creds or not creds.valid:
            return None, "Could not validate credentials. Check code and try again."
        return creds, "Google Sign-In successful!"
    except Exception as e:
        return None, f"Error during token exchange: {e}"


def google_drive_upload(zip_path: str, credentials):
    try:
        drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
        file_metadata = {'name': os.path.basename(zip_path)}
        media = googleapiclient.http.MediaFileUpload(zip_path, resumable=True)
        created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        return created.get("id", "")
    except Exception as e:
        return f"Error uploading to Drive: {str(e)}"


class DownloadManager:
    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
        self.use_proxy = use_proxy
        self.proxy = proxy
        self.query = query
        self.num_results = num_results
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.base_domains = set()

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        opts = {
            "headless": True,
            "args": [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--no-zygote',
                '--single-process'
            ]
        }
        if self.use_proxy and self.proxy:
            opts["proxy"] = {"server": self.proxy}

        self.browser = await self.playwright.chromium.launch(**opts)
        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
        self.page = await self.context.new_page()
        await self.page.set_extra_http_headers({
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.bing.com/'
        })
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    def get_base_domain(self, url):
        """Extract base domain and add variations to self.base_domains"""
        parsed = urlparse(url)
        domain = parsed.netloc.split(':')[0]

        base_parts = domain.split('.')
        if len(base_parts) > 2:
            main_domain = '.'.join(base_parts[-2:])
            self.base_domains.add(main_domain)
            self.base_domains.add(domain)
            if base_parts[0] == 'www':
                self.base_domains.add('.'.join(base_parts[1:]))
            else:
                self.base_domains.add(f"www.{domain}")
        else:
            self.base_domains.add(domain)

        return domain

    def is_related_domain(self, url):
        """Check if URL belongs to any of the known domain variations"""
        parsed = urlparse(url)
        domain = parsed.netloc.split(':')[0]

        parts = domain.split('.')
        for i in range(len(parts) - 1):
            check_domain = '.'.join(parts[i:])
            if check_domain in self.base_domains:
                return True
        return False

    async def get_real_url(self, url):
        """Follow redirects and get the final URL"""
        try:
            page = await self.context.new_page()
            try:
                response = await page.goto(url, wait_until='networkidle', timeout=30000)
                final_url = page.url

                # Handle meta-refresh redirects that page.goto does not follow
                content = await page.content()
                soup = BeautifulSoup(content, 'html.parser')
                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
                if meta_refresh:
                    content = meta_refresh.get('content', '')
                    if 'url=' in content.lower():
                        final_url = content.split('url=')[-1].strip("'").strip('"')

                return final_url, response.headers if response else {}
            finally:
                await page.close()
        except Exception as e:
            logger.error(f"Error getting real URL for {url}: {e}")
            return url, {}

    async def get_file_size(self, url):
        try:
            page = await self.context.new_page()
            try:
                response = await page.request.head(url, timeout=15000)
                # Playwright normalizes header names to lowercase
                length = response.headers.get('content-length', None)
                if length:
                    return sizeof_fmt(int(length))
                else:
                    return "Unknown Size"
            finally:
                await page.close()
        except Exception:
            return "Unknown Size"

    async def get_pdf_metadata(self, url):
        try:
            page = await self.context.new_page()
            try:
                resp = await page.request.get(url, timeout=15000)
                if resp.ok:
                    content = await resp.body()
                    pdf = BytesIO(content)
                    reader = PdfReader(pdf)
                    return {
                        'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
                        'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
                        'Pages': len(reader.pages),
                    }
                else:
                    return {}
            finally:
                await page.close()
        except Exception:
            return {}

    async def extract_downloadable_files(self, url, custom_ext_list):
        found_files = []
        try:
            # Resolve redirects and register the domain variations for this page
            final_url, headers = await self.get_real_url(url)
            self.get_base_domain(final_url)

            # If the URL itself serves a file, record it directly
            content_type = headers.get('content-type', '').lower()
            if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
                found_files.append({
                    'url': final_url,
                    'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
                    'size': await self.get_file_size(final_url),
                    'metadata': {}
                })
                return found_files

            await self.page.goto(final_url, timeout=30000, wait_until='networkidle')
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])

            parsed_base = urlparse(final_url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            # Collect candidate links from anchors and inline scripts
            links = set()
            for a in soup.find_all('a', href=True):
                links.add(a['href'])

            scripts = soup.find_all('script')
            for script in scripts:
                if script.string:
                    urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
                    links.update(urls)

            for href in links:
                href = href.strip()

                if not href or href.startswith(('javascript:', '#', 'mailto:')):
                    continue

                # Script-style download links need to be resolved before inspecting them
                if '.php' in href.lower() or 'download' in href.lower() or 'visit' in href.lower():
                    try:
                        if not href.startswith(('http://', 'https://')):
                            if href.startswith('/'):
                                href = base_url + href
                            else:
                                href = base_url + '/' + href

                        real_url, real_headers = await self.get_real_url(href)

                        content_type = real_headers.get('content-type', '').lower()
                        if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
                            found_files.append({
                                'url': real_url,
                                'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
                                'size': await self.get_file_size(real_url),
                                'metadata': {}
                            })
                    except Exception as e:
                        logger.error(f"Error processing PHP/script link {href}: {e}")
                        continue

                # Direct links that end in a known file extension
                elif any(href.lower().endswith(ext) for ext in all_exts):
                    if not href.startswith(('http://', 'https://')):
                        if href.startswith('/'):
                            href = base_url + href
                        else:
                            href = base_url + '/' + href

                    if self.is_related_domain(href):
                        size_str = await self.get_file_size(href)
                        meta = {}
                        if href.lower().endswith('.pdf'):
                            meta = await self.get_pdf_metadata(href)

                        found_files.append({
                            'url': href,
                            'filename': os.path.basename(href.split('?')[0]),
                            'size': size_str,
                            'metadata': meta
                        })

            # De-duplicate by URL
            seen_urls = set()
            unique_files = []
            for f in found_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)

            return unique_files

        except Exception as e:
            logger.error(f"Error extracting files from {url}: {e}")
            return []

    async def download_file(self, file_info, save_dir, referer):
        file_url = file_info['url']
        fname = file_info['filename']
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
        counter = 1
        while os.path.exists(path):
            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
            counter += 1

        os.makedirs(save_dir, exist_ok=True)

        try:
            # Resolve redirects before downloading
            real_url, _ = await self.get_real_url(file_url)

            if "drive.google.com" in real_url:
                import gdown
                try:
                    st.write(f"Downloading from Google Drive: {fname}")
                    output = gdown.download(real_url, path, quiet=False)
                    if output:
                        return path
                    return None
                except Exception as e:
                    logger.error(f"Google Drive download error: {e}")
                    return None

            page = await self.context.new_page()
            try:
                st.write(f"Downloading: {fname}")

                headers = {
                    'Accept': '*/*',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Referer': referer
                }

                response = await page.request.get(real_url, headers=headers, timeout=30000)

                if response.status == 200:
                    content = await response.body()
                    with open(path, 'wb') as f:
                        f.write(content)
                    return path
                else:
                    logger.error(f"Download failed with status {response.status}: {real_url}")
                    return None
            finally:
                await page.close()

        except Exception as e:
            logger.error(f"Error downloading {file_url}: {e}")
            return None

    async def get_sublinks(self, url, limit=100):
        try:
            real_url, _ = await self.get_real_url(url)
            await self.page.goto(real_url, timeout=30000)
            await self.page.wait_for_load_state('networkidle')

            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            parsed_base = urlparse(real_url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
            current_path = os.path.dirname(parsed_base.path)

            links = set()

            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if href and not href.startswith(('javascript:', '#', 'mailto:')):
                    links.add(href)

            scripts = soup.find_all('script')
            for script in scripts:
                if script.string:
                    urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
                    links.update(urls)

            forms = soup.find_all('form', action=True)
            for form in forms:
                links.add(form['action'])

            clean_links = set()
            for href in links:
                try:
                    if not href.strip():
                        continue

                    if href.startswith('http'):
                        full_url = href
                    elif href.startswith('//'):
                        full_url = parsed_base.scheme + ':' + href
                    elif href.startswith('/'):
                        full_url = base_url + href
                    else:
                        if current_path and current_path != '/':
                            full_url = base_url + current_path + '/' + href
                        else:
                            full_url = base_url + '/' + href

                    full_url = full_url.split('#')[0]

                    if self.is_related_domain(full_url):
                        clean_links.add(full_url)

                except Exception as e:
                    logger.error(f"Error processing link {href}: {e}")
                    continue

            sorted_links = sorted(list(clean_links))
            return sorted_links[:limit]

        except Exception as e:
            logger.error(f"Error getting sublinks: {e}")
            return []

    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
        if not custom_ext_list:
            custom_ext_list = []

        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()

        try:
            # Register the base domain variations and resolve the starting URL
            self.get_base_domain(url)
            real_url, _ = await self.get_real_url(url)

            progress_text.text("Analyzing main page...")
            main_files = await self.extract_downloadable_files(real_url, custom_ext_list)
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")

            progress_text.text("Getting sublinks...")
            sublinks = await self.get_sublinks(real_url, limit=sublink_limit)
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")

            if not sublinks:
                progress_bar.progress(1.0)
                return main_files

            all_files = main_files.copy()

            # Limit how many sublinks are processed concurrently
            sem = asyncio.Semaphore(10)

            async def process_sublink(sublink, index):
                async with sem:
                    try:
                        progress = index / total_links
                        progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
                        progress_bar.progress(progress)

                        async with async_timeout.timeout(timeout):
                            real_sublink, _ = await self.get_real_url(sublink)
                            sub_files = await self.extract_downloadable_files(real_sublink, custom_ext_list)

                            if sub_files:
                                logger.info(f"Found {len(sub_files)} files at {real_sublink}")
                                st.write(f"Found {len(sub_files)} files at {real_sublink}")

                            return sub_files
                    except asyncio.TimeoutError:
                        logger.warning(f"Timeout processing sublink: {sublink}")
                        return []
                    except Exception as e:
                        logger.error(f"Error processing sublink {sublink}: {e}")
                        return []

            tasks = [process_sublink(sublink, i + 1) for i, sublink in enumerate(sublinks)]
            sub_results = await asyncio.gather(*tasks)

            for sub_files in sub_results:
                all_files.extend(sub_files)
                file_count_text.text(f"Found {len(all_files)} total files")

            # De-duplicate by URL
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)

            final_count = len(unique_files)
            progress_text.text("Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)

            unique_files.sort(key=lambda x: x['filename'].lower())
            return unique_files

        except Exception as e:
            logger.error(f"Deep search error: {e}")
            progress_text.text(f"Error during deep search: {str(e)}")
            return []

        finally:
            # Give the user a moment to read the final status, then clear the widgets
            await asyncio.sleep(2)
            try:
                progress_text.empty()
                progress_bar.empty()
                file_count_text.empty()
            except Exception:
                pass

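    # NOTE: search_bing() is invoked from the "Bing Search" mode in main() but was not
    # defined anywhere in the original source. The method below is a minimal sketch to
    # make that call work; it assumes Bing's current organic-result markup
    # ("li.b_algo h2 a"), which may change, so treat the selector as an assumption.
    async def search_bing(self):
        """Search Bing for self.query and return up to self.num_results result URLs."""
        if not self.query:
            return []
        try:
            from urllib.parse import quote_plus  # local import keeps the sketch self-contained
            search_url = f"https://www.bing.com/search?q={quote_plus(self.query)}"
            await self.page.goto(search_url, timeout=30000)
            await self.page.wait_for_load_state('networkidle')

            # Pull the organic result links out of the results page
            hrefs = await self.page.eval_on_selector_all(
                "li.b_algo h2 a",
                "elements => elements.map(el => el.href)"
            )

            urls = []
            for href in hrefs:
                if href and href.startswith('http') and href not in urls:
                    urls.append(href)
                if len(urls) >= self.num_results:
                    break
            return urls
        except Exception as e:
            logger.error(f"Bing search error: {e}")
            return []
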
def main():
    if 'initialized' not in st.session_state:
        st.session_state.initialized = True
        st.session_state.discovered_files = []
        st.session_state.current_url = None
        st.session_state.google_creds = None
        st.session_state.selected_files = []

    st.title("Advanced File Downloader")

    with st.sidebar:
        mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")

        with st.expander("Advanced Options", expanded=True):
            custom_extensions = st.text_input(
                "Custom File Extensions",
                placeholder=".csv, .txt, .epub",
                key="custom_ext_input"
            )
            max_sublinks = st.number_input(
                "Maximum Sublinks to Process",
                min_value=1,
                max_value=10000,
                value=100,
                step=50,
                help="Maximum number of sublinks to process from the main page",
                key="max_sublinks_input"
            )
            sublink_timeout = st.number_input(
                "Search Timeout (seconds per sublink)",
                min_value=1,
                max_value=3000,
                value=30,
                step=5,
                help="Maximum time to spend searching each sublink",
                key="timeout_input"
            )
            use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
            proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")

        with st.expander("Google Drive Integration", expanded=False):
            if st.button("Start Google Sign-In", key="google_signin_btn"):
                auth_url = get_google_auth_url()
                st.markdown(f"[Click here to authorize]({auth_url})")

            auth_code = st.text_input("Enter authorization code", key="auth_code_input")
            if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
                creds, msg = exchange_code_for_credentials(auth_code)
                st.session_state.google_creds = creds
                st.write(msg)

    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")

        col1, col2 = st.columns([3, 1])
        with col1:
            if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
                if url:
                    async def run_deep_search():
                        try:
                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
                                files = await dm.deep_search(
                                    url=url,
                                    custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
                                    sublink_limit=int(max_sublinks),
                                    timeout=int(sublink_timeout)
                                )
                                if files:
                                    st.session_state.discovered_files = files
                                    st.session_state.current_url = url
                                return files
                        except Exception as e:
                            st.error(f"Error during deep search: {str(e)}")
                            return None

                    files = asyncio.run(run_deep_search())
                    if files:
                        st.success(f"Found {len(files)} files!")

                        col1, col2 = st.columns([1, 4])
                        with col1:
                            if st.button("Select All", key="select_all_btn"):
                                st.session_state.selected_files = list(range(len(files)))
                                st.experimental_rerun()
                            if st.button("Clear Selection", key="clear_selection_btn"):
                                st.session_state.selected_files = []
                                st.experimental_rerun()

                        selected_files = st.multiselect(
                            "Select files to download",
                            options=list(range(len(files))),
                            default=st.session_state.selected_files,
                            format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
                            key="file_multiselect"
                        )
                        st.session_state.selected_files = selected_files

                        if selected_files:
                            col1, col2, col3, col4 = st.columns(4)
                            with col1:
                                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
                            with col2:
                                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
                            with col3:
                                delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
                            with col4:
                                upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")

                            if st.button("Download Selected", key="download_btn"):
                                if not os.path.exists(download_dir):
                                    os.makedirs(download_dir)

                                async def download_files():
                                    downloaded_paths = []
                                    progress_bar = st.progress(0)
                                    status_text = st.empty()

                                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
                                        for i, idx in enumerate(selected_files):
                                            progress = (i + 1) / len(selected_files)
                                            file_info = files[idx]

                                            status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
                                            progress_bar.progress(progress)

                                            path = await dm.download_file(
                                                file_info,
                                                download_dir,
                                                url
                                            )
                                            if path:
                                                downloaded_paths.append(path)

                                    status_text.empty()
                                    progress_bar.empty()
                                    return downloaded_paths

                                downloaded = asyncio.run(download_files())

                                if downloaded:
                                    st.success(f"Successfully downloaded {len(downloaded)} files")

                                    if create_zip or upload_to_drive:
                                        zip_path = create_zip_file(downloaded, download_dir)
                                        st.success(f"Created ZIP file: {zip_path}")

                                        if upload_to_drive and st.session_state.get('google_creds'):
                                            with st.spinner("Uploading to Google Drive..."):
                                                drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
                                                if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
                                                    st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
                                                else:
                                                    st.error(drive_id)

                                        if delete_after:
                                            for path in downloaded:
                                                try:
                                                    os.remove(path)
                                                except Exception as e:
                                                    st.warning(f"Could not delete {path}: {e}")
                                            st.info("Deleted original files after ZIP creation")
                    else:
                        st.warning("No files found.")

            elif st.session_state.discovered_files:
                files = st.session_state.discovered_files
                st.success(f"Found {len(files)} files!")

                col1, col2 = st.columns([1, 4])
                with col1:
                    if st.button("Select All", key="select_all_btn2"):
                        st.session_state.selected_files = list(range(len(files)))
                        st.experimental_rerun()
                    if st.button("Clear Selection", key="clear_selection_btn2"):
                        st.session_state.selected_files = []
                        st.experimental_rerun()

                selected_files = st.multiselect(
                    "Select files to download",
                    options=list(range(len(files))),
                    default=st.session_state.selected_files,
                    format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
                    key="file_multiselect2"
                )
                st.session_state.selected_files = selected_files

                if selected_files:
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input2")
                    with col2:
                        create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox2")
                    with col3:
                        delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox2")
                    with col4:
                        upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox2")

                    if st.button("Download Selected", key="download_btn2"):
                        if not os.path.exists(download_dir):
                            os.makedirs(download_dir)

                        async def download_files():
                            downloaded_paths = []
                            progress_bar = st.progress(0)
                            status_text = st.empty()

                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
                                for i, idx in enumerate(selected_files):
                                    progress = (i + 1) / len(selected_files)
                                    file_info = files[idx]

                                    status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
                                    progress_bar.progress(progress)

                                    path = await dm.download_file(
                                        file_info,
                                        download_dir,
                                        st.session_state.current_url
                                    )
                                    if path:
                                        downloaded_paths.append(path)

                            status_text.empty()
                            progress_bar.empty()
                            return downloaded_paths

                        downloaded = asyncio.run(download_files())

                        if downloaded:
                            st.success(f"Successfully downloaded {len(downloaded)} files")

                            if create_zip or upload_to_drive:
                                zip_path = create_zip_file(downloaded, download_dir)
                                st.success(f"Created ZIP file: {zip_path}")

                                if upload_to_drive and st.session_state.get('google_creds'):
                                    with st.spinner("Uploading to Google Drive..."):
                                        drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
                                        if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
                                            st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
                                        else:
                                            st.error(drive_id)

                                if delete_after:
                                    for path in downloaded:
                                        try:
                                            os.remove(path)
                                        except Exception as e:
                                            st.warning(f"Could not delete {path}: {e}")
                                    st.info("Deleted original files after ZIP creation")

    elif mode == "Bing Search":
        st.header("Bing Search Mode")
        query = st.text_input("Enter search query", key="search_query_input")
        num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")

        if st.button("Search", key="search_btn"):
            if query:
                async def run_search():
                    async with DownloadManager(
                        use_proxy=use_proxy,
                        proxy=proxy,
                        query=query,
                        num_results=num_results
                    ) as dm:
                        with st.spinner("Searching..."):
                            urls = await dm.search_bing()
                            if urls:
                                st.success(f"Found {len(urls)} results!")
                                for i, url in enumerate(urls, 1):
                                    with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
                                        if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
                                            files = await dm.deep_search(
                                                url=url,
                                                custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
                                                sublink_limit=int(max_sublinks),
                                                timeout=int(sublink_timeout)
                                            )
                                            if files:
                                                st.session_state.discovered_files = files
                                                st.session_state.current_url = url
                                                st.session_state.selected_files = []
                                                st.experimental_rerun()
                                            else:
                                                st.warning("No files found on this page.")
                            else:
                                st.warning("No search results found.")

                asyncio.run(run_search())

    else:
        if summarizer is None:
            st.error("PDF summarization is not available due to model loading errors.")
        else:
            st.header("PDF Summarizer")
            pdf_url = st.text_input("Enter PDF URL", key="pdf_url_input")

            if st.button("Summarize", key="summarize_btn"):
                if pdf_url:
                    with st.spinner("Generating summary..."):
                        try:
                            response = requests.get(pdf_url, stream=True)
                            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
                            with open(temp_pdf.name, "wb") as f:
                                f.write(response.content)
                            reader = PdfReader(temp_pdf.name)
                            text = " ".join([page.extract_text() or "" for page in reader.pages])
                            os.remove(temp_pdf.name)

                            limited_text = text[:3000]
                            summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)

                            st.write("Summary:")
                            st.write(summary[0]['summary_text'])
                        except Exception as e:
                            st.error(f"Error summarizing PDF: {e}")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        logger.error(f"Application error: {str(e)}", exc_info=True)