Update app.py
app.py (CHANGED)

Old version of the changed sections (lines removed by this commit are prefixed with "-"):
@@ -7,7 +7,7 @@ import subprocess
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 import asyncio
 import logging
-from urllib.parse import urlparse
 import re
 from pathlib import Path
 from io import BytesIO
@@ -19,97 +19,21 @@ import tempfile
 import mimetypes
 import requests
 import datetime
-import spacy
-import spacy.cli
-from spacy.language import Language
-import google_auth_oauthlib.flow
-import googleapiclient.discovery
-import google.auth.transport.requests
-from async_timeout import timeout as async_timeout
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-from transformers import pipeline
-import schedule
-import threading
-import time
-import hashlib
-from reportlab.lib.pagesizes import letter
-from reportlab.pdfgen import canvas
-from sklearn.cluster import KMeans
-import numpy as np
 import base64
 import shutil
-from PIL import Image
 from reportlab.pdfgen import canvas
 
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
-    filename='advanced_download_log.txt',
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 
-GOOGLE_OAUTH_CONFIG = {
-    "web": {
-        "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
-        "project_id": "huggingface-449214",
-        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-        "token_uri": "https://oauth2.googleapis.com/token",
-        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-        "client_secret": "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",
-        "redirect_uris": ["https://euler314-craw-web.hf.space/"]
-    }
-}
-
-# Playwright Setup
-def install_playwright_dependencies():
-    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
-    subprocess.run(['apt-get', 'update', '-y'], check=True)
-    packages = [
-        'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
-        'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
-        'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
-    ]
-    subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
-    subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
-
-install_playwright_dependencies()
-
-# Model Loading
-@st.cache_resource
-def load_models():
-    try:
-        # Load spaCy model
-        try:
-            nlp = spacy.load("en_core_web_sm")
-        except OSError:
-            st.info("Downloading spaCy model...")
-            spacy.cli.download("en_core_web_sm")
-            nlp = spacy.load("en_core_web_sm")
-
-        # Load SentenceTransformer
-        try:
-            semantic_model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat')
-        except Exception as e:
-            st.error(f"Error loading SentenceTransformer: {e}")
-            semantic_model = None
-
-        # Load Transformers pipeline
-        try:
-            summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-        except Exception as e:
-            st.error(f"Error loading Transformers: {e}")
-            summarizer = None
-
-        return nlp, semantic_model, summarizer
-    except Exception as e:
-        st.error(f"Error loading models: {e}")
-        return None, None, None
-
-nlp_model, semantic_model, summarizer = load_models()
-
-# Utility Functions
 def get_random_user_agent():
     USER_AGENTS = [
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
@@ -134,57 +58,25 @@ def create_zip_file(file_paths, output_dir):
         zipf.write(file_path, os.path.basename(file_path))
     return zip_path
 
-#
-def get_google_auth_url():
-    client_config = GOOGLE_OAUTH_CONFIG["web"]
-    flow = google_auth_oauthlib.flow.Flow.from_client_config(
-        {"web": client_config},
-        scopes=["https://www.googleapis.com/auth/drive.file"]
-    )
-    flow.redirect_uri = client_config["redirect_uris"][0]
-    authorization_url, _ = flow.authorization_url(
-        access_type="offline",
-        include_granted_scopes="true",
-        prompt="consent"
-    )
-    return authorization_url
-
-def exchange_code_for_credentials(auth_code):
-    if not auth_code.strip():
-        return None, "No code provided."
-    try:
-        client_config = GOOGLE_OAUTH_CONFIG["web"]
-        flow = google_auth_oauthlib.flow.Flow.from_client_config(
-            {"web": client_config},
-            scopes=["https://www.googleapis.com/auth/drive.file"]
-        )
-        flow.redirect_uri = client_config["redirect_uris"][0]
-        flow.fetch_token(code=auth_code.strip())
-        creds = flow.credentials
-        if not creds or not creds.valid:
-            return None, "Could not validate credentials. Check code and try again."
-        return creds, "Google Sign-In successful!"
-    except Exception as e:
-        return None, f"Error during token exchange: {e}"
-
-def google_drive_upload(file_path, credentials, folder_id=None):
     try:
-        … (7 removed lines not shown)
     except Exception as e:
-        … (1 removed line not shown)
 
-… (1 removed line not shown)
-    folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
-    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
-    return folder.get('id')
-
-# DownloadManager Class
 class DownloadManager:
     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
         self.use_proxy = use_proxy
@@ -291,10 +183,6 @@ class DownloadManager:
         links = set()
 
         # Use requests for a faster initial scan
-        import requests
-        from bs4 import BeautifulSoup
-        from urllib.parse import urljoin, urlparse
-
         headers = {"User-Agent": get_random_user_agent()}
         response = requests.get(url, headers=headers, timeout=30)
 
@@ -398,7 +286,6 @@ class DownloadManager:
         # If filename is URL encoded (common with Chinese/international sites)
         if '%' in filename:
             try:
-                from urllib.parse import unquote
                 filename = unquote(filename)
             except Exception:
                 pass
@@ -735,12 +622,9 @@ class DownloadManager:
             await page.screenshot(path=screenshot_path)
 
             # Convert to PDF
-            from PIL import Image
-            from reportlab.pdfgen import canvas as pdf_canvas
-
             img = Image.open(screenshot_path)
             width, height = img.size
-            c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
             c.drawImage(screenshot_path, 0, 0, width, height)
             c.save()
 
@@ -874,17 +758,13 @@ class DownloadManager:
             # Combine screenshots into PDF
             logger.info(f"Creating PDF from {len(screenshots)} captured pages")
 
-            from PIL import Image
-            from reportlab.lib.pagesizes import letter
-            from reportlab.pdfgen import canvas as pdf_canvas
-
             # Use the size of the first screenshot to set PDF dimensions
             if screenshots:
                 try:
                     img = Image.open(screenshots[0])
                     width, height = img.size
 
-                    c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
 
                     for screenshot in screenshots:
                         try:
@@ -1000,20 +880,7 @@ class DownloadManager:
 
         # Try standard approaches for non-view-only files
        try:
-            # Try with gdown
-            import gdown
-            output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
-            if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                with open(save_path, 'rb') as f:
-                    content = f.read(100)  # Read first 100 bytes
-                    if b'<!DOCTYPE html>' not in content:  # Check not HTML error page
-                        logger.info(f"Successfully downloaded with gdown: {url}")
-                        return True
-        except Exception as e:
-            logger.warning(f"gdown download failed: {e}")
-
-        # Try with requests and session cookies
-        try:
             session = requests.Session()
             session.headers.update({'User-Agent': get_random_user_agent()})
 
@@ -1322,9 +1189,6 @@ class DownloadManager:
                 screenshots.append(screenshot_path)
 
             # Combine screenshots into PDF
-            from PIL import Image
-            from reportlab.pdfgen import canvas
-
             c = canvas.Canvas(save_path)
             for screenshot in screenshots:
                 img = Image.open(screenshot)
@@ -1350,9 +1214,6 @@ class DownloadManager:
 
             # Convert to requested format if needed
             if file_type == 'pdf':
-                from PIL import Image
-                from reportlab.pdfgen import canvas
-
                 # Create PDF from screenshot
                 img = Image.open(screenshot_path)
                 width, height = img.size
@@ -1757,11 +1618,10 @@ class DownloadManager:
                     # Use a longer timeout for ASP.NET pages which can be slower
                     sub_timeout = timeout * 2 if is_aspnet else timeout
 
-                    # Extract files from sublink
-                    … (3 removed lines not shown)
-                    file_count_text.text(f"Found {len(all_files)} total files")
                 except Exception as e:
                     logger.warning(f"Error processing sublink {sublink}: {e}")
 
@@ -1789,54 +1649,34 @@ class DownloadManager:
         if not st.session_state.get('keep_progress', False):
             progress_text.empty()
             progress_bar.empty()
-# Utility Functions for New Features
-def extract_keywords(text, n=5):
-    doc = nlp_model(text)
-    keywords = [token.text for token in doc if token.is_alpha and not token.is_stop][:n]
-    return keywords
 
-… (1 removed line not shown)
-    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
-    result = sentiment_analyzer(text[:512])[0]
-    return result['label'], result['score']
-
-def get_file_hash(file_path):
-    hasher = hashlib.md5()
-    with open(file_path, 'rb') as f:
-        hasher.update(f.read())
-    return hasher.hexdigest()
-
-# Main Function
 def main():
-    … (1 removed line not shown)
         st.session_state.initialized = True
         st.session_state.discovered_files = []
         st.session_state.current_url = None
-        st.session_state.google_creds = None
         st.session_state.selected_files = []
         st.session_state.do_deep_search = False
         st.session_state.deep_search_url = None
         st.session_state.search_results = []
 
-    st.title("Advanced File Downloader")
-
     with st.sidebar:
-        mode = st.radio("Select Mode", ["Manual URL", "Bing Search"
         with st.expander("Advanced Options", expanded=True):
             custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
             max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
             sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
             use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
             proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
-        with st.expander("Google Drive Integration", expanded=False):
-            if st.button("Start Google Sign-In", key="google_signin_btn"):
-                auth_url = get_google_auth_url()
-                st.markdown(f"[Click here to authorize]({auth_url})")
-            auth_code = st.text_input("Enter authorization code", key="auth_code_input")
-            if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
-                creds, msg = exchange_code_for_credentials(auth_code)
-                st.session_state.google_creds = creds
-                st.write(msg)
 
     if mode == "Manual URL":
         st.header("Manual URL Mode")
@@ -1849,11 +1689,19 @@ def main():
             valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
             if custom_ext_list != valid_ext_list:
                 st.warning("Invalid extensions ignored. Use format like '.csv'.")
-            … (5 removed lines not shown)
             if files:
                 st.session_state.discovered_files = files
                 st.session_state.current_url = url
@@ -1863,78 +1711,112 @@ def main():
 
         if st.session_state.discovered_files:
             files = st.session_state.discovered_files
-            st.success(f"Found {len(files)} files!")
             col1, col2 = st.columns([1, 4])
             with col1:
                 if st.button("Select All", key="select_all_btn"):
                     st.session_state.selected_files = list(range(len(files)))
                 if st.button("Clear Selection", key="clear_selection_btn"):
                     st.session_state.selected_files = []
-            … (8 removed lines not shown)
-            with
-            … (3 removed lines not shown)
                 if st.button("Download Selected", key="download_btn"):
                     if not os.path.exists(download_dir):
                         os.makedirs(download_dir)
                     async def download_files():
                         downloaded_paths = []
                         progress_bar = st.progress(0)
                         status_text = st.empty()
                         async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                            for i, idx in enumerate(
-                                progress = (i + 1) / len(
                                 file_info = files[idx]
-                                status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(
                                 progress_bar.progress(progress)
                                 path = await dm.download_file(file_info, download_dir, url)
                                 if path:
                                     downloaded_paths.append(path)
                         status_text.empty()
                         progress_bar.empty()
                         return downloaded_paths
-                    … (1 removed line not shown)
                     if downloaded:
                         st.success(f"Successfully downloaded {len(downloaded)} files")
-                        … (1 removed line not shown)
                             zip_path = create_zip_file(downloaded, download_dir)
                             st.success(f"Created ZIP file: {zip_path}")
                             with open(zip_path, "rb") as f:
                                 zip_data = f.read()
-                            … (9 removed lines not shown)
-                        if delete_after:
-                            for path in downloaded:
-                                try:
-                                    os.remove(path)
-                                except Exception as e:
-                                    st.warning(f"Could not delete {path}: {e}")
-                            st.info("Deleted original files after ZIP creation")
                         else:
                             for path in downloaded:
                                 with open(path, "rb") as f:
                                     file_data = f.read()
-                            … (1 removed line not shown)
 
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
         query = st.text_input("Enter search query", key="search_query_input")
         num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
         if st.button("Search", key="search_btn"):
             if query:
                 async def run_search():
@@ -1944,6 +1826,8 @@ def main():
                     if urls:
                         st.session_state.search_results = urls
                         st.success(f"Found {len(urls)} results!")
                         for i, url in enumerate(urls, 1):
                             with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
                                 if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
@@ -1951,29 +1835,43 @@ def main():
                                     st.session_state.do_deep_search = True
                     else:
                         st.warning("No search results found.")
                 asyncio.run(run_search())
 
-        … (3 removed lines not shown)
-    else:
-        st.header("PDF Summarizer")
-        pdf_url = st.text_input("Enter PDF URL", key="pdf_url_input")
-        if st.button("Summarize", key="summarize_btn"):
-            if pdf_url:
-                with st.spinner("Generating summary..."):
-                    try:
-                        response = requests.get(pdf_url, stream=True)
-                        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
-                        with open(temp_pdf.name, "wb") as f:
-                            f.write(response.content)
-                        reader = PdfReader(temp_pdf.name)
-                        text = " ".join([page.extract_text() or "" for page in reader.pages])
-                        os.remove(temp_pdf.name)
-                        summary = summarizer(text[:3000], max_length=200, min_length=50, do_sample=False)
-                        st.write("Summary:", summary[0]['summary_text'])
-                    except Exception as e:
-                        st.error(f"Error summarizing PDF: {e}")
 
 if __name__ == "__main__":
     main()
New version of the changed sections (lines added by this commit are prefixed with "+"):

@@ -7,7 +7,7 @@ import subprocess
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 import asyncio
 import logging
+from urllib.parse import urlparse, urljoin, unquote
 import re
 from pathlib import Path
 from io import BytesIO
@@ -19,97 +19,21 @@ import tempfile
 import mimetypes
 import requests
 import datetime
+import traceback
 import base64
 import shutil
+from PIL import Image
+from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 
+# -------------------- Utility Functions --------------------
 def get_random_user_agent():
     USER_AGENTS = [
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
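
The consolidated import now pulls urlparse, urljoin, and unquote from urllib.parse at module level; the crawler uses them later to resolve relative links and decode percent-encoded filenames. As a quick illustration of those three standard-library helpers (the URLs below are made-up examples, not from the app):

from urllib.parse import urlparse, urljoin, unquote

base = "https://example.com/docs/index.html"
print(urlparse(base).netloc)                      # example.com
print(urljoin(base, "files/report%20final.pdf"))  # https://example.com/docs/files/report%20final.pdf
print(unquote("report%20final.pdf"))              # report final.pdf
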
@@ -134,57 +58,25 @@ def create_zip_file(file_paths, output_dir):
         zipf.write(file_path, os.path.basename(file_path))
     return zip_path
 
+# -------------------- Playwright Setup --------------------
+def install_playwright_dependencies():
+    try:
+        # Set environment variable for Playwright browsers path
+        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
+
+        # Install system dependencies
+        subprocess.run(['pip', 'install', 'playwright'], check=True)
+        subprocess.run(['playwright', 'install', 'chromium'], check=True)
+        subprocess.run(['playwright', 'install-deps', 'chromium'], check=True)
+
+        st.success("Playwright dependencies installed successfully!")
+    except Exception as e:
+        st.error(f"Error installing Playwright dependencies: {e}")
+        st.info("You may need to manually install dependencies. Check console for details.")
+        logger.error(f"Playwright setup error: {e}")
+        traceback.print_exc()
 
+# -------------------- Download Manager Class --------------------
 class DownloadManager:
     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
         self.use_proxy = use_proxy
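
install_playwright_dependencies() now shells out to pip and the playwright CLI instead of apt-get. A minimal way to confirm that the installed Chromium actually launches is to drive it once through the async API already imported at the top of app.py; this check is illustrative only and not part of the commit:

import asyncio
from playwright.async_api import async_playwright

async def check_chromium():
    # Launch the Chromium that `playwright install chromium` downloaded and load one page.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com")
        title = await page.title()
        await browser.close()
        return title

print(asyncio.run(check_chromium()))
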
@@ -291,10 +183,6 @@ class DownloadManager:
         links = set()
 
         # Use requests for a faster initial scan
         headers = {"User-Agent": get_random_user_agent()}
         response = requests.get(url, headers=headers, timeout=30)
 
@@ -398,7 +286,6 @@ class DownloadManager:
         # If filename is URL encoded (common with Chinese/international sites)
         if '%' in filename:
             try:
                 filename = unquote(filename)
             except Exception:
                 pass
@@ -735,12 +622,9 @@ class DownloadManager:
             await page.screenshot(path=screenshot_path)
 
             # Convert to PDF
             img = Image.open(screenshot_path)
             width, height = img.size
+            c = canvas.Canvas(save_path, pagesize=(width, height))
             c.drawImage(screenshot_path, 0, 0, width, height)
             c.save()
 
@@ -874,17 +758,13 @@ class DownloadManager:
             # Combine screenshots into PDF
             logger.info(f"Creating PDF from {len(screenshots)} captured pages")
 
             # Use the size of the first screenshot to set PDF dimensions
             if screenshots:
                 try:
                     img = Image.open(screenshots[0])
                     width, height = img.size
 
+                    c = canvas.Canvas(save_path, pagesize=(width, height))
 
                     for screenshot in screenshots:
                         try:
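
The screenshot-to-PDF hunks now rely on the module-level PIL and reportlab imports. The pattern is: read the image size, open a reportlab canvas whose page size matches it exactly, draw the image at the origin, and save. A standalone sketch of the same idea (the file names are placeholders):

from PIL import Image
from reportlab.pdfgen import canvas

def image_to_pdf(image_path: str, pdf_path: str) -> None:
    # One PDF page sized exactly to the source image, as in the hunks above.
    img = Image.open(image_path)
    width, height = img.size
    c = canvas.Canvas(pdf_path, pagesize=(width, height))
    c.drawImage(image_path, 0, 0, width, height)
    c.save()
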
@@ -1000,20 +880,7 @@ class DownloadManager:
 
         # Try standard approaches for non-view-only files
         try:
+            # Try with requests and session cookies
             session = requests.Session()
             session.headers.update({'User-Agent': get_random_user_agent()})
 
@@ -1322,9 +1189,6 @@ class DownloadManager:
                 screenshots.append(screenshot_path)
 
             # Combine screenshots into PDF
             c = canvas.Canvas(save_path)
             for screenshot in screenshots:
                 img = Image.open(screenshot)
@@ -1350,9 +1214,6 @@ class DownloadManager:
 
             # Convert to requested format if needed
             if file_type == 'pdf':
                 # Create PDF from screenshot
                 img = Image.open(screenshot_path)
                 width, height = img.size
@@ -1757,11 +1618,10 @@ class DownloadManager:
                     # Use a longer timeout for ASP.NET pages which can be slower
                     sub_timeout = timeout * 2 if is_aspnet else timeout
 
+                    # Extract files from sublink
+                    sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                    all_files.extend(sub_files)
+                    file_count_text.text(f"Found {len(all_files)} total files")
                 except Exception as e:
                     logger.warning(f"Error processing sublink {sublink}: {e}")
 
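
With the gdown path removed, the Google Drive download now goes straight to a plain requests.Session carrying a randomized User-Agent. A minimal sketch of that session pattern (the URL, header value, and output path are placeholders):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (placeholder UA)'})

# Stream the response to disk in chunks rather than loading it all into memory.
response = session.get("https://example.com/file.bin", timeout=30, stream=True)
response.raise_for_status()
with open("file.bin", "wb") as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)
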
@@ -1789,54 +1649,34 @@ class DownloadManager:
         if not st.session_state.get('keep_progress', False):
             progress_text.empty()
             progress_bar.empty()
 
+# -------------------- Main App --------------------
 def main():
+    st.title("Advanced File Downloader")
+
+    # Initialize playwright if needed
+    if "playwright_installed" not in st.session_state:
+        with st.spinner("Setting up browser automation. This may take a minute..."):
+            install_playwright_dependencies()
+            st.session_state.playwright_installed = True
+
+    if "initialized" not in st.session_state:
         st.session_state.initialized = True
         st.session_state.discovered_files = []
         st.session_state.current_url = None
         st.session_state.selected_files = []
         st.session_state.do_deep_search = False
         st.session_state.deep_search_url = None
         st.session_state.search_results = []
 
     with st.sidebar:
+        mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select")
         with st.expander("Advanced Options", expanded=True):
             custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
             max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
             sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
             use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
             proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
 
     if mode == "Manual URL":
         st.header("Manual URL Mode")
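
main() now gates both the Playwright setup and the state initialization on st.session_state, so each runs once per browser session rather than on every Streamlit rerun. A reduced sketch of that guard pattern (the state keys here are illustrative, not the app's full set):

import streamlit as st

if "initialized" not in st.session_state:
    # Runs only on the first script execution of this session.
    st.session_state.initialized = True
    st.session_state.discovered_files = []

st.write(f"{len(st.session_state.discovered_files)} files discovered so far")
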
@@ -1849,11 +1689,19 @@ def main():
             valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
             if custom_ext_list != valid_ext_list:
                 st.warning("Invalid extensions ignored. Use format like '.csv'.")
+
+            @st.cache_resource
+            def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
+                async def _run():
+                    async with DownloadManager(use_proxy=use_proxy_val, proxy=proxy_val) as dm:
+                        files = await dm.deep_search(url, ext_list, max_links, timeout_val)
+                        return files
+                return asyncio.run(_run())
+
+            with st.spinner("Searching for files..."):
+                files = run_deep_search(url, valid_ext_list, max_sublinks,
+                                        sublink_timeout, use_proxy, proxy)
+
             if files:
                 st.session_state.discovered_files = files
                 st.session_state.current_url = url
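
run_deep_search wraps the async DownloadManager crawl in asyncio.run and hands the whole function to st.cache_resource, so reruns with identical arguments reuse the earlier result instead of re-crawling. A minimal sketch of that sync-wrapper-around-async pattern (the coroutine body is a stand-in, not the real crawl):

import asyncio
import streamlit as st

@st.cache_resource
def cached_crawl(url: str):
    # Run an async task synchronously; Streamlit caches the return value per argument set.
    async def _run():
        await asyncio.sleep(0.1)  # stand-in for the real async crawl
        return [f"{url}/file1.pdf", f"{url}/file2.pdf"]
    return asyncio.run(_run())

st.write(cached_crawl("https://example.com"))

Because st.cache_resource keys the cache on the function arguments, changing the extension list, sublink limit, timeout, or proxy settings triggers a fresh crawl.
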
@@ -1863,78 +1711,112 @@ def main():
 
         if st.session_state.discovered_files:
             files = st.session_state.discovered_files
             col1, col2 = st.columns([1, 4])
             with col1:
                 if st.button("Select All", key="select_all_btn"):
                     st.session_state.selected_files = list(range(len(files)))
                 if st.button("Clear Selection", key="clear_selection_btn"):
                     st.session_state.selected_files = []
+
+            # Create a formatted display of files with metadata
+            file_options = []
+            for i, file in enumerate(files):
+                filename = file['filename']
+                size = file['size']
+                meta = file.get('metadata', {})
+
+                # Format display string with relevant metadata
+                if meta and 'Pages' in meta:
+                    file_info = f"{filename} ({size}) - {meta.get('Pages', '')} pages"
+                else:
+                    file_info = f"{filename} ({size})"
+
+                file_options.append((i, file_info))
+
+            selected_indices = st.multiselect(
+                "Select files to download",
+                options=[i for i, _ in file_options],
+                default=st.session_state.selected_files,
+                format_func=lambda i: next(info for idx, info in file_options if idx == i),
+                key="file_multiselect"
+            )
+
+            st.session_state.selected_files = selected_indices
+
+            if selected_indices:
+                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
+                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
+
                 if st.button("Download Selected", key="download_btn"):
                     if not os.path.exists(download_dir):
                         os.makedirs(download_dir)
+
                     async def download_files():
                         downloaded_paths = []
                         progress_bar = st.progress(0)
                         status_text = st.empty()
+
                         async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                            for i, idx in enumerate(selected_indices):
+                                progress = (i + 1) / len(selected_indices)
                                 file_info = files[idx]
+                                status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_indices)})")
                                 progress_bar.progress(progress)
+
                                 path = await dm.download_file(file_info, download_dir, url)
                                 if path:
                                     downloaded_paths.append(path)
+
                         status_text.empty()
                         progress_bar.empty()
                         return downloaded_paths
+
+                    with st.spinner("Downloading files..."):
+                        downloaded = asyncio.run(download_files())
+
                     if downloaded:
                         st.success(f"Successfully downloaded {len(downloaded)} files")
+
+                        # Create file downloads
+                        if create_zip and len(downloaded) > 1:
                             zip_path = create_zip_file(downloaded, download_dir)
                             st.success(f"Created ZIP file: {zip_path}")
+
+                            # Provide download link for the zip file
                             with open(zip_path, "rb") as f:
                                 zip_data = f.read()
+
+                            zip_filename = os.path.basename(zip_path)
+                            st.download_button(
+                                label="Download ZIP",
+                                data=zip_data,
+                                file_name=zip_filename,
+                                mime="application/zip",
+                                key="download_zip_btn"
+                            )
                         else:
+                            # Provide individual file downloads
+                            st.write("Download files individually:")
                             for path in downloaded:
                                 with open(path, "rb") as f:
                                     file_data = f.read()
+
+                                file_name = os.path.basename(path)
+                                mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream"
+
+                                st.download_button(
+                                    label=f"Download {file_name}",
+                                    data=file_data,
+                                    file_name=file_name,
+                                    mime=mime_type,
+                                    key=f"download_file_{path}"
+                                )
 
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
         query = st.text_input("Enter search query", key="search_query_input")
         num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
+
         if st.button("Search", key="search_btn"):
             if query:
                 async def run_search():
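
The new download UI reads each file (or the ZIP) into memory and serves it through st.download_button, which takes raw bytes, a file name, and a MIME type. A self-contained sketch of that call (the payload and labels are placeholders):

import streamlit as st

data = b"hello from the downloader"  # placeholder payload instead of a real file read
st.download_button(
    label="Download example.txt",
    data=data,
    file_name="example.txt",
    mime="text/plain",
    key="example_download_btn",
)
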
@@ -1944,6 +1826,8 @@ def main():
                     if urls:
                         st.session_state.search_results = urls
                         st.success(f"Found {len(urls)} results!")
+
+                        # Create expanders for each result
                         for i, url in enumerate(urls, 1):
                             with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
                                 if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
@@ -1951,29 +1835,43 @@ def main():
                                     st.session_state.do_deep_search = True
                     else:
                         st.warning("No search results found.")
+
                 asyncio.run(run_search())
+
+        # Handle deep search based on search results
+        if st.session_state.do_deep_search and st.session_state.deep_search_url:
+            url = st.session_state.deep_search_url
+            st.info(f"Deep searching: {url}")
+
+            # Reset the flag to avoid re-running
+            st.session_state.do_deep_search = False
+
+            # Set up custom extensions
+            custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+            valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
+
+            @st.cache_resource
+            def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val):
+                async def _run():
+                    async with DownloadManager(use_proxy=use_proxy_val, proxy=proxy_val) as dm:
+                        files = await dm.deep_search(url, ext_list, max_links, timeout_val)
+                        return files
+                return asyncio.run(_run())
+
+            with st.spinner("Searching for files..."):
+                files = run_deep_search(url, valid_ext_list, max_sublinks,
+                                        sublink_timeout, use_proxy, proxy)
+
+            if files:
+                st.session_state.discovered_files = files
+                st.session_state.current_url = url
+                st.success(f"Found {len(files)} files!")
+            else:
+                st.warning("No files found.")
 
+    # Add footer with attribution
+    st.markdown('---')
+    st.markdown('Created by [Euler314](https://github.com/euler314)')
 
 if __name__ == "__main__":
     main()