Update app.py
app.py
CHANGED
@@ -1,3 +1,4 @@
 import streamlit as st
 import os
 import asyncio
@@ -21,31 +22,32 @@ from PIL import Image
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas

-# Advanced imports
-from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
-from bs4 import BeautifulSoup
-from PyPDF2 import PdfReader
-import google_auth_oauthlib.flow
-import googleapiclient.discovery
-import google.auth.transport.requests
-import googleapiclient.http
 import requests
-import
-from
-
-
-
-from

 # Configure page and logging
 st.set_page_config(page_title="Advanced File Downloader", layout="wide")
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

-#
-celery_app = Celery('file_downloader', broker='redis://localhost:6379/0')
-
-# Configure Google OAuth
 GOOGLE_OAUTH_CONFIG = {
     "web": {
         "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
@@ -58,7 +60,7 @@ GOOGLE_OAUTH_CONFIG = {
     }
 }

-#
 USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
@@ -66,14 +68,9 @@ USER_AGENTS = [
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
     'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
-    'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
 ]

-#
-PROXY_POOL = []
-CURRENT_PROXY_INDEX = 0
-
-# -------------------- Network Interception Configuration --------------------
 NETWORK_INTERCEPTOR_CONFIG = {
     "enabled": False,
     "intercept_types": ["xhr", "fetch", "document", "media"],
@@ -81,7 +78,7 @@ NETWORK_INTERCEPTOR_CONFIG = {
     "intercept_folder": "./intercepted_data"
 }

-#
 def get_random_user_agent():
     return random.choice(USER_AGENTS)

@@ -117,8 +114,11 @@ def is_valid_file_url(url, extensions):
     """Check if URL is a valid file URL based on extension"""
     return any(url.lower().endswith(ext) for ext in extensions)

-#
 def get_google_auth_url():
     client_config = GOOGLE_OAUTH_CONFIG["web"]
     flow = google_auth_oauthlib.flow.Flow.from_client_config(
         {"web": client_config},
@@ -133,6 +133,9 @@ def get_google_auth_url():
     return authorization_url

 def exchange_code_for_credentials(auth_code):
     if not auth_code.strip():
         return None, "No code provided."
     try:
@@ -151,6 +154,9 @@ def exchange_code_for_credentials(auth_code):
         return None, f"Error during token exchange: {e}"

 def google_drive_upload(file_path, credentials, folder_id=None):
     try:
         drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
         file_metadata = {'name': os.path.basename(file_path)}
@@ -163,164 +169,59 @@ def google_drive_upload(file_path, credentials, folder_id=None):
        return f"Error uploading to Drive: {str(e)}"

def create_drive_folder(drive_service, name):
    folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
    return folder.get('id')

-#
-def
-    """Install required system dependencies"""
    try:
        # Install system dependencies
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        packages = [
            'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
            'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
-            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
-            'redis-server', 'python3-dev', 'build-essential'
        ]
        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)

-        # Install
-        subprocess.run(['
-
-        # Install browsers
-        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
-        subprocess.run(['python3', '-m', 'pyppeteer', 'install'], check=True)

        st.success("Dependencies installed successfully!")
        return True
    except Exception as e:
        st.error(f"Error installing dependencies: {e}")
-        st.info("You may need to manually install dependencies.
        logger.error(f"Setup error: {e}")
        traceback.print_exc()
        return False

-
-
-
-        # Check Redis for Celery
-        redis_running = subprocess.run(['redis-cli', 'ping'], capture_output=True, text=True).stdout.strip() == 'PONG'
-        if not redis_running:
-            # Try to start Redis
-            subprocess.run(['service', 'redis-server', 'start'], check=True)
-
-        # Create directories for intercepted data
-        os.makedirs(NETWORK_INTERCEPTOR_CONFIG['intercept_folder'], exist_ok=True)
-
-        return True
-    except Exception as e:
-        logger.error(f"Service check error: {e}")
-        return False
-
-# -------------------- Network Interception Classes --------------------
-class NetworkInterceptor:
-    """Class to intercept network traffic using mitmproxy"""
-
-    def __init__(self, intercept_types=None, save_path=None):
-        self.intercept_types = intercept_types or ["xhr", "fetch", "document"]
-        self.save_path = save_path or "./intercepted_data"
-        os.makedirs(self.save_path, exist_ok=True)
-        self.captured_data = []
-
-    def intercept_request(self, flow):
-        """Process intercepted requests"""
-        try:
-            url = flow.request.url
-            method = flow.request.method
-            content_type = flow.request.headers.get("Content-Type", "")
-
-            # Log the request
-            self.captured_data.append({
-                "type": "request",
-                "url": url,
-                "method": method,
-                "headers": dict(flow.request.headers),
-                "timestamp": time.time()
-            })
-
-            logger.info(f"Intercepted {method} request to {url}")
-        except Exception as e:
-            logger.error(f"Error intercepting request: {e}")
-
-    def intercept_response(self, flow):
-        """Process intercepted responses"""
-        try:
-            url = flow.request.url
-            status_code = flow.response.status_code
-            content_type = flow.response.headers.get("Content-Type", "")
-
-            # Only process responses of interest based on content type
-            if any(t in content_type.lower() for t in ["application/pdf", "application/msword",
-                                                       "application/vnd.openxmlformats",
-                                                       "application/zip"]):
-                # Save the file
-                filename = os.path.basename(urlparse(url).path)
-                if not filename or filename == '/':
-                    filename = f"file_{int(time.time())}"
-
-                # Try to add extension based on content type
-                if "pdf" in content_type:
-                    filename += ".pdf"
-                elif "msword" in content_type:
-                    filename += ".doc"
-                elif "openxmlformats" in content_type and "wordprocessingml" in content_type:
-                    filename += ".docx"
-                elif "zip" in content_type:
-                    filename += ".zip"
-
-                file_path = os.path.join(self.save_path, filename)
-                with open(file_path, "wb") as f:
-                    f.write(flow.response.content)
-
-                logger.info(f"Saved intercepted file: {file_path}")
-
-                # Record metadata about the captured file
-                self.captured_data.append({
-                    "type": "file",
-                    "url": url,
-                    "content_type": content_type,
-                    "size": len(flow.response.content),
-                    "path": file_path,
-                    "timestamp": time.time()
-                })
-        except Exception as e:
-            logger.error(f"Error intercepting response: {e}")
-
-    def get_captured_files(self):
-        """Return list of captured files"""
-        return [item for item in self.captured_data if item["type"] == "file"]
-
-# -------------------- Browser Automation Classes --------------------
-class MultiEngineBrowser:
-    """Class that supports multiple browser engines (Playwright, Pyppeteer, Splash)"""
-
-    def __init__(self, engine="playwright", use_proxy=False, proxy=None, stealth=True):
-        self.engine = engine
        self.use_proxy = use_proxy
        self.proxy = proxy
-        self.
        self.browser = None
        self.context = None
        self.page = None
-
-    async def setup(self):
-        """Initialize browser based on selected engine"""
-        if self.engine == "playwright":
-            return await self.setup_playwright()
-        elif self.engine == "pyppeteer":
-            return await self.setup_pyppeteer()
-        elif self.engine == "splash":
-            return await self.setup_splash()
-        else:
-            raise ValueError(f"Unsupported browser engine: {self.engine}")
-
-    async def setup_playwright(self):
-        """Setup Playwright browser"""
-        from playwright.async_api import async_playwright

        self.playwright = await async_playwright().start()
        browser_args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
@@ -329,7 +230,7 @@ class MultiEngineBrowser:
            '--disable-features=IsolateOrigins,site-per-process',
        ]

-        if self.
            browser_args.extend([
                '--disable-blink-features=AutomationControlled',
                '--disable-features=IsolateOrigins'
@@ -343,8 +244,10 @@ class MultiEngineBrowser:
        if self.use_proxy and self.proxy:
            launch_options["proxy"] = {"server": self.proxy}

        self.browser = await self.playwright.chromium.launch(**launch_options)

        context_options = {
            "viewport": {"width": 1920, "height": 1080},
            "user_agent": get_random_user_agent(),
@@ -353,10 +256,10 @@ class MultiEngineBrowser:
            "accept_downloads": True
        }

        self.context = await self.browser.new_context(**context_options)

-
-        if self.stealth:
            await self.context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', { get: () => false });
                Object.defineProperty(navigator, 'plugins', {
@@ -366,221 +269,50 @@ class MultiEngineBrowser:
                window.chrome = { runtime: {} };
            """)

        self.page = await self.context.new_page()
-
-
-
-
-
-
-
-            '
-            '
-            '
-
-        ]
-
-        if self.stealth:
-            browser_args.extend([
-                '--disable-blink-features=AutomationControlled',
-                '--disable-features=IsolateOrigins'
-            ])
-
-        launch_options = {
-            "headless": True,
-            "args": browser_args,
-            "ignoreHTTPSErrors": True,
-            "userDataDir": tempfile.mkdtemp()
-        }
-
-        if self.use_proxy and self.proxy:
-            browser_args.append(f'--proxy-server={self.proxy}')
-
-        self.browser = await launch(launch_options)
-        self.page = await self.browser.newPage()
-
-        # Set user agent
-        await self.page.setUserAgent(get_random_user_agent())
-
-        # Set viewport
-        await self.page.setViewport({"width": 1920, "height": 1080})
-
-        # Apply stealth features
-        if self.stealth:
-            await self.page.evaluateOnNewDocument("""
-                Object.defineProperty(navigator, 'webdriver', { get: () => false });
-                Object.defineProperty(navigator, 'plugins', {
-                    get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
-                });
-                Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
-                window.chrome = { runtime: {} };
-            """)
-
-        return self.page
-
-    async def setup_splash(self):
-        """Setup Splash browser through API"""
-        # Splash is typically used via HTTP API
-        # We'll use requests for this
-        self.splash_url = "http://localhost:8050/render.html"
-        return None  # No actual page object for Splash
-
-    async def goto(self, url, wait_until=None, timeout=30000):
-        """Navigate to a URL"""
-        if self.engine == "playwright":
-            return await self.page.goto(url, wait_until=wait_until or 'networkidle', timeout=timeout)
-        elif self.engine == "pyppeteer":
-            return await self.page.goto(url, waitUntil=wait_until or 'networkidle0', timeout=timeout)
-        elif self.engine == "splash":
-            # Use Splash HTTP API
-            params = {
-                "url": url,
-                "wait": min(timeout/1000, 30),  # Splash uses seconds
-                "timeout": min(timeout/1000, 60),
-                "resource_timeout": min(timeout/1000, 30),
-                "html": 1,
-                "png": 0,
-                "render_all": 1
-            }
-
-            if self.use_proxy and self.proxy:
-                params["proxy"] = self.proxy
-
-            headers = {"User-Agent": get_random_user_agent()}
-            response = requests.get(self.splash_url, params=params, headers=headers)
-            self.last_html = response.text
-            return response
-
-    async def content(self):
-        """Get page content"""
-        if self.engine == "playwright":
-            return await self.page.content()
-        elif self.engine == "pyppeteer":
-            return await self.page.content()
-        elif self.engine == "splash":
-            return self.last_html
-
-    async def close(self):
-        """Close browser"""
-        if self.engine == "playwright":
-            if self.browser:
-                await self.browser.close()
-            if self.playwright:
-                await self.playwright.stop()
-        elif self.engine == "pyppeteer":
-            if self.browser:
-                await self.browser.close()
-        # No cleanup needed for Splash as it's stateless
-
-# -------------------- Download Manager Class --------------------
-class DownloadManager:
-    def __init__(self, browser_engine="playwright", use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
-        self.browser_engine = browser_engine
-        self.use_proxy = use_proxy
-        self.proxy = proxy
-        self.query = query
-        self.num_results = num_results
-        self.use_stealth = use_stealth
-        self.browser = None
-        self.network_interceptor = None
-
-        # Configure network interception if enabled
-        if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
-            self.network_interceptor = NetworkInterceptor(
-                intercept_types=NETWORK_INTERCEPTOR_CONFIG["intercept_types"],
-                save_path=NETWORK_INTERCEPTOR_CONFIG["intercept_folder"]
-            )
-
-    async def __aenter__(self):
-        # Initialize multi-engine browser
-        self.browser = MultiEngineBrowser(
-            engine=self.browser_engine,
-            use_proxy=self.use_proxy,
-            proxy=self.proxy,
-            stealth=self.use_stealth
-        )
-        self.page = await self.browser.setup()
-
-        # Set headers for better stealth
-        if self.browser_engine == "playwright":
-            await self.page.set_extra_http_headers({
-                'Accept-Language': 'en-US,en;q=0.9',
-                'Accept-Encoding': 'gzip, deflate, br',
-                'DNT': '1',
-                'Referer': 'https://www.google.com/',
-                'Sec-Fetch-Dest': 'document',
-                'Sec-Fetch-Mode': 'navigate',
-                'Sec-Fetch-Site': 'cross-site',
-                'Sec-Fetch-User': '?1',
-                'Upgrade-Insecure-Requests': '1'
-            })

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
-

-    async def
-        """Search
        urls = []
        try:
-
-
-
-                search_url = f"https://www.google.com/search?q={self.query}"
-            else:
-                raise ValueError(f"Unsupported search engine: {search_engine}")
-
-            await self.browser.goto(search_url, timeout=30000)

-
-
-
-
-
-
-                        urls.append(href)
-                elif search_engine == "google":
-                    links = await self.page.query_selector_all("div.g a[href^='http']")
-                    for link in links[:self.num_results]:
-                        href = await link.get_attribute('href')
-                        if href:
-                            urls.append(href)
-            elif self.browser_engine == "pyppeteer":
-                if search_engine == "bing":
-                    links = await self.page.querySelectorAll("li.b_algo h2 a")
-                    for link in links[:self.num_results]:
-                        href = await self.page.evaluate('el => el.getAttribute("href")', link)
-                        if href:
-                            urls.append(href)
-                elif search_engine == "google":
-                    links = await self.page.querySelectorAll("div.g a[href^='http']")
-                    for link in links[:self.num_results]:
-                        href = await self.page.evaluate('el => el.getAttribute("href")', link)
-                        if href:
-                            urls.append(href)
-            elif self.browser_engine == "splash":
-                # Parse the HTML with BeautifulSoup
-                soup = BeautifulSoup(self.browser.last_html, 'html.parser')
-                if search_engine == "bing":
-                    links = soup.select("li.b_algo h2 a")
-                    for link in links[:self.num_results]:
-                        href = link.get("href")
-                        if href:
-                            urls.append(href)
-                elif search_engine == "google":
-                    links = soup.select("div.g a[href^='http']")
-                    for link in links[:self.num_results]:
-                        href = link.get("href")
-                        if href:
-                            urls.append(href)

            return urls
        except Exception as e:
-            logger.error(f"Error searching
            return []

    async def get_file_size(self, url):
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.head(url, headers=headers, timeout=15)
@@ -593,6 +325,10 @@ class DownloadManager:
        return "Unknown Size"

    async def get_pdf_metadata(self, url):
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.get(url, headers=headers, timeout=15, stream=True)
@@ -610,6 +346,7 @@ class DownloadManager:
            return {}

    async def extract_real_download_url(self, url):
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
@@ -619,7 +356,7 @@ class DownloadManager:
            return url

    async def get_edu_exam_links(self, url):
-        """Specialized method for educational exam websites
        try:
            logger.info(f"Fetching exam links from {url}")
            links = set()
@@ -630,7 +367,7 @@ class DownloadManager:
            response = requests.get(url, headers=headers, timeout=30)

            if response.status_code == 200:
-                # Parse with BeautifulSoup
                soup = BeautifulSoup(response.text, "html.parser")
                parsed_base = urlparse(url)
                base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
@@ -640,26 +377,22 @@ class DownloadManager:
                    href = a["href"]
                    full_url = urljoin(url, href)

-                    #
                    link_text = a.get_text().lower()

-                    #
                    url_patterns = [
                        "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                        "/test/", "/download/", "/files/", "/assignments/",
-                        "paper_", "question_", "exam_", "test_", "past_"
-                        "assignment_", "sample_", "study_material", "notes_",
-                        "/resource/", "/subject/", "/course/", "/material/"
                    ]

                    text_patterns = [
                        "exam", "paper", "test", "question", "past", "download",
-                        "assignment", "sample", "study", "material", "notes"
-                        "subject", "course", "resource", "pdf", "document",
-                        "view", "open", "get", "solution", "answer"
                    ]

-                    # Check
                    if any(pattern in full_url.lower() for pattern in url_patterns) or \
                       any(pattern in link_text for pattern in text_patterns) or \
                       any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
@@ -667,48 +400,74 @@ class DownloadManager:
        except Exception as e:
            logger.warning(f"Request-based extraction failed: {e}")

-        # Use browser
        if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
            logger.info("Using browser for enhanced link extraction")

-            # Navigate to
-            await self.

-            # Get page content
-            content = await self.
            soup = BeautifulSoup(content, "html.parser")
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

-            #
            for a in soup.find_all("a", href=True):
                href = a["href"]
                full_url = urljoin(url, href)
                link_text = a.get_text().lower()

-                #
                url_patterns = [
                    "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                    "/test/", "/download/", "/files/", "/assignments/",
-                    "paper_", "question_", "exam_", "test_", "past_"
-                    "assignment_", "sample_", "study_material", "notes_",
-                    "/resource/", "/subject/", "/course/", "/material/"
                ]

                text_patterns = [
                    "exam", "paper", "test", "question", "past", "download",
-                    "assignment", "sample", "study", "material", "notes"
-                    "subject", "course", "resource", "pdf", "document",
-                    "view", "open", "get", "solution", "answer"
                ]

-                # Check URL and text patterns
                if any(pattern in full_url.lower() for pattern in url_patterns) or \
                   any(pattern in link_text for pattern in text_patterns) or \
                   any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                    links.add(full_url)

-        # Filter to likely exam documents
        filtered_links = []
        for link in links:
            # Common file extensions
@@ -719,8 +478,7 @@ class DownloadManager:
            # Common paths for exam documents
            if any(pattern in link.lower() for pattern in [
                "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
-                "/pastpapers/", "/questionpapers/", "/tests/"
-                "/resource/", "/material/", "/notes/", "/subjectmaterial/"
            ]):
                filtered_links.append(link)

@@ -732,6 +490,7 @@ class DownloadManager:
            return []

    async def extract_downloadable_files(self, url, custom_ext_list):
        found_files = []
        try:
            # Special handling for educational exam sites
@@ -765,7 +524,7 @@ class DownloadManager:

                    # Get metadata for PDFs
                    meta = {}
-                    if real_url.lower().endswith('.pdf'):
                        try:
                            meta = await self.get_pdf_metadata(real_url)
                        except Exception:
@@ -776,18 +535,18 @@ class DownloadManager:
                        'filename': filename,
                        'size': size_str,
                        'metadata': meta,
-                        'source_url': url  #
                    })

                # If we found exam files with the specialized method, return them
                if found_files:
                    return found_files

-            # Standard extraction method for
-            await self.

            # Get page content
-            content = await self.
            soup = BeautifulSoup(content, 'html.parser')

            # Define file extensions to look for
@@ -807,7 +566,7 @@ class DownloadManager:

                # Handle PHP and download links separately
                if '.php' in href.lower() or 'download' in href.lower():
-                    full_url = href if href.startswith('http') else urljoin(
                    real_url = await self.extract_real_download_url(full_url)
                    if real_url and real_url != full_url:
                        filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
|

                # Check for direct file extensions
                if any(href.lower().endswith(ext) for ext in all_exts):
-                    file_url = href if href.startswith('http') else urljoin(
                    size_str = await self.get_file_size(file_url)
                    meta = {}
-                    if file_url.lower().endswith('.pdf'):
                        meta = await self.get_pdf_metadata(file_url)
                    found_files.append({
                        'url': file_url,
|
                        break

                if file_id:
-                    # Determine if it's
                    is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))

                    filename = f"gdrive_{file_id}"
|
            for elem in soup.find_all(elem_tag):
                src = elem.get('src') or elem.get('data')
                if src and any(src.lower().endswith(ext) for ext in all_exts):
-                    file_url = src if src.startswith('http') else urljoin(
                    found_files.append({
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
|
            return []

    async def download_file(self, file_info, save_dir, referer=None):
-        """Download a file and
        file_url = file_info['url']
        fname = file_info['filename']
        referer = referer or file_info.get('source_url', 'https://www.google.com')

-        # Create unique filename
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
        counter = 1
|
        try:
            # Special handling for Google Drive files
            if "drive.google.com" in file_url or "docs.google.com" in file_url:
-                # For view-only Google Drive files
                is_view_only = file_info.get('metadata', {}).get('view_only', False)
                if is_view_only:
                    result_path = await self.download_viewonly_google_drive(file_info, path)
|
            return None

    async def download_viewonly_google_drive(self, file_info, save_path):
-        """Download view-only Google Drive documents"""
        try:
            # Extract file ID
            file_id = file_info.get('metadata', {}).get('file_id')
@@ -993,173 +752,147 @@ class DownloadManager:

            logger.info(f"Downloading view-only Google Drive file: {file_id}")

-            # Create a dedicated browser
-
-

-
-
-
-
-
-
-
-
-
-
-
-                ]
-            )
-
-
-
-
-
-
-
-
-
-
-            await
-                Object.defineProperty(navigator, 'webdriver', { get: () => false });
-                Object.defineProperty(navigator, 'plugins', {
-                    get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
-                });
-                Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
-                window.chrome = { runtime: {} };
-            """)

-

-
-
-
-

-            #
-            await page.

-
-            temp_dir = tempfile.mkdtemp()

-            #
-
-
-
-
-
-
-
-
-                const pageCounters = document.querySelectorAll('*');
-                for (const el of pageCounters) {
-                    const text = el.textContent || '';
-                    const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
-                    if (match && match[2]) {
-                        return parseInt(match[2]);
-                    }
-                }
-
-                // Look for paginated pages
-                const pages = document.querySelectorAll('.drive-viewer-paginated-page');
-                if (pages.length > 0) return pages.length;
-
-                // Default if we can't determine
-                return 20;
-            }
-            """)
-
-            logger.info(f"PDF has approximately {total_pages} pages")
-
-            # Take screenshots of each page
-            screenshots = []
-
-            # First try with the page element method
-            for i in range(min(total_pages, 100)):  # Limit to 100 pages for safety
-                try:
-                    # Navigate to specific page
-                    if i > 0:
-                        await page.evaluate(f"document.querySelector('.drive-viewer-paginated-page:nth-child({i+1})').scrollIntoView()")
-                        await page.wait_for_timeout(500)
-
-                    # Wait for the page to render
-                    await page.wait_for_timeout(500)
-
-                    # Take screenshot
-                    screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
-
-                    # Try to find the page element
-                    page_element = await page.query_selector(f'.drive-viewer-paginated-page:nth-child({i+1})')
-                    if page_element:
-                        await page_element.screenshot(path=screenshot_path)
                    else:
-
-                        await page.screenshot(path=screenshot_path)
-
-                    screenshots.append(screenshot_path)
-
-                    # Check if we should continue to next page
-                    if i < total_pages - 1:
-                        next_button = await page.query_selector('button[aria-label="Next page"]')
-                        if next_button:
-                            # Check if button is disabled
-                            is_disabled = await next_button.get_attribute('disabled')
-                            if is_disabled:
-                                logger.info(f"Reached last page at page {i+1}")
-                                break
-
-                            # Click next page
-                            await next_button.click()
-                            await page.wait_for_timeout(1000)
-                        else:
-                            logger.info("Next page button not found")
-                            break
-                except Exception as e:
-                    logger.error(f"Error capturing page {i+1}: {e}")
-                    continue
-
-            # Create PDF from screenshots
-            if screenshots:
-                # Get dimensions from first screenshot
-                first_img = Image.open(screenshots[0])
-                width, height = first_img.size
-
-                # Create PDF
-                c = canvas.Canvas(save_path, pagesize=(width, height))
-                for screenshot in screenshots:
-                    c.drawImage(screenshot, 0, 0, width, height)
-                    c.showPage()
-                c.save()

-                #
-
-                    os.remove(screenshot)

-                #
-

-
-
-                logger.error("
-
-
-
-

-            #
-

-            # Clean up
-            os.remove(screenshot_path)
            shutil.rmtree(temp_dir, ignore_errors=True)

            return save_path
-
-
-
-
-

            return None
        except Exception as e:
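For context, the view-only downloader removed in the hunk above stitched per-page screenshots into a PDF with Pillow and reportlab. A minimal, self-contained sketch of that assembly step, assuming a list of PNG paths (illustrative only, not the exact removed code):

    # Sketch: combine per-page screenshots into a single PDF.
    from PIL import Image
    from reportlab.pdfgen import canvas

    def screenshots_to_pdf(screenshot_paths, save_path):
        if not screenshot_paths:
            return None
        # Use the first page's pixel size as the PDF page size.
        width, height = Image.open(screenshot_paths[0]).size
        c = canvas.Canvas(save_path, pagesize=(width, height))
        for shot in screenshot_paths:
            c.drawImage(shot, 0, 0, width, height)  # one screenshot per PDF page
            c.showPage()
        c.save()
        return save_path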
@@ -1167,7 +900,7 @@ class DownloadManager:
            return None

    async def get_sublinks(self, url, limit=10000):
-        """Extract all sublinks from a
        links = set()
        try:
            logger.info(f"Extracting sublinks from {url}")
@@ -1183,18 +916,17 @@ class DownloadManager:
                logger.info(f"Found {len(links)} sublinks with specialized method")
                return list(links)[:limit]

-            #
-            await self.

            # Get page content
-            content = await self.
            soup = BeautifulSoup(content, 'html.parser')

-            #
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

-            # Extract all links from the page
            for a in soup.find_all('a', href=True):
                href = a['href']
                if href and not href.startswith('javascript:') and not href.startswith('#'):
@@ -1220,85 +952,12 @@ class DownloadManager:
            logger.error(f"Error extracting sublinks: {e}")
            return list(links)[:limit]

-    @celery_app.task
-    def download_file_task(file_info, save_dir, referer=None):
-        """Celery task for downloading files asynchronously"""
-        # This function runs in a separate worker process
-        file_url = file_info['url']
-        fname = file_info['filename']
-        referer = referer or file_info.get('source_url', 'https://www.google.com')
-
-        # Create unique filename
-        path = os.path.join(save_dir, fname)
-        base, ext = os.path.splitext(fname)
-        counter = 1
-        while os.path.exists(path):
-            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-            counter += 1
-
-        os.makedirs(save_dir, exist_ok=True)
-
-        try:
-            # Handle Google Drive files
-            if "drive.google.com" in file_url or "docs.google.com" in file_url:
-                # Extract file ID
-                file_id = None
-                for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
-                    match = re.search(pattern, file_url)
-                    if match:
-                        file_id = match.group(1)
-                        break
-
-                if file_id:
-                    # Try direct download
-                    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
-                    headers = {
-                        'User-Agent': get_random_user_agent(),
-                        'Referer': referer
-                    }
-
-                    with requests.get(download_url, headers=headers, stream=True) as r:
-                        if r.status_code == 200:
-                            with open(path, 'wb') as f:
-                                for chunk in r.iter_content(chunk_size=8192):
-                                    f.write(chunk)
-
-                            # Check if this is HTML (common for Google Drive restrictions)
-                            with open(path, 'rb') as f:
-                                content_start = f.read(100).decode('utf-8', errors='ignore')
-                            if '<html' in content_start.lower():
-                                os.remove(path)
-                                return {'status': 'error', 'message': 'Received HTML instead of file'}
-
-                    return {'status': 'success', 'path': path}
-
-            # Standard download for regular files
-            headers = {
-                'User-Agent': get_random_user_agent(),
-                'Referer': referer,
-                'Accept': '*/*',
-                'Accept-Encoding': 'gzip, deflate, br'
-            }
-
-            with requests.get(file_url, headers=headers, stream=True) as r:
-                if r.status_code == 200:
-                    with open(path, 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=8192):
-                            f.write(chunk)
-
-                    return {'status': 'success', 'path': path}
-                else:
-                    return {'status': 'error', 'message': f"HTTP error: {r.status_code}"}
-
-        except Exception as e:
-            return {'status': 'error', 'message': str(e)}
-
    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
-        """Perform deep search for files on
        if not custom_ext_list:
            custom_ext_list = []

-        #
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
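The Celery task deleted in the hunk above would normally have been dispatched from the Streamlit side with .delay(); a sketch under the old setup's assumptions (a running Redis broker and a separately started Celery worker; this call site is not shown in the diff):

    # Sketch: enqueue the removed download_file_task on the old Celery/Redis setup.
    result = download_file_task.delay(file_info, "./downloads", referer="https://www.google.com")
    outcome = result.get(timeout=300)  # {'status': 'success', 'path': ...} or an error dict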
@@ -1317,22 +976,23 @@ class DownloadManager:
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")

-            #
            all_files = main_files.copy()

-            # Process each sublink
-
-
-
-
-
-
-
-
-
-
-

            # Deduplicate files
            seen_urls = set()
@@ -1360,7 +1020,7 @@ class DownloadManager:
            progress_text.empty()
            progress_bar.empty()

-#
def main():
    st.title("Advanced File Downloader")
@@ -1369,91 +1029,70 @@ def main():
        st.session_state.initialized = True
        st.session_state.discovered_files = []
        st.session_state.current_url = None
-        st.session_state.google_creds = None
        st.session_state.selected_files = []
        st.session_state.do_deep_search = False
        st.session_state.deep_search_url = None
        st.session_state.search_results = []
        st.session_state.download_urls = {}  # For direct download links

-    # Install
-    if "
-        with st.spinner("Setting up
-            st.session_state.
-            check_services()

-    # Sidebar
    with st.sidebar:
-        mode = st.radio("Select Mode", ["Manual URL", "Web Search", "

-        with st.expander("
-            search_engine = st.selectbox("Search Engine", ["bing", "google"], index=0, key="search_engine")
-            browser_engine = st.selectbox("Browser Engine", ["playwright", "pyppeteer", "splash"], index=0, key="browser_engine")
            custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
                                              help="Enter extensions like .csv, .txt")
            max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
            sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
-
-        with st.expander("Advanced Options", expanded=False):
            use_proxy = st.checkbox("Use Proxy", key="use_proxy")
            proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
            use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
                                      help="Makes browser harder to detect as automated")
-            enable_network_intercept = st.checkbox("Enable Network Interception", value=NETWORK_INTERCEPTOR_CONFIG["enabled"],
-                                                   key="enable_intercept",
-                                                   help="Intercept network traffic to find additional files")
-            if enable_network_intercept:
-                NETWORK_INTERCEPTOR_CONFIG["enabled"] = True
-                intercept_types = st.multiselect("Intercept Types",
-                                                 ["xhr", "fetch", "document", "media", "stylesheet", "image", "font"],
-                                                 default=["xhr", "fetch", "document", "media"],
-                                                 key="intercept_types")
-                NETWORK_INTERCEPTOR_CONFIG["intercept_types"] = intercept_types
-            else:
-                NETWORK_INTERCEPTOR_CONFIG["enabled"] = False

-
-
-
-
-
-
-
-
-

    # Main content area
    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")

-
-
-
-            if
-
-

-
-
-
-
-
-
-
-
-                    files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
-                    return files
-
-            # Run the search
-            files = asyncio.run(run_deep_search())
-
-            if files:
-                st.session_state.discovered_files = files
-                st.session_state.current_url = url
-                st.success(f"Found {len(files)} files!")
-            else:
-                st.warning("No files found.")

    # Display and process discovered files
    if st.session_state.discovered_files:
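The truncated guard removed above installed dependencies once per Streamlit session. A minimal sketch of that pattern using the setup helper defined in the new version of this file (the "deps_installed" session-state key is an assumption, not from this diff):

    # Sketch: run the apt/Playwright setup only once per session.
    if "deps_installed" not in st.session_state:
        with st.spinner("Setting up dependencies..."):
            st.session_state.deps_installed = setup_playwright_dependencies()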
@@ -1482,12 +1121,6 @@ def main():
                file_info = f"{filename} ({size})"

            file_options.append((i, file_info))
-
-            # Generate direct download URL for this file
-            if i not in st.session_state.download_urls:
-                # Generate a unique key for this file
-                file_key = base64.urlsafe_b64encode(f"{file['url']}_{time.time()}".encode()).decode()
-                st.session_state.download_urls[i] = file_key

        # File selection multiselect
        selected_indices = st.multiselect(
|

        st.session_state.selected_files = selected_indices

-        # Display individual
        if files:
            st.subheader("Available Files")
            for i, file in enumerate(files):
|
                    st.write(f"Source: {file.get('source_url', 'Unknown')}")
                    st.write(f"URL: {file['url']}")

-                    # Download button for this
-                    if st.button(f"Download
                        with st.spinner(f"Downloading {file['filename']}..."):
                            # Create downloads directory
                            download_dir = "./downloads"
|
                            # Download the file
                            async def download_single():
                                async with DownloadManager(
-                                    browser_engine=browser_engine,
                                    use_proxy=use_proxy,
                                    proxy=proxy,
                                    use_stealth=use_stealth
|
        if selected_indices:
            st.subheader("Batch Download Options")

-            col1, col2, col3
            with col1:
                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
            with col2:
                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
            with col3:
                delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")
-            with col4:
-                upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")

            if st.button("Download Selected Files", key="batch_download_btn"):
                with st.spinner(f"Downloading {len(selected_indices)} files..."):
|

                    async def download_batch():
                        async with DownloadManager(
-                            browser_engine=browser_engine,
                            use_proxy=use_proxy,
                            proxy=proxy,
                            use_stealth=use_stealth
|
                            key="download_zip_btn"
                        )

-                        # Upload to Google Drive if requested
-                        if upload_to_drive and st.session_state.google_creds:
-                            with st.spinner("Uploading to Google Drive..."):
-                                drive_service = googleapiclient.discovery.build(
-                                    "drive", "v3", credentials=st.session_state.google_creds
-                                )
-                                folder_id = create_drive_folder(
-                                    drive_service, f"Downloads_{get_domain(url)}"
-                                )
-                                drive_id = google_drive_upload(
-                                    zip_path, st.session_state.google_creds, folder_id
-                                )
-
-                                if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                    st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                                else:
-                                    st.error(drive_id)
-
                    # Delete original files if requested
                    if delete_after:
                        for path in downloaded_paths:
|

        if st.button("Search", key="web_search_btn"):
            if query:
-                with st.spinner("Searching
                    async def run_search():
                        async with DownloadManager(
-                            browser_engine=browser_engine,
                            use_proxy=use_proxy,
                            proxy=proxy,
                            query=query,
                            num_results=num_results,
                            use_stealth=use_stealth
                        ) as dm:
-                            urls = await dm.
                            return urls

                    urls = asyncio.run(run_search())
|
                    with st.spinner("Searching for files..."):
                        async def deep_search_result():
                            async with DownloadManager(
-                                browser_engine=browser_engine,
                                use_proxy=use_proxy,
                                proxy=proxy,
                                use_stealth=use_stealth
|
                    else:
                        st.warning("No files found on this page.")

-    elif mode == "
-        st.header("

        # View-only Google Drive download
-
-        st.write("Download protected/view-only Google Drive documents")
-
-        file_id = st.text_input(
-            "Google Drive File ID",
-            placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
-            key="drive_file_id"
-        )
-
-        if st.button("Download Document", key="drive_download_btn") and file_id:
-            with st.spinner("Downloading view-only document... (this may take a minute)"):
-                # Create download directory
-                download_dir = "./downloads"
-                os.makedirs(download_dir, exist_ok=True)
-
-                # Set output path
-                output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
-
-                # Download the file
-                async def download_drive_file():
-                    async with DownloadManager(
-                        browser_engine=browser_engine,
-                        use_proxy=use_proxy,
-                        proxy=proxy,
-                        use_stealth=use_stealth
-                    ) as dm:
-                        file_info = {
-                            'url': f"https://drive.google.com/file/d/{file_id}/view",
-                            'filename': f"gdrive_{file_id}.pdf",
-                            'metadata': {'file_id': file_id, 'view_only': True}
-                        }
-                        return await dm.download_viewonly_google_drive(file_info, output_path)
-
-                result_path = asyncio.run(download_drive_file())
-
-                if result_path:
-                    st.success("Document downloaded successfully!")
-
-                    # Provide download link
-                    with open(result_path, "rb") as f:
-                        file_bytes = f.read()
-
-                    st.download_button(
-                        label="Download PDF",
-                        data=file_bytes,
-                        file_name=os.path.basename(result_path),
-                        mime="application/pdf",
-                        key="drive_pdf_download"
-                    )
-                else:
-                    st.error("Failed to download the document. Please check the file ID and try again.")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            }
-
-            # Download the file
-            async def download_direct_file():
-                async with DownloadManager(
-                    browser_engine=browser_engine,
-                    use_proxy=use_proxy,
-                    proxy=proxy,
-                    use_stealth=use_stealth
-                ) as dm:
-                    return await dm.download_file(file_info, download_dir)

-

-
-
-
-
-
-
-
-
-
-                st.download_button(
-                    label=f"Download {os.path.basename(file_path)}",
-                    data=file_bytes,
-                    file_name=os.path.basename(file_path),
-                    mime=mime_type,
-                    key="direct_file_download"
-                )
-            else:
-                st.error("Failed to download the file. Please check the URL and try again.")

    # Footer
    st.markdown("---")
-    st.markdown("Created by [Euler314](https://github.com/euler314) |

# Run the app
if __name__ == "__main__":
|
|
1 |
+
# app.py
|
2 |
import streamlit as st
|
3 |
import os
|
4 |
import asyncio
|
|
|
22 |
from reportlab.lib.pagesizes import letter
|
23 |
from reportlab.pdfgen import canvas
|
24 |
|
25 |
+
# Advanced imports - only import what's installed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
import requests
|
27 |
+
from bs4 import BeautifulSoup
|
28 |
+
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
29 |
+
|
30 |
+
# Optional imports with fallbacks
|
31 |
+
try:
|
32 |
+
from PyPDF2 import PdfReader
|
33 |
+
except ImportError:
|
34 |
+
PdfReader = None
|
35 |
+
|
36 |
+
try:
|
37 |
+
import google_auth_oauthlib.flow
|
38 |
+
import googleapiclient.discovery
|
39 |
+
import google.auth.transport.requests
|
40 |
+
import googleapiclient.http
|
41 |
+
GOOGLE_DRIVE_AVAILABLE = True
|
42 |
+
except ImportError:
|
43 |
+
GOOGLE_DRIVE_AVAILABLE = False
|
44 |
|
45 |
# Configure page and logging
|
46 |
st.set_page_config(page_title="Advanced File Downloader", layout="wide")
|
47 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
48 |
logger = logging.getLogger(__name__)
|
49 |
|
50 |
+
# Google OAuth Config
|
|
|
|
|
|
|
51 |
GOOGLE_OAUTH_CONFIG = {
|
52 |
"web": {
|
53 |
"client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
|
|
|
60 |
}
|
61 |
}
|
62 |
|
63 |
+
# User Agent Settings
|
64 |
USER_AGENTS = [
|
65 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
|
66 |
'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
|
|
|
68 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
|
69 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
|
70 |
'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
|
|
|
71 |
]
|
72 |
|
73 |
+
# Network Interception Configuration
|
|
|
|
|
|
|
|
|
74 |
NETWORK_INTERCEPTOR_CONFIG = {
|
75 |
"enabled": False,
|
76 |
"intercept_types": ["xhr", "fetch", "document", "media"],
|
|
|
78 |
"intercept_folder": "./intercepted_data"
|
79 |
}
|
80 |
|
81 |
+
# Utility Functions
|
82 |
def get_random_user_agent():
|
83 |
return random.choice(USER_AGENTS)
|
84 |
|
|
|
114 |
"""Check if URL is a valid file URL based on extension"""
|
115 |
return any(url.lower().endswith(ext) for ext in extensions)
|
116 |
|
117 |
+
# Google Drive Functions
|
118 |
def get_google_auth_url():
|
119 |
+
if not GOOGLE_DRIVE_AVAILABLE:
|
120 |
+
return None
|
121 |
+
|
122 |
client_config = GOOGLE_OAUTH_CONFIG["web"]
|
123 |
flow = google_auth_oauthlib.flow.Flow.from_client_config(
|
124 |
{"web": client_config},
|
|
|
133 |
return authorization_url
|
134 |
|
135 |
def exchange_code_for_credentials(auth_code):
|
136 |
+
if not GOOGLE_DRIVE_AVAILABLE:
|
137 |
+
return None, "Google Drive API not available. Install google-auth-oauthlib and google-api-python-client."
|
138 |
+
|
139 |
if not auth_code.strip():
|
140 |
return None, "No code provided."
|
141 |
try:
|
|
|
154 |
return None, f"Error during token exchange: {e}"
|
155 |
|
156 |
def google_drive_upload(file_path, credentials, folder_id=None):
|
157 |
+
if not GOOGLE_DRIVE_AVAILABLE:
|
158 |
+
return "Google Drive API not available"
|
159 |
+
|
160 |
try:
|
161 |
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
|
162 |
file_metadata = {'name': os.path.basename(file_path)}
|
|
|
169 |
return f"Error uploading to Drive: {str(e)}"
|
170 |
|
171 |
def create_drive_folder(drive_service, name):
|
172 |
+
if not GOOGLE_DRIVE_AVAILABLE:
|
173 |
+
return None
|
174 |
+
|
175 |
folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
|
176 |
folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
|
177 |
return folder.get('id')
|
178 |
|
179 |
+
# Setup Playwright
|
180 |
+
def setup_playwright_dependencies():
|
181 |
+
"""Install required system dependencies for Playwright"""
|
182 |
try:
|
183 |
# Install system dependencies
|
184 |
subprocess.run(['apt-get', 'update', '-y'], check=True)
|
185 |
packages = [
|
186 |
'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
|
187 |
'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
|
188 |
+
'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
|
|
|
189 |
]
|
190 |
subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
|
191 |
|
192 |
+
# Install Playwright browser
|
193 |
+
subprocess.run(['python', '-m', 'playwright', 'install', 'chromium'], check=True)
|
|
|
|
|
|
|
|
|
194 |
|
195 |
st.success("Dependencies installed successfully!")
|
196 |
return True
|
197 |
except Exception as e:
|
198 |
st.error(f"Error installing dependencies: {e}")
|
199 |
+
st.info("You may need to manually install dependencies.")
|
200 |
logger.error(f"Setup error: {e}")
|
201 |
traceback.print_exc()
|
202 |
return False
|
203 |
|
204 |
+
# Download Manager Class
|
205 |
+
class DownloadManager:
|
206 |
+
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
self.use_proxy = use_proxy
|
208 |
self.proxy = proxy
|
209 |
+
self.query = query
|
210 |
+
self.num_results = num_results
|
211 |
+
self.use_stealth = use_stealth
|
212 |
+
self.playwright = None
|
213 |
self.browser = None
|
214 |
self.context = None
|
215 |
self.page = None

        # Create intercepted data folder if enabled
        if NETWORK_INTERCEPTOR_CONFIG["enabled"]:
            os.makedirs(NETWORK_INTERCEPTOR_CONFIG["intercept_folder"], exist_ok=True)

    async def __aenter__(self):
        self.playwright = await async_playwright().start()

        # Configure browser launch options
        browser_args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            # ... (unchanged lines omitted)
            '--disable-features=IsolateOrigins,site-per-process',
        ]

        if self.use_stealth:
            browser_args.extend([
                '--disable-blink-features=AutomationControlled',
                '--disable-features=IsolateOrigins'
            ])

        # ... (unchanged lines omitted)
        if self.use_proxy and self.proxy:
            launch_options["proxy"] = {"server": self.proxy}

        # Launch browser
        self.browser = await self.playwright.chromium.launch(**launch_options)

        # Configure context options
        context_options = {
            "viewport": {"width": 1920, "height": 1080},
            "user_agent": get_random_user_agent(),
            # ... (unchanged lines omitted)
            "accept_downloads": True
        }

        # Create context and apply stealth features
        self.context = await self.browser.new_context(**context_options)

        if self.use_stealth:
            await self.context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', { get: () => false });
                Object.defineProperty(navigator, 'plugins', {
                    // ... (unchanged lines omitted)
                });
                window.chrome = { runtime: {} };
            """)

        # Create page and set headers
        self.page = await self.context.new_page()
        await self.page.set_extra_http_headers({
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Referer': 'https://www.google.com/',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'cross-site',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1'
        })

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
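
    # Usage sketch (illustrative only, mirroring how main() drives this class below):
    #
    #   async def example():
    #       async with DownloadManager(query="past exam papers", num_results=5) as dm:
    #           urls = await dm.search_bing()
    #           return urls
    #
    #   asyncio.run(example())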

    async def search_bing(self):
        """Search Bing for results"""
        urls = []
        try:
            search_url = f"https://www.bing.com/search?q={self.query}"
            await self.page.goto(search_url, timeout=30000)
            await self.page.wait_for_load_state('networkidle')

            # Extract search results
            links = await self.page.query_selector_all("li.b_algo h2 a")
            for link in links[:self.num_results]:
                href = await link.get_attribute('href')
                if href:
                    urls.append(href)

            return urls
        except Exception as e:
            logger.error(f"Error searching Bing: {e}")
            return []

    async def get_file_size(self, url):
        """Get file size by making a HEAD request"""
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.head(url, headers=headers, timeout=15)
            # ... (unchanged lines omitted)
            return "Unknown Size"

    async def get_pdf_metadata(self, url):
        """Extract metadata from PDF files"""
        if not PdfReader:
            return {}

        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.get(url, headers=headers, timeout=15, stream=True)
            # ... (unchanged lines omitted)
            return {}

    async def extract_real_download_url(self, url):
        """Follow redirects to get the final download URL"""
        try:
            headers = {'User-Agent': get_random_user_agent()}
            response = requests.head(url, headers=headers, timeout=15, allow_redirects=True)
            # ... (unchanged lines omitted)
            return url

    async def get_edu_exam_links(self, url):
        """Specialized method for educational exam websites"""
        try:
            logger.info(f"Fetching exam links from {url}")
            links = set()
            # ... (unchanged lines omitted)
            response = requests.get(url, headers=headers, timeout=30)

            if response.status_code == 200:
                # Parse with BeautifulSoup
                soup = BeautifulSoup(response.text, "html.parser")
                parsed_base = urlparse(url)
                base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

                for a in soup.find_all("a", href=True):
                    href = a["href"]
                    full_url = urljoin(url, href)

                    # Get link text
                    link_text = a.get_text().lower()

                    # Define patterns to look for
                    url_patterns = [
                        "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                        "/test/", "/download/", "/files/", "/assignments/",
                        "paper_", "question_", "exam_", "test_", "past_"
                    ]

                    text_patterns = [
                        "exam", "paper", "test", "question", "past", "download",
                        "assignment", "sample", "study", "material", "notes"
                    ]

                    # Check for matches
                    if any(pattern in full_url.lower() for pattern in url_patterns) or \
                       any(pattern in link_text for pattern in text_patterns) or \
                       any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                        links.add(full_url)
        except Exception as e:
            logger.warning(f"Request-based extraction failed: {e}")

        # Use browser if few links were found or for specific sites
        if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
            logger.info("Using browser for enhanced link extraction")

            # Navigate to page
            await self.page.goto(url, timeout=45000)

            # Get page content
            content = await self.page.content()
            soup = BeautifulSoup(content, "html.parser")
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            # Find links in page
            for a in soup.find_all("a", href=True):
                href = a["href"]
                full_url = urljoin(url, href)
                link_text = a.get_text().lower()

                # Use the same patterns as above
                url_patterns = [
                    "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                    "/test/", "/download/", "/files/", "/assignments/",
                    "paper_", "question_", "exam_", "test_", "past_"
                ]

                text_patterns = [
                    "exam", "paper", "test", "question", "past", "download",
                    "assignment", "sample", "study", "material", "notes"
                ]

                if any(pattern in full_url.lower() for pattern in url_patterns) or \
                   any(pattern in link_text for pattern in text_patterns) or \
                   any(full_url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                    links.add(full_url)

            # Try to click on elements that might reveal more links
            try:
                # Find and click buttons that might show more content
                buttons = await self.page.query_selector_all('input[type="button"], button')
                for button in buttons:
                    button_text = await button.text_content() or ""
                    button_value = await button.get_attribute("value") or ""

                    # Only click on promising buttons
                    if any(keyword in (button_text + button_value).lower() for keyword in
                           ["show", "view", "display", "list", "exam", "paper", "test"]):
                        try:
                            await button.click()
                            await self.page.wait_for_timeout(1000)

                            # Get any new links
                            new_content = await self.page.content()
                            new_soup = BeautifulSoup(new_content, "html.parser")
                            for a in new_soup.find_all("a", href=True):
                                href = a["href"]
                                full_url = urljoin(url, href)

                                # Check if it's a file link
                                if any(full_url.lower().endswith(ext) for ext in
                                       ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                                    links.add(full_url)
                        except Exception as e:
                            logger.warning(f"Error clicking button: {e}")
            except Exception as e:
                logger.warning(f"Error with interactive elements: {e}")

        # Filter links to likely contain exam documents
        filtered_links = []
        for link in links:
            # Common file extensions
            # ... (unchanged lines omitted)
            # Common paths for exam documents
            if any(pattern in link.lower() for pattern in [
                "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
                "/pastpapers/", "/questionpapers/", "/tests/"
            ]):
                filtered_links.append(link)
        # ... (unchanged lines omitted)
        return []

    async def extract_downloadable_files(self, url, custom_ext_list):
        """Extract all downloadable files from a webpage"""
        found_files = []
        try:
            # Special handling for educational exam sites
            # ... (unchanged lines omitted)

                # Get metadata for PDFs
                meta = {}
                if real_url.lower().endswith('.pdf') and PdfReader:
                    try:
                        meta = await self.get_pdf_metadata(real_url)
                    except Exception:
                        # ... (unchanged lines omitted)
                    'filename': filename,
                    'size': size_str,
                    'metadata': meta,
                    'source_url': url  # Keep track of source page
                })

            # If we found exam files with the specialized method, return them
            if found_files:
                return found_files

            # Standard extraction method for regular websites
            await self.page.goto(url, timeout=30000, wait_until='networkidle')

            # Get page content
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # Define file extensions to look for
            # ... (unchanged lines omitted)

                # Handle PHP and download links separately
                if '.php' in href.lower() or 'download' in href.lower():
                    full_url = href if href.startswith('http') else urljoin(url, href)
                    real_url = await self.extract_real_download_url(full_url)
                    if real_url and real_url != full_url:
                        filename = os.path.basename(urlparse(real_url).path) or 'downloaded_file'
                        # ... (unchanged lines omitted)

                # Check for direct file extensions
                if any(href.lower().endswith(ext) for ext in all_exts):
                    file_url = href if href.startswith('http') else urljoin(url, href)
                    size_str = await self.get_file_size(file_url)
                    meta = {}
                    if file_url.lower().endswith('.pdf') and PdfReader:
                        meta = await self.get_pdf_metadata(file_url)
                    found_files.append({
                        'url': file_url,
                        # ... (unchanged lines omitted)
                    break

            if file_id:
                # Determine if it's view-only
                is_view_only = "View-only" in (await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}"))

                filename = f"gdrive_{file_id}"
                # ... (unchanged lines omitted)

            for elem in soup.find_all(elem_tag):
                src = elem.get('src') or elem.get('data')
                if src and any(src.lower().endswith(ext) for ext in all_exts):
                    file_url = src if src.startswith('http') else urljoin(url, src)
                    found_files.append({
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
                        # ... (unchanged lines omitted)
            return []

    async def download_file(self, file_info, save_dir, referer=None):
        """Download a file and save it to disk"""
        file_url = file_info['url']
        fname = file_info['filename']
        referer = referer or file_info.get('source_url', 'https://www.google.com')

        # Create unique filename
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
        counter = 1
        # ... (unchanged lines omitted)
        try:
            # Special handling for Google Drive files
            if "drive.google.com" in file_url or "docs.google.com" in file_url:
                # For view-only Google Drive files
                is_view_only = file_info.get('metadata', {}).get('view_only', False)
                if is_view_only:
                    result_path = await self.download_viewonly_google_drive(file_info, path)
                    # ... (unchanged lines omitted)
            return None
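
    # The middle of download_file is collapsed in this diff. As a hedged, generic
    # sketch (not the author's exact code), a plain streamed HTTP download with the
    # referer applied would look roughly like:
    #
    #   headers = {'User-Agent': get_random_user_agent(), 'Referer': referer}
    #   with requests.get(file_url, headers=headers, stream=True, timeout=30) as r:
    #       r.raise_for_status()
    #       with open(path, 'wb') as f:
    #           for chunk in r.iter_content(chunk_size=8192):
    #               f.write(chunk)
    #   return path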

    async def download_viewonly_google_drive(self, file_info, save_path):
        """Download view-only Google Drive documents using Playwright"""
        try:
            # Extract file ID
            file_id = file_info.get('metadata', {}).get('file_id')
            # ... (unchanged lines omitted)

            logger.info(f"Downloading view-only Google Drive file: {file_id}")

            # Create a dedicated browser instance for this operation
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=[
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=IsolateOrigins,site-per-process',
                        '--disable-blink-features=AutomationControlled'
                    ]
                )

                # Create context
                context = await browser.new_context(
                    viewport={'width': 1600, 'height': 1200},
                    user_agent=get_random_user_agent(),
                    accept_downloads=True
                )

                # Add stealth script
                await context.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', { get: () => false });
                    Object.defineProperty(navigator, 'plugins', {
                        get: () => [1, 2, 3, 4, 5].map(() => ({ length: 1 }))
                    });
                    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
                    window.chrome = { runtime: {} };
                """)

                page = await context.new_page()

                try:
                    # Navigate to the file
                    await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
                    await page.wait_for_load_state('networkidle')
                    await page.wait_for_timeout(5000)  # Wait for rendering

                    # Create temp directory
                    temp_dir = tempfile.mkdtemp()

                    # For PDF files, take screenshots of each page
                    if file_type == 'pdf':
                        # Create directory for screenshots
                        screenshots_dir = os.path.join(temp_dir, "screenshots")
                        os.makedirs(screenshots_dir, exist_ok=True)

                        # Get page count estimation
                        total_pages = await page.evaluate("""
                            () => {
                                // Look for page counters
                                const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
                                    const text = el.textContent || '';
                                    return /\\d+\\s*\\/\\s*\\d+/.test(text);
                                });

                                if (pageCounters.length > 0) {
                                    const text = pageCounters[0].textContent || '';
                                    const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
                                    if (match && match[2]) return parseInt(match[2]);
                                }

                                // Look for page elements
                                const pages = document.querySelectorAll('.drive-viewer-paginated-page');
                                if (pages.length > 0) return pages.length;

                                // Default
                                return 20;
                            }
                        """)

                        logger.info(f"PDF has approximately {total_pages} pages")

                        # Capture screenshots page by page
                        screenshots = []
                        for i in range(min(total_pages, 100)):  # Limit to 100 pages
                            try:
                                # Go to specific page
                                if i > 0:
                                    next_button = await page.query_selector('button[aria-label="Next page"]')
                                    if next_button:
                                        await next_button.click()
                                        await page.wait_for_timeout(1000)
                                    else:
                                        break

                                # Take screenshot
                                screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")

                                # Try to find page element
                                page_element = await page.query_selector('.drive-viewer-paginated-page')
                                if page_element:
                                    await page_element.screenshot(path=screenshot_path)
                                else:
                                    await page.screenshot(path=screenshot_path)

                                screenshots.append(screenshot_path)
                            except Exception as e:
                                logger.error(f"Error capturing page {i+1}: {e}")
                                continue

                        # Create PDF from screenshots
                        if screenshots:
                            # Get dimensions from first screenshot
                            first_img = Image.open(screenshots[0])
                            width, height = first_img.size

                            # Create PDF
                            c = canvas.Canvas(save_path, pagesize=(width, height))
                            for screenshot in screenshots:
                                c.drawImage(screenshot, 0, 0, width, height)
                                c.showPage()
                            c.save()

                            # Clean up screenshots
                            for screenshot in screenshots:
                                os.remove(screenshot)

                            # Clean up temp directory
                            shutil.rmtree(temp_dir, ignore_errors=True)

                            return save_path
                        else:
                            logger.error("No screenshots captured")
                    else:
                        # For non-PDF files, just take a screenshot
                        screenshot_path = os.path.join(temp_dir, "file.png")
                        await page.screenshot(path=screenshot_path)

                        # Copy to destination
                        shutil.copy(screenshot_path, save_path)

                        # Clean up
                        os.remove(screenshot_path)
                        shutil.rmtree(temp_dir, ignore_errors=True)

                        return save_path

                finally:
                    await browser.close()

            return None
        except Exception as e:
            # ... (unchanged lines omitted)
            return None
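
    # A possible sanity check (sketch only, not part of the original flow): since
    # PdfReader is already imported at module level, a caller could verify the
    # stitched PDF, e.g. (exact API depends on the installed PyPDF2 version):
    #
    #   reader = PdfReader(result_path)
    #   logger.info(f"Reconstructed PDF has {len(reader.pages)} pages")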

    async def get_sublinks(self, url, limit=10000):
        """Extract all sublinks from a webpage"""
        links = set()
        try:
            logger.info(f"Extracting sublinks from {url}")
            # ... (unchanged lines omitted)
                logger.info(f"Found {len(links)} sublinks with specialized method")
                return list(links)[:limit]

            # Navigate to the page
            await self.page.goto(url, timeout=30000)

            # Get page content
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # Extract all links from the page
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            for a in soup.find_all('a', href=True):
                href = a['href']
                if href and not href.startswith('javascript:') and not href.startswith('#'):
                    # ... (unchanged lines omitted)
        except Exception as e:
            logger.error(f"Error extracting sublinks: {e}")
            return list(links)[:limit]

    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
        """Perform deep search for files on website and its subpages"""
        if not custom_ext_list:
            custom_ext_list = []

        # Set up progress indicators
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
        # ... (unchanged lines omitted)
        total_links = len(sublinks)
        progress_text.text(f"Found {total_links} sublinks to process")

        # Always include main page files
        all_files = main_files.copy()

        # Process each sublink if there are any
        if sublinks:
            for i, sublink in enumerate(sublinks, 1):
                progress = i / max(total_links, 1)  # Avoid division by zero
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(progress)

                try:
                    # Extract files from sublink
                    sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                    all_files.extend(sub_files)
                    file_count_text.text(f"Found {len(all_files)} total files")
                except Exception as e:
                    logger.warning(f"Error processing sublink {sublink}: {e}")

        # Deduplicate files
        seen_urls = set()
        # ... (unchanged lines omitted)
        progress_text.empty()
        progress_bar.empty()

# Main App
def main():
    st.title("Advanced File Downloader")
    # ... (unchanged lines omitted)
        st.session_state.initialized = True
        st.session_state.discovered_files = []
        st.session_state.current_url = None
        st.session_state.selected_files = []
        st.session_state.do_deep_search = False
        st.session_state.deep_search_url = None
        st.session_state.search_results = []
        st.session_state.download_urls = {}  # For direct download links

    # Install Playwright if needed
    if "playwright_installed" not in st.session_state:
        with st.spinner("Setting up Playwright. This may take a minute..."):
            st.session_state.playwright_installed = setup_playwright_dependencies()

    # Sidebar settings
    with st.sidebar:
        mode = st.radio("Select Mode", ["Manual URL", "Web Search", "Google Drive"], key="mode_select")

        with st.expander("Advanced Options", expanded=True):
            custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input",
                                              help="Enter extensions like .csv, .txt")
            max_sublinks = st.number_input("Maximum Sublinks", min_value=1, max_value=10000, value=100, step=10, key="max_sublinks")
            sublink_timeout = st.number_input("Timeout (seconds)", min_value=1, max_value=300, value=30, step=5, key="timeout")
            use_proxy = st.checkbox("Use Proxy", key="use_proxy")
            proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
            use_stealth = st.checkbox("Use Stealth Mode", value=True, key="use_stealth",
                                      help="Makes browser harder to detect as automated")

        if GOOGLE_DRIVE_AVAILABLE:
            with st.expander("Google Drive Integration", expanded=False):
                if st.button("Start Google Sign-In", key="google_signin_btn"):
                    auth_url = get_google_auth_url()
                    st.markdown(f"[Click here to authorize]({auth_url})")
                auth_code = st.text_input("Enter authorization code", key="auth_code_input")
                if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
                    creds, msg = exchange_code_for_credentials(auth_code)
                    st.session_state.google_creds = creds
                    st.write(msg)

    # Main content area
    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com/downloads", key="url_input")

        if st.button("Deep Search", key="deep_search_btn"):
            if url:
                # Process custom extensions
                custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]

                with st.spinner("Searching for files..."):
                    async def run_deep_search():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy,
                            use_stealth=use_stealth
                        ) as dm:
                            files = await dm.deep_search(url, custom_ext_list, max_sublinks, sublink_timeout)
                            return files

                    files = asyncio.run(run_deep_search())

                    if files:
                        st.session_state.discovered_files = files
                        st.session_state.current_url = url
                        st.success(f"Found {len(files)} files!")
                    else:
                        st.warning("No files found.")

    # Display and process discovered files
    if st.session_state.discovered_files:
        # ... (unchanged lines omitted)
            file_info = f"{filename} ({size})"

            file_options.append((i, file_info))

        # File selection multiselect
        selected_indices = st.multiselect(
            # ... (unchanged lines omitted)
        )

        st.session_state.selected_files = selected_indices

        # Display individual download buttons
        if files:
            st.subheader("Available Files")
            for i, file in enumerate(files):
                # ... (unchanged lines omitted)
                    st.write(f"Source: {file.get('source_url', 'Unknown')}")
                    st.write(f"URL: {file['url']}")

                    # Download button for this file
                    if st.button(f"Download", key=f"download_single_{i}"):
                        with st.spinner(f"Downloading {file['filename']}..."):
                            # Create downloads directory
                            download_dir = "./downloads"
                            # ... (unchanged lines omitted)

                            # Download the file
                            async def download_single():
                                async with DownloadManager(
                                    use_proxy=use_proxy,
                                    proxy=proxy,
                                    use_stealth=use_stealth
                                ) as dm:
                                    # ... (unchanged lines omitted)

        if selected_indices:
            st.subheader("Batch Download Options")

            col1, col2, col3 = st.columns(3)
            with col1:
                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
            with col2:
                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
            with col3:
                delete_after = st.checkbox("Delete after ZIP", key="delete_after_checkbox")

            if st.button("Download Selected Files", key="batch_download_btn"):
                with st.spinner(f"Downloading {len(selected_indices)} files..."):
                    # ... (unchanged lines omitted)

                    async def download_batch():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy,
                            use_stealth=use_stealth
                        ) as dm:
                            # ... (unchanged lines omitted)
                        key="download_zip_btn"
                    )

                    # Delete original files if requested
                    if delete_after:
                        for path in downloaded_paths:
                            # ... (unchanged lines omitted)

    elif mode == "Web Search":
        # ... (unchanged lines omitted)

        if st.button("Search", key="web_search_btn"):
            if query:
                with st.spinner("Searching..."):
                    async def run_search():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy,
                            query=query,
                            num_results=num_results,
                            use_stealth=use_stealth
                        ) as dm:
                            urls = await dm.search_bing()
                            return urls

                    urls = asyncio.run(run_search())
                    # ... (unchanged lines omitted)

                with st.spinner("Searching for files..."):
                    async def deep_search_result():
                        async with DownloadManager(
                            use_proxy=use_proxy,
                            proxy=proxy,
                            use_stealth=use_stealth
                        ) as dm:
                            # ... (unchanged lines omitted)
                    else:
                        st.warning("No files found on this page.")

    elif mode == "Google Drive" and GOOGLE_DRIVE_AVAILABLE:
        st.header("Google Drive Download")

        # View-only Google Drive download
        st.write("Download protected/view-only Google Drive documents")

        file_id = st.text_input(
            "Google Drive File ID",
            placeholder="Enter ID from drive.google.com/file/d/THIS_IS_THE_ID/view",
            key="drive_file_id"
        )

        if st.button("Download Document", key="drive_download_btn") and file_id:
            with st.spinner("Downloading view-only document... (this may take a minute)"):
                # Create download directory
                download_dir = "./downloads"
                os.makedirs(download_dir, exist_ok=True)

                # Set output path
                output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")

                # Download the file
                async def download_drive_file():
                    async with DownloadManager(
                        use_proxy=use_proxy,
                        proxy=proxy,
                        use_stealth=use_stealth
                    ) as dm:
                        file_info = {
                            'url': f"https://drive.google.com/file/d/{file_id}/view",
                            'filename': f"gdrive_{file_id}.pdf",
                            'metadata': {'file_id': file_id, 'view_only': True}
                        }
                        return await dm.download_viewonly_google_drive(file_info, output_path)

                result_path = asyncio.run(download_drive_file())

                if result_path:
                    st.success("Document downloaded successfully!")

                    # Provide download link
                    with open(result_path, "rb") as f:
                        file_bytes = f.read()

                    st.download_button(
                        label="Download PDF",
                        data=file_bytes,
                        file_name=os.path.basename(result_path),
                        mime="application/pdf",
                        key="drive_pdf_download"
                    )
                else:
                    st.error("Failed to download the document. Please check the file ID and try again.")

    # Footer
    st.markdown("---")
    st.markdown("Created by [Euler314](https://github.com/euler314) | Advanced File Downloader")

# Run the app
if __name__ == "__main__":