euler314 committed (verified)
Commit 4ce7f57 · Parent(s): 39a7a36

Update app.py

Files changed (1):
  1. app.py (+376, -273)
app.py CHANGED
@@ -217,6 +217,7 @@ class DownloadManager:
217
  self.browser = None
218
  self.context = None
219
  self.page = None
 
220
 
221
  async def __aenter__(self):
222
  self.playwright = await async_playwright().start()
@@ -250,6 +251,63 @@ class DownloadManager:
250
  if self.playwright:
251
  await self.playwright.stop()
252
 
253
  async def get_file_size(self, url):
254
  try:
255
  async with self.context.new_page() as page:
@@ -280,40 +338,32 @@ class DownloadManager:
280
  except Exception:
281
  return {}
282
 
283
- async def extract_real_download_url(self, url):
284
- try:
285
- async with self.context.new_page() as page:
286
- response = await page.goto(url, wait_until='networkidle', timeout=30000)
287
- if response and response.headers.get('location'):
288
- return response.headers['location']
289
- return page.url
290
- except Exception as e:
291
- logger.error(f"Error extracting real download URL: {e}")
292
- return url
293
-
294
  async def extract_downloadable_files(self, url, custom_ext_list):
295
  found_files = []
296
  try:
297
- response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
298
- if not response:
299
- return []
300
-
301
- final_url = self.page.url
302
- if '.php' in final_url or 'download' in final_url:
303
- real_url = await self.extract_real_download_url(final_url)
304
- if real_url != final_url:
305
- found_files.append({
306
- 'url': real_url,
307
- 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
308
- 'size': await self.get_file_size(real_url),
309
- 'metadata': {}
310
- })
311
- return found_files
312
-
313
- await self.page.wait_for_load_state('networkidle', timeout=30000)
314
  content = await self.page.content()
315
  soup = BeautifulSoup(content, 'html.parser')
316
 
 
317
  default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
318
  '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
319
  all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
@@ -321,70 +371,73 @@ class DownloadManager:
321
  parsed_base = urlparse(final_url)
322
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
323
 
324
  for a in soup.find_all('a', href=True):
325
- href = a['href'].strip()
326
 
327
- # Handle PHP scripts and redirects
328
- if '.php' in href.lower() or 'download' in href.lower():
329
- full_url = href if href.startswith('http') else (
330
- f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
331
- )
332
- real_url = await self.extract_real_download_url(full_url)
333
- if real_url and real_url != full_url:
334
- found_files.append({
335
- 'url': real_url,
336
- 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
337
- 'size': await self.get_file_size(real_url),
338
- 'metadata': {}
339
- })
340
- continue
341
-
342
- # Handle direct file links
343
- if any(href.lower().endswith(ext) for ext in all_exts):
344
- file_url = href if href.startswith('http') else (
345
- f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
346
- )
347
-
348
- size_str = await self.get_file_size(file_url)
349
- meta = {}
350
- if file_url.lower().endswith('.pdf'):
351
- meta = await self.get_pdf_metadata(file_url)
352
-
353
- found_files.append({
354
- 'url': file_url,
355
- 'filename': os.path.basename(file_url.split('?')[0]),
356
- 'size': size_str,
357
- 'metadata': meta
358
- })
359
-
360
- # Handle Google Drive links
361
- elif ("drive.google.com" in href) or ("docs.google.com" in href):
362
- file_id = None
363
- for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
364
- match = re.search(pattern, href)
365
- if match:
366
- file_id = match.group(1)
367
- break
368
-
369
- if file_id:
370
- direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
371
- filename = file_id
372
- try:
373
- response = await self.page.request.head(direct_url, timeout=15000)
374
- cd = response.headers.get("Content-Disposition", "")
375
- if cd:
376
- mt = re.search(r'filename\*?="?([^";]+)', cd)
377
- if mt:
378
- filename = mt.group(1).strip('"').strip()
379
-
380
  found_files.append({
381
- 'url': direct_url,
382
- 'filename': filename,
383
- 'size': await self.get_file_size(direct_url),
384
  'metadata': {}
385
  })
386
- except Exception as e:
387
- logger.error(f"Error processing Google Drive link: {e}")
388
 
389
  # Make results unique based on URLs
390
  seen_urls = set()
@@ -395,11 +448,11 @@ class DownloadManager:
395
  unique_files.append(f)
396
 
397
  return unique_files
 
398
  except Exception as e:
399
  logger.error(f"Error extracting files from {url}: {e}")
400
  return []
401
-
402
- async def download_file(self, file_info, save_dir, referer):
403
  file_url = file_info['url']
404
  fname = file_info['filename']
405
  path = os.path.join(save_dir, fname)
@@ -412,11 +465,14 @@ class DownloadManager:
412
  os.makedirs(save_dir, exist_ok=True)
413
 
414
  try:
415
- if "drive.google.com" in file_url:
416
  import gdown
417
  try:
418
  st.write(f"Downloading from Google Drive: {fname}")
419
- output = gdown.download(file_url, path, quiet=False)
420
  if output:
421
  return path
422
  return None
@@ -433,7 +489,7 @@ class DownloadManager:
433
  'Referer': referer
434
  }
435
 
436
- response = await page.request.get(file_url, headers=headers, timeout=30000)
437
 
438
  if response.status == 200:
439
  content = await response.body()
@@ -441,61 +497,86 @@ class DownloadManager:
441
  f.write(content)
442
  return path
443
  else:
444
- logger.error(f"Download failed with status {response.status}: {file_url}")
445
  return None
446
 
447
  except Exception as e:
448
  logger.error(f"Error downloading {file_url}: {e}")
449
  return None
450
 
451
- async def search_bing(self):
452
- if not self.query:
453
- return [], []
454
-
455
- search_query = self.query
456
- if "filetype:pdf" not in search_query.lower():
457
- search_query += " filetype:pdf"
458
-
459
- search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
460
-
461
- try:
462
- await self.page.goto(search_url, timeout=30000)
463
- await self.page.wait_for_selector('li.b_algo', timeout=30000)
464
-
465
- results = []
466
- elements = await self.page.query_selector_all('li.b_algo')
467
-
468
- for element in elements:
469
- link = await element.query_selector('h2 a')
470
- if link:
471
- url = await link.get_attribute('href')
472
- if url:
473
- results.append(url)
474
-
475
- return results[:self.num_results]
476
-
477
- except Exception as e:
478
- logger.error(f"Bing search error: {e}")
479
- return []
480
-
481
  async def get_sublinks(self, url, limit=100):
482
  try:
483
- await self.page.goto(url, timeout=30000)
484
  content = await self.page.content()
485
  soup = BeautifulSoup(content, 'html.parser')
486
 
487
- parsed_base = urlparse(url)
488
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
489
 
490
  links = set()
491
  for a in soup.find_all('a', href=True):
492
  href = a['href'].strip()
493
- if href.startswith('http'):
494
  links.add(href)
495
- elif href.startswith('/'):
496
- links.add(f"{base_url}{href}")
497
 
498
- return list(links)[:limit]
499
 
500
  except Exception as e:
501
  logger.error(f"Error getting sublinks: {e}")
@@ -510,15 +591,21 @@ class DownloadManager:
510
  file_count_text = st.empty()
511
 
512
  try:
513
  # Search main page
514
  progress_text.text("Analyzing main page...")
515
- main_files = await self.extract_downloadable_files(url, custom_ext_list)
516
  initial_count = len(main_files)
517
  file_count_text.text(f"Found {initial_count} files on main page")
518
 
519
  # Get and search sublinks
520
  progress_text.text("Getting sublinks...")
521
- sublinks = await self.get_sublinks(url, limit=sublink_limit)
522
  total_links = len(sublinks)
523
 
524
  progress_text.text(f"Found {total_links} sublinks to process")
@@ -542,7 +629,14 @@ class DownloadManager:
542
 
543
  # Set timeout for this sublink
544
  async with async_timeout.timeout(timeout):
545
- sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
546
  return sub_files
547
  except asyncio.TimeoutError:
548
  logger.warning(f"Timeout processing sublink: {sublink}")
@@ -603,160 +697,169 @@ def main():
603
 
604
  st.title("Advanced File Downloader")
605
 
606
- # Mode Selection
607
- mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
608
-
609
- # Advanced Options
610
- with st.expander("Advanced Options"):
611
- custom_extensions = st.text_input(
612
- "Custom File Extensions",
613
- placeholder=".csv, .txt, .epub",
614
- key="custom_ext_input"
615
- )
616
- max_sublinks = st.number_input(
617
- "Maximum Sublinks to Process",
618
- min_value=1,
619
- max_value=10000,
620
- value=100,
621
- step=50,
622
- help="Maximum number of sublinks to process from the main page",
623
- key="max_sublinks_input"
624
- )
625
- sublink_timeout = st.number_input(
626
- "Search Timeout (seconds per sublink)",
627
- min_value=1,
628
- max_value=3000,
629
- value=30,
630
- step=5,
631
- help="Maximum time to spend searching each sublink",
632
- key="timeout_input"
633
- )
634
- use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
635
- proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
636
 
637
- # Google Drive Integration
638
- with st.expander("Google Drive Integration"):
639
- if st.button("Start Google Sign-In", key="google_signin_btn"):
640
- auth_url = get_google_auth_url()
641
- st.markdown(f"[Click here to authorize]({auth_url})")
642
-
643
- auth_code = st.text_input("Enter authorization code", key="auth_code_input")
644
- if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
645
- creds, msg = exchange_code_for_credentials(auth_code)
646
- st.session_state.google_creds = creds
647
- st.write(msg)
648
 
 
649
  if mode == "Manual URL":
650
  st.header("Manual URL Mode")
651
  url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
652
 
653
- if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
654
- if url:
655
- async def run_deep_search():
656
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
657
- files = await dm.deep_search(
658
- url=url,
659
- custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
660
- sublink_limit=max_sublinks,
661
- timeout=sublink_timeout
662
- )
663
- if files:
664
- st.session_state.discovered_files = files
665
- st.session_state.current_url = url
666
- return files
667
-
668
- files = asyncio.run(run_deep_search())
669
- if files:
670
- st.success(f"Found {len(files)} files!")
671
-
672
- # Select All/Clear Selection buttons
673
- col1, col2 = st.columns([1, 4])
674
- with col1:
675
- if st.button("Select All", key="select_all_btn"):
676
- st.session_state.selected_files = list(range(len(files)))
677
- st.experimental_rerun()
678
- if st.button("Clear Selection", key="clear_selection_btn"):
679
- st.session_state.selected_files = []
680
- st.experimental_rerun()
681
-
682
- # File selection
683
- selected_files = st.multiselect(
684
- "Select files to download",
685
- options=list(range(len(files))),
686
- default=st.session_state.selected_files,
687
- format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
688
- key="file_multiselect"
689
- )
690
-
691
- # Update session state
692
- st.session_state.selected_files = selected_files
693
 
694
- if selected_files:
695
- col1, col2, col3, col4 = st.columns(4)
696
  with col1:
697
- download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
698
- with col2:
699
- create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
700
- with col3:
701
- delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
702
- with col4:
703
- upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
704
 
705
- if st.button("Download Selected", key="download_btn"):
706
- if not os.path.exists(download_dir):
707
- os.makedirs(download_dir)
708
 
709
- async def download_files():
710
- downloaded_paths = []
711
- progress_bar = st.progress(0)
712
- status_text = st.empty()
713
 
714
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
715
- for i, idx in enumerate(selected_files):
716
- progress = (i + 1) / len(selected_files)
717
- file_info = files[idx]
718
 
719
- status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
720
- progress_bar.progress(progress)
721
 
722
- path = await dm.download_file(
723
- file_info,
724
- download_dir,
725
- url
726
- )
727
- if path:
728
- downloaded_paths.append(path)
729
-
730
- status_text.empty()
731
- progress_bar.empty()
732
- return downloaded_paths
733
-
734
- downloaded = asyncio.run(download_files())
735
-
736
- if downloaded:
737
- st.success(f"Successfully downloaded {len(downloaded)} files")
738
-
739
- if create_zip or upload_to_drive:
740
- zip_path = create_zip_file(downloaded, download_dir)
741
- st.success(f"Created ZIP file: {zip_path}")
742
-
743
- if upload_to_drive and st.session_state.get('google_creds'):
744
- with st.spinner("Uploading to Google Drive..."):
745
- drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
746
- if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
747
- st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
748
- else:
749
- st.error(drive_id)
750
-
751
- if delete_after:
752
- for path in downloaded:
753
- try:
754
- os.remove(path)
755
- except Exception as e:
756
- st.warning(f"Could not delete {path}: {e}")
757
- st.info("Deleted original files after ZIP creation")
758
- else:
759
- st.warning("No files found.")
760
 
761
  # Display current files if they exist in session state
762
  elif st.session_state.discovered_files:
 
217
  self.browser = None
218
  self.context = None
219
  self.page = None
220
+ self.base_domains = set() # Store base domains and their variations
221
 
222
  async def __aenter__(self):
223
  self.playwright = await async_playwright().start()
 
251
  if self.playwright:
252
  await self.playwright.stop()
253
 
254
+ def get_base_domain(self, url):
255
+ """Extract base domain and add variations to self.base_domains"""
256
+ parsed = urlparse(url)
257
+ domain = parsed.netloc.split(':')[0] # Remove port if present
258
+
259
+ # Add the main domain and possible variations
260
+ base_parts = domain.split('.')
261
+ if len(base_parts) > 2:
262
+ main_domain = '.'.join(base_parts[-2:])
263
+ self.base_domains.add(main_domain)
264
+ # Add variations like files.domain.com for domain.com
265
+ self.base_domains.add(domain)
266
+
267
+ # Handle www and non-www versions
268
+ if base_parts[0] == 'www':
269
+ self.base_domains.add('.'.join(base_parts[1:]))
270
+ else:
271
+ self.base_domains.add(f"www.{domain}")
272
+ else:
273
+ self.base_domains.add(domain)
274
+
275
+ return domain
276
+
277
+ def is_related_domain(self, url):
278
+ """Check if URL belongs to any of the known domain variations"""
279
+ parsed = urlparse(url)
280
+ domain = parsed.netloc.split(':')[0]
281
+
282
+ # Check if this domain or any of its parts match our base domains
283
+ parts = domain.split('.')
284
+ for i in range(len(parts) - 1):
285
+ check_domain = '.'.join(parts[i:])
286
+ if check_domain in self.base_domains:
287
+ return True
288
+ return False
289
+
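The two helpers added above track a site's domain variations so files served from sibling subdomains still count as in-scope. A minimal standalone sketch of the same idea, using only the standard library (register_domain and is_related are illustrative names, not functions in app.py):

```python
# Illustrative sketch (not part of the commit): domain-variation matching.
from urllib.parse import urlparse

base_domains = set()

def register_domain(url: str) -> str:
    """Record a URL's host plus common variations (registrable domain, www/non-www)."""
    host = urlparse(url).netloc.split(':')[0]
    parts = host.split('.')
    base_domains.add(host)
    if len(parts) > 2:
        base_domains.add('.'.join(parts[-2:]))  # e.g. files.example.com -> example.com
    base_domains.add(host[4:] if host.startswith('www.') else f"www.{host}")
    return host

def is_related(url: str) -> bool:
    """True if the URL's host, or any parent domain of it, was registered."""
    host = urlparse(url).netloc.split(':')[0]
    parts = host.split('.')
    return any('.'.join(parts[i:]) in base_domains for i in range(len(parts) - 1))

register_domain("https://www.example.com/downloads/")
print(is_related("https://files.example.com/report.pdf"))  # True
print(is_related("https://other-site.org/file.zip"))       # False
```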
290
+ async def get_real_url(self, url):
291
+ """Follow redirects and get the final URL"""
292
+ try:
293
+ async with self.context.new_page() as page:
294
+ response = await page.goto(url, wait_until='networkidle', timeout=30000)
295
+ final_url = page.url
296
+
297
+ # Check for meta refresh redirects
298
+ content = await page.content()
299
+ soup = BeautifulSoup(content, 'html.parser')
300
+ meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
301
+ if meta_refresh:
302
+ content = meta_refresh.get('content', '')
303
+ if 'url=' in content.lower():
304
+ final_url = content.split('url=')[-1].strip("'").strip('"')
305
+
306
+ return final_url, response.headers if response else {}
307
+ except Exception as e:
308
+ logger.error(f"Error getting real URL for {url}: {e}")
309
+ return url, {}
310
+
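get_real_url follows both HTTP redirects (via Playwright) and HTML meta-refresh redirects. The sketch below shows just the meta-refresh parsing on already-fetched HTML; it assumes BeautifulSoup is installed and, unlike the committed split('url=') approach, resolves relative targets with urljoin:

```python
# Illustrative sketch (not part of the commit): extracting a meta-refresh target.
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def meta_refresh_target(html: str, page_url: str):
    """Return the absolute meta-refresh target URL, or None if there is none."""
    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find('meta', attrs={'http-equiv': lambda v: v and v.lower() == 'refresh'})
    if not tag:
        return None
    content = tag.get('content', '')
    # content typically looks like "5; url=/files/report.pdf"
    for part in content.split(';'):
        part = part.strip()
        if part.lower().startswith('url='):
            return urljoin(page_url, part[4:].strip('\'"'))
    return None

html = '<meta http-equiv="refresh" content="0; url=/files/report.pdf">'
print(meta_refresh_target(html, "https://example.com/download.php"))
# -> https://example.com/files/report.pdf
```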
311
  async def get_file_size(self, url):
312
  try:
313
  async with self.context.new_page() as page:
 
338
  except Exception:
339
  return {}
340
 
341
  async def extract_downloadable_files(self, url, custom_ext_list):
342
  found_files = []
343
  try:
344
+ # Follow redirects and get the final URL
345
+ final_url, headers = await self.get_real_url(url)
346
+
347
+ # Add this domain to our known domains
348
+ self.get_base_domain(final_url)
349
+
350
+ # Check if the URL itself is a file
351
+ content_type = headers.get('content-type', '').lower()
352
+ if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
353
+ found_files.append({
354
+ 'url': final_url,
355
+ 'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
356
+ 'size': await self.get_file_size(final_url),
357
+ 'metadata': {}
358
+ })
359
+ return found_files
360
+
361
+ # Load the page
362
+ await self.page.goto(final_url, timeout=30000, wait_until='networkidle')
363
  content = await self.page.content()
364
  soup = BeautifulSoup(content, 'html.parser')
365
 
366
+ # Define extensions to look for
367
  default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
368
  '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
369
  all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
 
371
  parsed_base = urlparse(final_url)
372
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
373
 
374
+ # Find all links including those in scripts and other elements
375
+ links = set()
376
+ # Regular links
377
  for a in soup.find_all('a', href=True):
378
+ links.add(a['href'])
379
+ # Script-embedded links
380
+ scripts = soup.find_all('script')
381
+ for script in scripts:
382
+ if script.string:
383
+ urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
384
+ links.update(urls)
385
+
386
+ for href in links:
387
+ href = href.strip()
388
 
389
+ # Skip empty or javascript links
390
+ if not href or href.startswith(('javascript:', '#', 'mailto:')):
391
+ continue
392
+
393
+ # Handle both direct file links and PHP/script downloads
394
+ if '.php' in href.lower() or 'download' in href.lower() or 'visit' in href.lower():
395
+ try:
396
+ # Convert to absolute URL if needed
397
+ if not href.startswith(('http://', 'https://')):
398
+ if href.startswith('/'):
399
+ href = base_url + href
400
+ else:
401
+ href = base_url + '/' + href
402
+
403
+ # Follow the link to get the real file
404
+ real_url, real_headers = await self.get_real_url(href)
405
+
406
+ # Check if it leads to a file
407
+ content_type = real_headers.get('content-type', '').lower()
408
+ if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
409
  found_files.append({
410
+ 'url': real_url,
411
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
412
+ 'size': await self.get_file_size(real_url),
413
  'metadata': {}
414
  })
415
+ except Exception as e:
416
+ logger.error(f"Error processing PHP/script link {href}: {e}")
417
+ continue
418
+
419
+ # Handle direct file links
420
+ elif any(href.lower().endswith(ext) for ext in all_exts):
421
+ # Convert to absolute URL if needed
422
+ if not href.startswith(('http://', 'https://')):
423
+ if href.startswith('/'):
424
+ href = base_url + href
425
+ else:
426
+ href = base_url + '/' + href
427
+
428
+ # Verify if it's from a related domain
429
+ if self.is_related_domain(href):
430
+ size_str = await self.get_file_size(href)
431
+ meta = {}
432
+ if href.lower().endswith('.pdf'):
433
+ meta = await self.get_pdf_metadata(href)
434
+
435
+ found_files.append({
436
+ 'url': href,
437
+ 'filename': os.path.basename(href.split('?')[0]),
438
+ 'size': size_str,
439
+ 'metadata': meta
440
+ })
441
 
442
  # Make results unique based on URLs
443
  seen_urls = set()
 
448
  unique_files.append(f)
449
 
450
  return unique_files
451
+
452
  except Exception as e:
453
  logger.error(f"Error extracting files from {url}: {e}")
454
  return []
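The reworked extract_downloadable_files gathers candidate links from both anchor hrefs and inline script text, then decides via content type or file extension whether each one is downloadable. A small self-contained sketch of the link-harvesting step, reusing the same regex on a made-up HTML snippet:

```python
# Illustrative sketch (not part of the commit): harvesting links from <a> tags
# and inline <script> blocks with the regex used above.
import re
from bs4 import BeautifulSoup

HTML = """
<a href="/docs/manual.pdf">Manual</a>
<script>
  var download_url="https://files.example.com/archive.zip";
  window.location.href="/download.php?id=42";
</script>
"""

soup = BeautifulSoup(HTML, 'html.parser')
candidates = {a['href'] for a in soup.find_all('a', href=True)}
for script in soup.find_all('script'):
    if script.string:
        candidates.update(
            re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string))

print(sorted(candidates))
# ['/docs/manual.pdf', '/download.php?id=42', 'https://files.example.com/archive.zip']
```

Note the regex only matches the literal prefixes href=, url=, link=, src= immediately followed by a quote, so assignments written with spaces around the equals sign are not picked up.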
455
+ async def download_file(self, file_info, save_dir, referer):
 
456
  file_url = file_info['url']
457
  fname = file_info['filename']
458
  path = os.path.join(save_dir, fname)
 
465
  os.makedirs(save_dir, exist_ok=True)
466
 
467
  try:
468
+ # Get the real URL first
469
+ real_url, _ = await self.get_real_url(file_url)
470
+
471
+ if "drive.google.com" in real_url:
472
  import gdown
473
  try:
474
  st.write(f"Downloading from Google Drive: {fname}")
475
+ output = gdown.download(real_url, path, quiet=False)
476
  if output:
477
  return path
478
  return None
 
489
  'Referer': referer
490
  }
491
 
492
+ response = await page.request.get(real_url, headers=headers, timeout=30000)
493
 
494
  if response.status == 200:
495
  content = await response.body()
 
497
  f.write(content)
498
  return path
499
  else:
500
+ logger.error(f"Download failed with status {response.status}: {real_url}")
501
  return None
502
 
503
  except Exception as e:
504
  logger.error(f"Error downloading {file_url}: {e}")
505
  return None
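download_file now resolves the real URL first and hands Google Drive links to gdown. A minimal sketch of that branch with a made-up file id (fetch_drive_file is an illustrative name, not a function in app.py):

```python
# Illustrative sketch (not part of the commit): the Google Drive branch via gdown.
import os
import gdown

def fetch_drive_file(file_id: str, save_dir: str, filename: str):
    """Download drive.google.com/uc?export=download&id=<file_id> into save_dir."""
    os.makedirs(save_dir, exist_ok=True)
    url = f"https://drive.google.com/uc?export=download&id={file_id}"
    path = os.path.join(save_dir, filename)
    # gdown.download returns the output path on success, None on failure
    return gdown.download(url, path, quiet=False)

# fetch_drive_file("1A2B3C4D5E", "./downloads", "report.pdf")  # made-up id
```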
506
 
507
  async def get_sublinks(self, url, limit=100):
508
  try:
509
+ # Get the real URL first
510
+ real_url, _ = await self.get_real_url(url)
511
+ await self.page.goto(real_url, timeout=30000)
512
+
513
+ # Wait for dynamic content
514
+ await self.page.wait_for_load_state('networkidle')
515
+
516
  content = await self.page.content()
517
  soup = BeautifulSoup(content, 'html.parser')
518
 
519
+ parsed_base = urlparse(real_url)
520
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
521
+ current_path = os.path.dirname(parsed_base.path)
522
 
523
  links = set()
524
+
525
+ # Find links from various sources
526
+ # 1. Regular links
527
  for a in soup.find_all('a', href=True):
528
  href = a['href'].strip()
529
+ if href and not href.startswith(('javascript:', '#', 'mailto:')):
530
  links.add(href)
531
+
532
+ # 2. Script-embedded links
533
+ scripts = soup.find_all('script')
534
+ for script in scripts:
535
+ if script.string:
536
+ urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
537
+ links.update(urls)
538
+
539
+ # 3. Form actions
540
+ forms = soup.find_all('form', action=True)
541
+ for form in forms:
542
+ links.add(form['action'])
543
+
544
+ # Process and clean links
545
+ clean_links = set()
546
+ for href in links:
547
+ try:
548
+ # Skip empty links
549
+ if not href.strip():
550
+ continue
551
+
552
+ # Convert to absolute URL
553
+ if href.startswith('http'):
554
+ full_url = href
555
+ elif href.startswith('//'):
556
+ full_url = parsed_base.scheme + ':' + href
557
+ elif href.startswith('/'):
558
+ full_url = base_url + href
559
+ else:
560
+ # Handle relative paths
561
+ if current_path and current_path != '/':
562
+ full_url = base_url + current_path + '/' + href
563
+ else:
564
+ full_url = base_url + '/' + href
565
+
566
+ # Clean the URL
567
+ full_url = full_url.split('#')[0] # Remove fragments
568
 
569
+ # Only add if it's a related domain
570
+ if self.is_related_domain(full_url):
571
+ clean_links.add(full_url)
572
+
573
+ except Exception as e:
574
+ logger.error(f"Error processing link {href}: {e}")
575
+ continue
576
+
577
+ # Sort links for consistency
578
+ sorted_links = sorted(list(clean_links))
579
+ return sorted_links[:limit]
580
 
581
  except Exception as e:
582
  logger.error(f"Error getting sublinks: {e}")
 
591
  file_count_text = st.empty()
592
 
593
  try:
594
+ # Initialize base domains with the original URL
595
+ self.get_base_domain(url)
596
+
597
+ # Get the real initial URL
598
+ real_url, _ = await self.get_real_url(url)
599
+
600
  # Search main page
601
  progress_text.text("Analyzing main page...")
602
+ main_files = await self.extract_downloadable_files(real_url, custom_ext_list)
603
  initial_count = len(main_files)
604
  file_count_text.text(f"Found {initial_count} files on main page")
605
 
606
  # Get and search sublinks
607
  progress_text.text("Getting sublinks...")
608
+ sublinks = await self.get_sublinks(real_url, limit=sublink_limit)
609
  total_links = len(sublinks)
610
 
611
  progress_text.text(f"Found {total_links} sublinks to process")
 
629
 
630
  # Set timeout for this sublink
631
  async with async_timeout.timeout(timeout):
632
+ # Get real URL before processing
633
+ real_sublink, _ = await self.get_real_url(sublink)
634
+ sub_files = await self.extract_downloadable_files(real_sublink, custom_ext_list)
635
+
636
+ if sub_files:
637
+ logger.info(f"Found {len(sub_files)} files at {real_sublink}")
638
+ st.write(f"Found {len(sub_files)} files at {real_sublink}")
639
+
640
  return sub_files
641
  except asyncio.TimeoutError:
642
  logger.warning(f"Timeout processing sublink: {sublink}")
 
697
 
698
  st.title("Advanced File Downloader")
699
 
700
+ # Sidebar
701
+ with st.sidebar:
702
+ # Mode Selection
703
+ mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
704
+
705
+ # Advanced Options
706
+ with st.expander("Advanced Options", expanded=True):
707
+ custom_extensions = st.text_input(
708
+ "Custom File Extensions",
709
+ placeholder=".csv, .txt, .epub",
710
+ key="custom_ext_input"
711
+ )
712
+ max_sublinks = st.number_input(
713
+ "Maximum Sublinks to Process",
714
+ min_value=1,
715
+ max_value=10000,
716
+ value=100,
717
+ step=50,
718
+ help="Maximum number of sublinks to process from the main page",
719
+ key="max_sublinks_input"
720
+ )
721
+ sublink_timeout = st.number_input(
722
+ "Search Timeout (seconds per sublink)",
723
+ min_value=1,
724
+ max_value=3000,
725
+ value=30,
726
+ step=5,
727
+ help="Maximum time to spend searching each sublink",
728
+ key="timeout_input"
729
+ )
730
+ use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
731
+ proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
732
 
733
+ # Google Drive Integration
734
+ with st.expander("Google Drive Integration", expanded=False):
735
+ if st.button("Start Google Sign-In", key="google_signin_btn"):
736
+ auth_url = get_google_auth_url()
737
+ st.markdown(f"[Click here to authorize]({auth_url})")
738
+
739
+ auth_code = st.text_input("Enter authorization code", key="auth_code_input")
740
+ if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
741
+ creds, msg = exchange_code_for_credentials(auth_code)
742
+ st.session_state.google_creds = creds
743
+ st.write(msg)
744
 
745
+ # Main content area
746
  if mode == "Manual URL":
747
  st.header("Manual URL Mode")
748
  url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
749
 
750
+ col1, col2 = st.columns([3, 1])
751
+ with col1:
752
+ if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
753
+ if url:
754
+ async def run_deep_search():
755
+ try:
756
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
757
+ files = await dm.deep_search(
758
+ url=url,
759
+ custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
760
+ sublink_limit=int(max_sublinks),
761
+ timeout=int(sublink_timeout)
762
+ )
763
+ if files:
764
+ st.session_state.discovered_files = files
765
+ st.session_state.current_url = url
766
+ return files
767
+ except Exception as e:
768
+ st.error(f"Error during deep search: {str(e)}")
769
+ return None
770
 
771
+ files = asyncio.run(run_deep_search())
772
+ if files:
773
+ st.success(f"Found {len(files)} files!")
774
+
775
+ # Select All/Clear Selection buttons
776
+ col1, col2 = st.columns([1, 4])
777
  with col1:
778
+ if st.button("Select All", key="select_all_btn"):
779
+ st.session_state.selected_files = list(range(len(files)))
780
+ st.experimental_rerun()
781
+ if st.button("Clear Selection", key="clear_selection_btn"):
782
+ st.session_state.selected_files = []
783
+ st.experimental_rerun()
784
+
785
+ # File selection
786
+ selected_files = st.multiselect(
787
+ "Select files to download",
788
+ options=list(range(len(files))),
789
+ default=st.session_state.selected_files,
790
+ format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
791
+ key="file_multiselect"
792
+ )
793
+
794
+ # Update session state
795
+ st.session_state.selected_files = selected_files
796
 
797
+ if selected_files:
798
+ col1, col2, col3, col4 = st.columns(4)
799
+ with col1:
800
+ download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
801
+ with col2:
802
+ create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
803
+ with col3:
804
+ delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
805
+ with col4:
806
+ upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
807
+
808
+ if st.button("Download Selected", key="download_btn"):
809
+ if not os.path.exists(download_dir):
810
+ os.makedirs(download_dir)
811
+
812
+ async def download_files():
813
+ downloaded_paths = []
814
+ progress_bar = st.progress(0)
815
+ status_text = st.empty()
816
+
817
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
818
+ for i, idx in enumerate(selected_files):
819
+ progress = (i + 1) / len(selected_files)
820
+ file_info = files[idx]
821
+
822
+ status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
823
+ progress_bar.progress(progress)
824
+
825
+ path = await dm.download_file(
826
+ file_info,
827
+ download_dir,
828
+ url
829
+ )
830
+ if path:
831
+ downloaded_paths.append(path)
832
+
833
+ status_text.empty()
834
+ progress_bar.empty()
835
+ return downloaded_paths
836
 
837
+ downloaded = asyncio.run(download_files())
838
 
839
+ if downloaded:
840
+ st.success(f"Successfully downloaded {len(downloaded)} files")
841
+
842
+ if create_zip or upload_to_drive:
843
+ zip_path = create_zip_file(downloaded, download_dir)
844
+ st.success(f"Created ZIP file: {zip_path}")
845
 
846
+ if upload_to_drive and st.session_state.get('google_creds'):
847
+ with st.spinner("Uploading to Google Drive..."):
848
+ drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
849
+ if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
850
+ st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
851
+ else:
852
+ st.error(drive_id)
853
 
854
+ if delete_after:
855
+ for path in downloaded:
856
+ try:
857
+ os.remove(path)
858
+ except Exception as e:
859
+ st.warning(f"Could not delete {path}: {e}")
860
+ st.info("Deleted original files after ZIP creation")
861
+ else:
862
+ st.warning("No files found.")
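main() drives the async DownloadManager from synchronous Streamlit callbacks by wrapping each action in a coroutine and calling asyncio.run on a button click. A stripped-down sketch of that pattern (DummyManager is a hypothetical stand-in for DownloadManager):

```python
# Illustrative sketch (not part of the commit): asyncio.run inside a Streamlit callback.
import asyncio
import streamlit as st

class DummyManager:
    async def __aenter__(self):
        return self
    async def __aexit__(self, *exc):
        return False
    async def deep_search(self, url):
        await asyncio.sleep(0.1)            # pretend to crawl the site
        return [{"filename": "report.pdf", "size": "1.2 MB"}]

async def run_search(url):
    async with DummyManager() as dm:
        return await dm.deep_search(url)

url = st.text_input("Enter URL", "https://example.com")
if st.button("Deep Search"):
    files = asyncio.run(run_search(url))    # one event loop per click
    st.session_state["discovered_files"] = files
    st.success(f"Found {len(files)} files!")
```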
863
 
864
  # Display current files if they exist in session state
865
  elif st.session_state.discovered_files: