Update app.py
app.py (CHANGED)
@@ -26,6 +26,18 @@ import google_auth_oauthlib.flow
 import googleapiclient.discovery
 import google.auth.transport.requests
 from async_timeout import timeout as async_timeout
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+import schedule
+import threading
+import time
+import hashlib
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+from sklearn.cluster import KMeans
+import numpy as np
+
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     filename='advanced_download_log.txt',
@@ -33,7 +45,7 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
-
+
 GOOGLE_OAUTH_CONFIG = {
     "web": {
         "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
@@ -49,49 +61,22 @@ GOOGLE_OAUTH_CONFIG = {
 # Playwright Setup
 def install_playwright_dependencies():
     os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
-
-
-
-
-
-
-
-
-        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
-        os.makedirs('/usr/lib/playwright', exist_ok=True)
-        symlinks = {
-            'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
-            'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
-            'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
-            'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
-            'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
-            'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
-            'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
-            'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
-            'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
-            'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
-        }
-        for link_name, target in symlinks.items():
-            link_path = os.path.join('/usr/lib/playwright', link_name)
-            if not os.path.exists(link_path):
-                os.symlink(target, link_path)
-        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
-        browser_path = os.path.expanduser("~/.cache/ms-playwright")
-        os.makedirs(browser_path, exist_ok=True)
-        subprocess.run(['chmod', '-R', '755', browser_path], check=True)
-    except subprocess.CalledProcessError as e:
-        print(f"Error installing dependencies: {e}")
-    except Exception as e:
-        print(f"Error: {e}")
+    subprocess.run(['apt-get', 'update', '-y'], check=True)
+    packages = [
+        'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
+        'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
+        'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
+    ]
+    subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
+    subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
 
-# Initialize Playwright
 install_playwright_dependencies()
 
 # Model Loading
 @st.cache_resource
 def load_models():
     try:
-        #
+        # Load spaCy model
         try:
             nlp = spacy.load("en_core_web_sm")
         except OSError:
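Note: the rewritten install_playwright_dependencies() shells out to apt-get unconditionally, which only succeeds as root on a Debian/Ubuntu base image. A minimal guarded variant could look like the sketch below (the permission and availability checks are assumptions, not part of this commit):

import os
import shutil
import subprocess

def install_playwright_dependencies_safe():
    """Sketch: only attempt apt-get when it exists and we are root."""
    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
    if shutil.which('apt-get') and os.geteuid() == 0:
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends',
                        'libnss3', 'libnspr4', 'libatk1.0-0', 'libatk-bridge2.0-0',
                        'libcups2', 'libxcomposite1', 'libxdamage1', 'libgbm1'], check=True)
    # Downloading the browser itself does not require root.
    subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)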
@@ -99,43 +84,26 @@ def load_models():
             spacy.cli.download("en_core_web_sm")
             nlp = spacy.load("en_core_web_sm")
 
-        # Load SentenceTransformer
+        # Load SentenceTransformer
         try:
-
-            model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
-            cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
-            if os.path.exists(os.path.join(cache_dir, model_name)):
-                semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
-            else:
-                st.warning(f"Downloading SentenceTransformer model {model_name}...")
-                semantic_model = SentenceTransformer(model_name)
+            semantic_model = SentenceTransformer('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')
         except Exception as e:
             st.error(f"Error loading SentenceTransformer: {e}")
             semantic_model = None
 
-        # Load Transformers pipeline
+        # Load Transformers pipeline
         try:
-
-            model_name = "facebook/bart-large-cnn"
-            cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
-            if os.path.exists(os.path.join(cache_dir, model_name)):
-                summarizer = pipeline("summarization", model=model_name)
-            else:
-                st.warning(f"Downloading Transformer model {model_name}...")
-                summarizer = pipeline("summarization")
+            summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         except Exception as e:
             st.error(f"Error loading Transformers: {e}")
             summarizer = None
 
         return nlp, semantic_model, summarizer
-
     except Exception as e:
         st.error(f"Error loading models: {e}")
         return None, None, None
 
-
-with st.spinner("Loading models..."):
-    nlp_model, semantic_model, summarizer = load_models()
+nlp_model, semantic_model, summarizer = load_models()
 
 # Utility Functions
 def get_random_user_agent():
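Note: 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B' is a distilled chat model rather than a sentence-embedding checkpoint, so loading it through SentenceTransformer may be slow or fail and the try/except will silently set semantic_model to None. A hedged fallback sketch (the fallback model choice is an assumption, not part of this commit):

from sentence_transformers import SentenceTransformer

def load_semantic_model(preferred='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
                        fallback='sentence-transformers/all-MiniLM-L6-v2'):
    """Sketch: try the configured model, fall back to a small embedding model."""
    try:
        return SentenceTransformer(preferred)
    except Exception:
        return SentenceTransformer(fallback)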
@@ -157,11 +125,9 @@ def sizeof_fmt(num, suffix='B'):
 def create_zip_file(file_paths, output_dir):
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
     zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
-
     with zipfile.ZipFile(zip_path, 'w') as zipf:
         for file_path in file_paths:
             zipf.write(file_path, os.path.basename(file_path))
-
     return zip_path
 
 # Google Drive Functions
@@ -197,16 +163,23 @@ def exchange_code_for_credentials(auth_code):
     except Exception as e:
         return None, f"Error during token exchange: {e}"
 
-def google_drive_upload(
+def google_drive_upload(file_path, credentials, folder_id=None):
     try:
         drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
-        file_metadata = {'name': os.path.basename(
-
+        file_metadata = {'name': os.path.basename(file_path)}
+        if folder_id:
+            file_metadata['parents'] = [folder_id]
+        media = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
         created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
         return created.get("id", "")
     except Exception as e:
         return f"Error uploading to Drive: {str(e)}"
-
+
+def create_drive_folder(drive_service, name):
+    folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
+    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
+    return folder.get('id')
+
 # DownloadManager Class
 class DownloadManager:
     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
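Note: with the new folder_id parameter the two Drive helpers compose. The upload path also relies on googleapiclient.http.MediaFileUpload, so googleapiclient.http must be importable (whether it is imported elsewhere in app.py is not visible in this diff). A hedged usage sketch, assuming valid credentials in creds:

# Sketch: create a folder, then upload a ZIP into it (credential acquisition elided).
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=creds)
folder_id = create_drive_folder(drive_service, "Downloads_example.com")
file_id = google_drive_upload("./downloads/downloads_20240101_120000.zip", creds, folder_id)
print("Uploaded file id:", file_id)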
@@ -234,7 +207,6 @@ class DownloadManager:
             }
             if self.use_proxy and self.proxy:
                 opts["proxy"] = {"server": self.proxy}
-
             self.browser = await self.playwright.chromium.launch(**opts)
             self.context = await self.browser.new_context(user_agent=get_random_user_agent())
             self.page = await self.context.new_page()
@@ -257,14 +229,11 @@ class DownloadManager:
             search_url = f"https://www.bing.com/search?q={self.query}"
             await self.page.goto(search_url, timeout=30000)
             await self.page.wait_for_load_state('networkidle')
-
-            # Extract search result links
             links = await self.page.query_selector_all("li.b_algo h2 a")
             for link in links[:self.num_results]:
                 href = await link.get_attribute('href')
                 if href:
                     urls.append(href)
-
             return urls
         except Exception as e:
             logger.error(f"Error searching Bing: {e}")
@@ -335,7 +304,8 @@ class DownloadManager:
             soup = BeautifulSoup(content, 'html.parser')
 
             default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
-
+                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
+                            '.pptx', '.odt', '.txt']
             all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
 
             parsed_base = urlparse(final_url)
@@ -344,11 +314,8 @@ class DownloadManager:
             for a in soup.find_all('a', href=True):
                 href = a['href'].strip()
 
-                # Handle PHP scripts and redirects
                 if '.php' in href.lower() or 'download' in href.lower():
-                    full_url = href if href.startswith('http') else (
-                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
-                    )
+                    full_url = href if href.startswith('http') else f"{base_url}{href}"
                     real_url = await self.extract_real_download_url(full_url)
                     if real_url and real_url != full_url:
                         found_files.append({
@@ -359,17 +326,12 @@ class DownloadManager:
                         })
                         continue
 
-                # Handle direct file links
                 if any(href.lower().endswith(ext) for ext in all_exts):
-                    file_url = href if href.startswith('http') else (
-                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
-                    )
-
+                    file_url = href if href.startswith('http') else f"{base_url}{href}"
                     size_str = await self.get_file_size(file_url)
                     meta = {}
                     if file_url.lower().endswith('.pdf'):
                         meta = await self.get_pdf_metadata(file_url)
-
                     found_files.append({
                         'url': file_url,
                         'filename': os.path.basename(file_url.split('?')[0]),
@@ -385,7 +347,6 @@ class DownloadManager:
                     if match:
                         file_id = match.group(1)
                         break
-
                 if file_id:
                     direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
                     filename = file_id
@@ -396,7 +357,6 @@ class DownloadManager:
                             mt = re.search(r'filename\*?="?([^";]+)', cd)
                             if mt:
                                 filename = mt.group(1).strip('"').strip()
-
                     found_files.append({
                         'url': direct_url,
                         'filename': filename,
@@ -406,14 +366,12 @@ class DownloadManager:
             except Exception as e:
                 logger.error(f"Error processing Google Drive link: {e}")
 
-            # Make results unique based on URLs
             seen_urls = set()
             unique_files = []
             for f in found_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
-
             return unique_files
         except Exception as e:
             logger.error(f"Error extracting files from {url}: {e}")
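Note: this URL-based de-duplication is repeated verbatim in deep_search below; a small order-preserving helper would let both call sites share one implementation. A sketch (helper name is hypothetical, not in this commit):

def dedupe_by_url(files):
    """Keep the first occurrence of each 'url', preserving order."""
    seen = set()
    unique = []
    for f in files:
        if f['url'] not in seen:
            seen.add(f['url'])
            unique.append(f)
    return unique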
@@ -428,106 +386,29 @@ class DownloadManager:
         while os.path.exists(path):
             path = os.path.join(save_dir, f"{base}_{counter}{ext}")
             counter += 1
-
         os.makedirs(save_dir, exist_ok=True)
-
         try:
             if "drive.google.com" in file_url:
                 import gdown
-
-
-
-
-                if not ext or ext == "":
-                    # Try to determine file type from content-type header
-                    async with self.context.new_page() as page:
-                        response = await page.request.head(file_url, timeout=15000)
-                        content_type = response.headers.get('Content-Type', '')
-
-                        # Map content types to extensions
-                        extension_map = {
-                            'application/pdf': '.pdf',
-                            'image/jpeg': '.jpg',
-                            'image/png': '.png',
-                            'application/msword': '.doc',
-                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
-                            'application/zip': '.zip',
-                            'text/plain': '.txt',
-                            'application/vnd.ms-excel': '.xls',
-                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
-                            'video/mp4': '.mp4',
-                            'audio/mpeg': '.mp3',
-                            'video/x-msvideo': '.avi',
-                            'video/x-matroska': '.mkv'
-                        }
-
-                        # Get extension from content type or use .bin as fallback
-                        ext = extension_map.get(content_type.split(';')[0], '.bin')
-                        path = os.path.join(save_dir, f"{base}{ext}")
-
-                    # Handle name collisions
-                    counter = 1
-                    while os.path.exists(path):
-                        path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-                        counter += 1
-
-                    output = gdown.download(file_url, path, quiet=False)
-                    if output:
-                        return path
-                    return None
-                except Exception as e:
-                    logger.error(f"Google Drive download error: {e}")
-                    return None
-
+                output = gdown.download(file_url, path, quiet=False)
+                if output:
+                    return path
+                return None
             async with self.context.new_page() as page:
-                st.write(f"Downloading: {fname}")
-
                 headers = {
                     'Accept': '*/*',
                     'Accept-Encoding': 'gzip, deflate, br',
                     'Referer': referer
                 }
-
                 response = await page.request.get(file_url, headers=headers, timeout=30000)
-
                 if response.status == 200:
                     content = await response.body()
-
-                    # Check if we need to add an extension based on content type
-                    if not ext or ext == "":
-                        content_type = response.headers.get('Content-Type', '')
-                        extension_map = {
-                            'application/pdf': '.pdf',
-                            'image/jpeg': '.jpg',
-                            'image/png': '.png',
-                            'application/msword': '.doc',
-                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
-                            'application/zip': '.zip',
-                            'text/plain': '.txt',
-                            'application/vnd.ms-excel': '.xls',
-                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
-                            'video/mp4': '.mp4',
-                            'audio/mpeg': '.mp3',
-                            'video/x-msvideo': '.avi',
-                            'video/x-matroska': '.mkv'
-                        }
-
-                        ext = extension_map.get(content_type.split(';')[0], '.bin')
-                        path = os.path.join(save_dir, f"{base}{ext}")
-
-                        # Handle name collisions again
-                        counter = 1
-                        while os.path.exists(path):
-                            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-                            counter += 1
-
                     with open(path, 'wb') as f:
                         f.write(content)
                     return path
                 else:
                     logger.error(f"Download failed with status {response.status}: {file_url}")
                     return None
-
         except Exception as e:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
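Note: the rewritten Google Drive branch passes the page URL straight to gdown.download. gdown is usually happier with an explicit file id, or with fuzzy matching for share links (available in recent gdown releases). A hedged sketch of a more tolerant call, not part of this commit:

import re
import gdown

def download_drive_file(file_url, path):
    """Sketch: prefer an explicit Drive file id, fall back to fuzzy URL matching."""
    m = re.search(r'/file/d/([^/]+)|[?&]id=([^&]+)', file_url)
    file_id = next((g for g in (m.groups() if m else ()) if g), None)
    if file_id:
        return gdown.download(id=file_id, output=path, quiet=False)
    return gdown.download(url=file_url, output=path, quiet=False, fuzzy=True)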
@@ -535,65 +416,45 @@ class DownloadManager:
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
         if not custom_ext_list:
             custom_ext_list = []
-
         progress_text = st.empty()
         progress_bar = st.progress(0)
         file_count_text = st.empty()
-
         try:
-            # Search main page
             progress_text.text("Analyzing main page...")
             main_files = await self.extract_downloadable_files(url, custom_ext_list)
             initial_count = len(main_files)
             file_count_text.text(f"Found {initial_count} files on main page")
-
-            # Get and search sublinks
             progress_text.text("Getting sublinks...")
             sublinks = await self.get_sublinks(url, sublink_limit)
             total_links = len(sublinks)
-
             progress_text.text(f"Found {total_links} sublinks to process")
-
             if not sublinks:
                 progress_bar.progress(1.0)
                 return main_files
-
-            # Process sublinks
             all_files = main_files
-
             for i, sublink in enumerate(sublinks, 1):
-                progress = i/total_links
+                progress = i / total_links
                 progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                 progress_bar.progress(progress)
-
                 sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                 all_files.extend(sub_files)
-
-                # Update count in real-time
                 file_count_text.text(f"Found {len(all_files)} total files")
-
-            # Make results unique
             seen_urls = set()
             unique_files = []
-
             for f in all_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
-
             final_count = len(unique_files)
             progress_text.text(f"Deep search complete!")
             file_count_text.text(f"Found {final_count} unique files")
             progress_bar.progress(1.0)
-
             return unique_files
-
         except Exception as e:
             logger.error(f"Deep search error: {e}")
             progress_text.text(f"Error during deep search: {str(e)}")
             return []
         finally:
-            # Clean up progress indicators after a delay
             await asyncio.sleep(2)
             if not st.session_state.get('keep_progress', False):
                 progress_text.empty()
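Note: deep_search still accepts a timeout argument but never applies it to the per-sublink work. A sketch of enforcing it with asyncio.wait_for inside the loop (skipping the sublink on timeout is an assumption about the desired behaviour):

# Inside the sublink loop, a per-link timeout could look like this:
try:
    sub_files = await asyncio.wait_for(
        self.extract_downloadable_files(sublink, custom_ext_list),
        timeout=timeout
    )
    all_files.extend(sub_files)
except asyncio.TimeoutError:
    logger.warning(f"Timed out after {timeout}s on sublink: {sublink}")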
@@ -604,10 +465,8 @@ class DownloadManager:
             await self.page.goto(url, timeout=30000)
             content = await self.page.content()
             soup = BeautifulSoup(content, 'html.parser')
-
             parsed_base = urlparse(url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-
             links = set()
             for a in soup.find_all('a', href=True):
                 href = a['href'].strip()
@@ -615,56 +474,48 @@ class DownloadManager:
                     links.add(href)
                 elif href.startswith('/'):
                     links.add(f"{base_url}{href}")
-
             return list(links)[:limit]
-
         except Exception as e:
             logger.error(f"Error getting sublinks: {e}")
             return []
 
-
-
-
-
+# Utility Functions for New Features
+def extract_keywords(text, n=5):
+    doc = nlp_model(text)
+    keywords = [token.text for token in doc if token.is_alpha and not token.is_stop][:n]
+    return keywords
+
+def analyze_sentiment(text):
+    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+    result = sentiment_analyzer(text[:512])[0]
+    return result['label'], result['score']
+
+def get_file_hash(file_path):
+    hasher = hashlib.md5()
+    with open(file_path, 'rb') as f:
+        hasher.update(f.read())
+    return hasher.hexdigest()
+
+# Main Function
 def main():
-    # Initialize session state on first run
     if 'initialized' not in st.session_state:
         st.session_state.initialized = True
         st.session_state.discovered_files = []
         st.session_state.current_url = None
         st.session_state.google_creds = None
         st.session_state.selected_files = []
-        st.session_state.do_deep_search = False
-        st.session_state.deep_search_url = None
-        st.session_state.search_results = []
+        st.session_state.do_deep_search = False
+        st.session_state.deep_search_url = None
+        st.session_state.search_results = []
 
     st.title("Advanced File Downloader")
-
-    # Sidebar configuration
+
     with st.sidebar:
         mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
         with st.expander("Advanced Options", expanded=True):
-            custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input")
-            max_sublinks = st.number_input(
-
-                min_value=1,
-                max_value=100000,
-                value=10000,
-                step=50,
-                help="Maximum number of sublinks to process from the main page",
-                key="max_sublinks_input"
-            )
-            sublink_timeout = st.number_input(
-                "Search Timeout (seconds per sublink)",
-                min_value=1,
-                max_value=3000,
-                value=30,
-                step=5,
-                help="Maximum time to spend searching each sublink",
-                key="timeout_input"
-            )
+            custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
+            max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
+            sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
             use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
             proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
         with st.expander("Google Drive Integration", expanded=False):
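Note: get_file_hash is added here but not yet wired into the download flow. One plausible use (an assumption, not something this commit does) is skipping files whose MD5 already exists in the download directory:

def is_duplicate_download(new_path, download_dir):
    """Sketch: compare a new file's MD5 against files already in download_dir."""
    new_hash = get_file_hash(new_path)
    for name in os.listdir(download_dir):
        existing = os.path.join(download_dir, name)
        if existing != new_path and os.path.isfile(existing):
            if get_file_hash(existing) == new_hash:
                return True
    return False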
@@ -676,158 +527,54 @@ def main():
                 creds, msg = exchange_code_for_credentials(auth_code)
                 st.session_state.google_creds = creds
                 st.write(msg)
-
-    # Manual URL mode
+
     if mode == "Manual URL":
         st.header("Manual URL Mode")
         url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
-
         col1, col2 = st.columns([3, 1])
         with col1:
             if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
                 if url:
+                    custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+                    valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
+                    if custom_ext_list != valid_ext_list:
+                        st.warning("Invalid extensions ignored. Use format like '.csv'.")
                     async def run_deep_search():
-
-
-
-                                url=url,
-                                custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
-                                sublink_limit=int(max_sublinks),
-                                timeout=int(sublink_timeout)
-                            )
-                            return files
-                        except Exception as e:
-                            st.error(f"Error during deep search: {str(e)}")
-                            return None
-
+                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                            files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
+                            return files
                     files = asyncio.run(run_deep_search())
                     if files:
-                        # Save all discovered files—even duplicates
                         st.session_state.discovered_files = files
                         st.session_state.current_url = url
                         st.success(f"Found {len(files)} files!")
-
-                        # File selection block (Select/Clear)
-                        col1, col2 = st.columns([1, 4])
-                        with col1:
-                            if st.button("Select All", key="select_all_btn"):
-                                st.session_state.selected_files = list(range(len(files)))
-                                safe_rerun()
-                            if st.button("Clear Selection", key="clear_selection_btn"):
-                                st.session_state.selected_files = []
-                                safe_rerun()
-
-                        selected_files = st.multiselect(
-                            "Select files to download",
-                            options=list(range(len(files))),
-                            default=st.session_state.selected_files,
-                            format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
-                            key="file_multiselect"
-                        )
-                        st.session_state.selected_files = selected_files
-
-                        if selected_files:
-                            col1, col2, col3, col4 = st.columns(4)
-                            with col1:
-                                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
-                            with col2:
-                                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
-                            with col3:
-                                delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
-                            with col4:
-                                upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
-
-                            if st.button("Download Selected", key="download_btn"):
-                                if not os.path.exists(download_dir):
-                                    os.makedirs(download_dir)
-
-                                async def download_files():
-                                    downloaded_paths = []
-                                    progress_bar = st.progress(0)
-                                    status_text = st.empty()
-                                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                                        for i, idx in enumerate(selected_files):
-                                            progress = (i + 1) / len(selected_files)
-                                            file_info = files[idx]
-                                            status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
-                                            progress_bar.progress(progress)
-                                            # Download the file (ensure DownloadManager.download_file downloads duplicates)
-                                            path = await dm.download_file(file_info, download_dir, url)
-                                            if path:
-                                                downloaded_paths.append(path)
-                                    status_text.empty()
-                                    progress_bar.empty()
-                                    return downloaded_paths
-
-                                downloaded = asyncio.run(download_files())
-
-                                if downloaded:
-                                    st.success(f"Successfully downloaded {len(downloaded)} files")
-                                    # If the user chose to create a ZIP, generate it and offer a download button
-                                    if create_zip:
-                                        zip_path = create_zip_file(downloaded, download_dir)
-                                        st.success(f"Created ZIP file: {zip_path}")
-                                        with open(zip_path, "rb") as f:
-                                            zip_data = f.read()
-                                        st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
-
-                                        if upload_to_drive and st.session_state.get('google_creds'):
-                                            with st.spinner("Uploading to Google Drive..."):
-                                                drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
-                                            if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                                st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                                            else:
-                                                st.error(drive_id)
-                                        if delete_after:
-                                            for path in downloaded:
-                                                try:
-                                                    os.remove(path)
-                                                except Exception as e:
-                                                    st.warning(f"Could not delete {path}: {e}")
-                                            st.info("Deleted original files after ZIP creation")
-                                    else:
-                                        # Otherwise, generate an individual download button for each file
-                                        for path in downloaded:
-                                            with open(path, "rb") as f:
-                                                file_data = f.read()
-                                            st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
                     else:
                         st.warning("No files found.")
-
-        # If files were discovered in a previous search, show them here as well.
+
         if st.session_state.discovered_files:
             files = st.session_state.discovered_files
             st.success(f"Found {len(files)} files!")
             col1, col2 = st.columns([1, 4])
             with col1:
-                if st.button("Select All", key="
+                if st.button("Select All", key="select_all_btn"):
                     st.session_state.selected_files = list(range(len(files)))
-
-                if st.button("Clear Selection", key="clear_selection_btn2"):
+                if st.button("Clear Selection", key="clear_selection_btn"):
                     st.session_state.selected_files = []
-
-            selected_files = st.multiselect(
-                "Select files to download",
-                options=list(range(len(files))),
-                default=st.session_state.selected_files,
-                format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
-                key="file_multiselect2"
-            )
+            selected_files = st.multiselect("Select files to download", options=list(range(len(files))), default=st.session_state.selected_files, format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})", key="file_multiselect")
             st.session_state.selected_files = selected_files
             if selected_files:
                 col1, col2, col3, col4 = st.columns(4)
                 with col1:
-                    download_dir = st.text_input("Download Directory", value="./downloads", key="
+                    download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
                 with col2:
-                    create_zip = st.checkbox("Create ZIP file", value=True, key="
+                    create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
                 with col3:
-                    delete_after = st.checkbox("Delete after creating ZIP", key="
+                    delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
                 with col4:
-                    upload_to_drive = st.checkbox("Upload to Google Drive", key="
+                    upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
-                if st.button("Download Selected", key="
+                if st.button("Download Selected", key="download_btn"):
                     if not os.path.exists(download_dir):
                         os.makedirs(download_dir)
-
                     async def download_files():
                         downloaded_paths = []
                         progress_bar = st.progress(0)
@@ -838,7 +585,7 @@ def main():
                                 file_info = files[idx]
                                 status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
                                 progress_bar.progress(progress)
-                                path = await dm.download_file(file_info, download_dir,
+                                path = await dm.download_file(file_info, download_dir, url)
                                 if path:
                                     downloaded_paths.append(path)
                         status_text.empty()
@@ -853,13 +600,14 @@ def main():
                             with open(zip_path, "rb") as f:
                                 zip_data = f.read()
                             st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
-                            if upload_to_drive and st.session_state.
-
-
-
-
-
-
+                            if upload_to_drive and st.session_state.google_creds:
+                                drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds)
+                                folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}")
+                                drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id)
+                                if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
+                                    st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
+                                else:
+                                    st.error(drive_id)
                             if delete_after:
                                 for path in downloaded:
                                     try:
@@ -872,163 +620,29 @@ def main():
                             with open(path, "rb") as f:
                                 file_data = f.read()
                             st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
-
+
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
         query = st.text_input("Enter search query", key="search_query_input")
        num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
-
-        # Check if deep search was requested
-        if st.session_state.get('do_deep_search', False):
-            url_to_search = st.session_state.get('deep_search_url')
-            st.write(f"Running deep search on: {url_to_search}")
-
-            async def perform_deep_search():
-                async with DownloadManager(
-                    use_proxy=use_proxy,
-                    proxy=proxy
-                ) as dm:
-                    files = await dm.deep_search(
-                        url=url_to_search,
-                        custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
-                        sublink_limit=max_sublinks,
-                        timeout=sublink_timeout
-                    )
-                    if files:
-                        st.session_state.discovered_files = files
-                        st.session_state.current_url = url_to_search
-                        st.session_state.selected_files = []
-                    else:
-                        st.warning("No files found on this page.")
-
-            # Clear the deep search flag after execution
-            st.session_state.do_deep_search = False
-
-            asyncio.run(perform_deep_search())
-
         if st.button("Search", key="search_btn"):
             if query:
                 async def run_search():
-                    async with DownloadManager(
-                        use_proxy=use_proxy,
-                        proxy=proxy,
-                        query=query,
-                        num_results=num_results
-                    ) as dm:
+                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy, query=query, num_results=num_results) as dm:
                         with st.spinner("Searching..."):
                             urls = await dm.search_bing()
                             if urls:
-                                st.session_state.search_results = urls
+                                st.session_state.search_results = urls
                                 st.success(f"Found {len(urls)} results!")
                                 for i, url in enumerate(urls, 1):
                                     with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
                                         if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
-                                            st.session_state.deep_search_url = url
-                                            st.session_state.do_deep_search = True
-                                            safe_rerun() # Rerun to apply state change
+                                            st.session_state.deep_search_url = url
+                                            st.session_state.do_deep_search = True
                             else:
                                 st.warning("No search results found.")
                 asyncio.run(run_search())
-
-        # Display search results if they exist
-        if hasattr(st.session_state, 'search_results') and st.session_state.search_results:
-            urls = st.session_state.search_results
-            st.success(f"Found {len(urls)} results!")
-            for i, url in enumerate(urls, 1):
-                with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
-                    if st.button(f"Deep Search Result {i}", key=f"deep_search_result_saved_{i}"):
-                        st.session_state.deep_search_url = url
-                        st.session_state.do_deep_search = True
-                        safe_rerun()
-
-        # If files were discovered in a previous search, show them
-        if st.session_state.discovered_files:
-            files = st.session_state.discovered_files
-            st.success(f"Found {len(files)} files on {st.session_state.current_url}!")
-
-            # File selection and download UI
-            col1, col2 = st.columns([1, 4])
-            with col1:
-                if st.button("Select All", key="bing_select_all_btn"):
-                    st.session_state.selected_files = list(range(len(files)))
-                    safe_rerun()
-                if st.button("Clear Selection", key="bing_clear_selection_btn"):
-                    st.session_state.selected_files = []
-                    safe_rerun()
-
-            selected_files = st.multiselect(
-                "Select files to download",
-                options=list(range(len(files))),
-                default=st.session_state.selected_files,
-                format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
-                key="bing_file_multiselect"
-            )
-            st.session_state.selected_files = selected_files
-
-            if selected_files:
-                col1, col2, col3, col4 = st.columns(4)
-                with col1:
-                    download_dir = st.text_input("Download Directory", value="./downloads", key="bing_download_dir_input")
-                with col2:
-                    create_zip = st.checkbox("Create ZIP file", value=True, key="bing_create_zip_checkbox")
-                with col3:
-                    delete_after = st.checkbox("Delete after creating ZIP", key="bing_delete_after_checkbox")
-                with col4:
-                    upload_to_drive = st.checkbox("Upload to Google Drive", key="bing_upload_drive_checkbox")
-
-                if st.button("Download Selected", key="bing_download_btn"):
-                    if not os.path.exists(download_dir):
-                        os.makedirs(download_dir)
-
-                    async def download_files():
-                        downloaded_paths = []
-                        progress_bar = st.progress(0)
-                        status_text = st.empty()
-                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                            for i, idx in enumerate(selected_files):
-                                progress = (i + 1) / len(selected_files)
-                                file_info = files[idx]
-                                status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
-                                progress_bar.progress(progress)
-                                path = await dm.download_file(file_info, download_dir, st.session_state.current_url)
-                                if path:
-                                    downloaded_paths.append(path)
-                        status_text.empty()
-                        progress_bar.empty()
-                        return downloaded_paths
-
-                    downloaded = asyncio.run(download_files())
-
-                    if downloaded:
-                        st.success(f"Successfully downloaded {len(downloaded)} files")
-                        if create_zip:
-                            zip_path = create_zip_file(downloaded, download_dir)
-                            st.success(f"Created ZIP file: {zip_path}")
-                            with open(zip_path, "rb") as f:
-                                zip_data = f.read()
-                            st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
-
-                            if upload_to_drive and st.session_state.get('google_creds'):
-                                with st.spinner("Uploading to Google Drive..."):
-                                    drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
-                                if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                    st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                                else:
-                                    st.error(drive_id)
-
-                            if delete_after:
-                                for path in downloaded:
-                                    try:
-                                        os.remove(path)
-                                    except Exception as e:
-                                        st.warning(f"Could not delete {path}: {e}")
-                                st.info("Deleted original files after ZIP creation")
-                        else:
-                            for path in downloaded:
-                                with open(path, "rb") as f:
-                                    file_data = f.read()
-                                st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
-
+
     else: # PDF Summarizer mode
         if summarizer is None:
             st.error("PDF summarization is not available due to model loading errors.")
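Note: the removed Bing-mode block relied on a safe_rerun() helper plus the do_deep_search/deep_search_url session flags to trigger a deep search on the next script run; the new code keeps the flags but the rerun helper is not shown anywhere in this diff. A typical shim (a sketch; the actual helper in app.py may differ) simply bridges Streamlit's rerun API rename:

def safe_rerun():
    """Rerun the Streamlit script regardless of Streamlit version."""
    if hasattr(st, "rerun"):
        st.rerun()
    elif hasattr(st, "experimental_rerun"):
        st.experimental_rerun()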
@@ -1046,15 +660,10 @@ def main():
                 reader = PdfReader(temp_pdf.name)
                 text = " ".join([page.extract_text() or "" for page in reader.pages])
                 os.remove(temp_pdf.name)
-
-
-                st.write("Summary:")
-                st.write(summary[0]['summary_text'])
+                summary = summarizer(text[:3000], max_length=200, min_length=50, do_sample=False)
+                st.write("Summary:", summary[0]['summary_text'])
             except Exception as e:
                 st.error(f"Error summarizing PDF: {e}")
 
 if __name__ == "__main__":
-
-    main()
-    except Exception as e:
-        st.error(f"An error occurred: {str(e)}")
+    main()
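Note: the new summarizer call truncates the extracted PDF text to 3,000 characters, so long documents lose everything after the first few pages. A hedged sketch of chunked summarization with the same facebook/bart-large-cnn pipeline (chunk size and the simple join are assumptions, not part of this commit):

def summarize_long_text(summarizer, text, chunk_chars=3000):
    """Sketch: summarize each chunk separately, then join the partial summaries."""
    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partials = [
        summarizer(chunk, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        for chunk in chunks if chunk.strip()
    ]
    return " ".join(partials)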