euler314 committed on
Commit fd2ce95 · verified · 1 parent: 3672c09

Update app.py

Files changed (1)
  1. app.py +275 -368
app.py CHANGED
@@ -1,77 +1,7 @@
1
  import streamlit as st
2
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
3
 
4
- # Import other required packages
5
- import spacy
6
- import spacy.cli
7
- import os
8
-
9
- @st.cache_resource
10
- def load_models():
11
- try:
12
- # Try to load spaCy model
13
- try:
14
- nlp = spacy.load("en_core_web_sm")
15
- except OSError:
16
- st.info("Downloading spaCy model...")
17
- spacy.cli.download("en_core_web_sm")
18
- nlp = spacy.load("en_core_web_sm")
19
-
20
- # Load SentenceTransformer with offline handling
21
- try:
22
- from sentence_transformers import SentenceTransformer
23
- model_name = 'all-MiniLM-L6-v2'
24
- cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
25
- if os.path.exists(os.path.join(cache_dir, model_name)):
26
- semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
27
- else:
28
- st.warning(f"Downloading SentenceTransformer model {model_name}...")
29
- semantic_model = SentenceTransformer(model_name)
30
- except Exception as e:
31
- st.error(f"Error loading SentenceTransformer: {e}")
32
- st.info("Continuing without semantic search capability...")
33
- semantic_model = None
34
-
35
- # Load Transformers pipeline with offline handling
36
- try:
37
- from transformers import pipeline
38
- model_name = "facebook/bart-large-cnn"
39
- cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
40
- if os.path.exists(os.path.join(cache_dir, model_name)):
41
- summarizer = pipeline("summarization", model=model_name)
42
- else:
43
- st.warning(f"Downloading Transformer model {model_name}...")
44
- summarizer = pipeline("summarization")
45
- except Exception as e:
46
- st.error(f"Error loading Transformers: {e}")
47
- st.info("Continuing without summarization capability...")
48
- summarizer = None
49
-
50
- return nlp, semantic_model, summarizer
51
-
52
- except Exception as e:
53
- st.error(f"Error loading models: {e}")
54
- return None, None, None
55
-
56
- # Initialize models with better error handling
57
- with st.spinner("Loading models..."):
58
- nlp_model, semantic_model, summarizer = load_models()
59
-
60
- if nlp_model is None:
61
- st.error("Failed to load essential NLP model. The application cannot continue.")
62
- st.stop()
63
- else:
64
- # Continue with available features based on which models loaded successfully
65
- if semantic_model is None:
66
- st.warning("Semantic search features will be disabled.")
67
- if summarizer is None:
68
- st.warning("PDF summarization features will be disabled.")
69
-
70
-
71
- # Rest of your imports and code here...
72
-
73
- # Rest of your code...
74
-
75
  import os
76
  import subprocess
77
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
@@ -88,7 +18,28 @@ import zipfile
88
  import tempfile
89
  import mimetypes
90
  import requests
91
- # -------------------- Playwright Setup --------------------
92
  def install_playwright_dependencies():
93
  os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
94
  os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
@@ -126,23 +77,14 @@ def install_playwright_dependencies():
126
  except Exception as e:
127
  print(f"Error: {e}")
128
 
 
129
  install_playwright_dependencies()
130
 
131
- # -------------------- spaCy Model Setup --------------------
132
- import spacy
133
- import spacy.cli
134
- from spacy.language import Language
135
-
136
- @Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
137
- def dummy_roberta_transformer(nlp, name):
138
- def dummy(doc):
139
- return doc
140
- return dummy
141
-
142
  @st.cache_resource
143
  def load_models():
144
  try:
145
- # Load spaCy model
146
  try:
147
  nlp = spacy.load("en_core_web_sm")
148
  except OSError:
@@ -150,18 +92,30 @@ def load_models():
150
  spacy.cli.download("en_core_web_sm")
151
  nlp = spacy.load("en_core_web_sm")
152
 
153
- # Load SentenceTransformer
154
  try:
155
  from sentence_transformers import SentenceTransformer
156
- semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
157
  except Exception as e:
158
  st.error(f"Error loading SentenceTransformer: {e}")
159
  semantic_model = None
160
 
161
- # Load Transformers pipeline with correct import
162
  try:
163
- from transformers import pipeline, AutoModelForSeq2SeqGenerationWithLMHead, AutoTokenizer
164
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
165
  except Exception as e:
166
  st.error(f"Error loading Transformers: {e}")
167
  summarizer = None
@@ -172,55 +126,38 @@ def load_models():
172
  st.error(f"Error loading models: {e}")
173
  return None, None, None
174
 
175
- # Also load SentenceTransformer for semantic re-ranking.
176
- from sentence_transformers import SentenceTransformer, util
177
- @st.cache_resource
178
- def load_semantic_model():
179
- return SentenceTransformer('all-MiniLM-L6-v2')
180
-
181
- semantic_model = load_semantic_model()
182
-
183
- # -------------------- Transformers Summarization Setup --------------------
184
- from transformers import pipeline
185
- @st.cache_resource
186
- def load_summarizer():
187
- return pipeline("summarization")
188
-
189
- summarizer = load_summarizer()
190
 
191
- def summarize_pdf_url(pdf_url):
192
- try:
193
- with st.spinner("Downloading and processing PDF..."):
194
- response = requests.get(pdf_url, stream=True)
195
- temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
196
- with open(temp_pdf.name, "wb") as f:
197
- f.write(response.content)
198
- reader = PdfReader(temp_pdf.name)
199
- text = " ".join([page.extract_text() or "" for page in reader.pages])
200
- os.remove(temp_pdf.name)
201
- limited_text = text[:3000]
202
- summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
203
- return summary[0]["summary_text"]
204
- except Exception as e:
205
- return f"Error summarizing PDF: {e}"
206
 
207
- # -------------------- Google API Setup --------------------
208
- GOOGLE_OAUTH_CONFIG = {
209
- "web": {
210
- "client_id": "your_client_id",
211
- "project_id": "your_project_id",
212
- "auth_uri": "https://accounts.google.com/o/oauth2/auth",
213
- "token_uri": "https://oauth2.googleapis.com/token",
214
- "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
215
- "client_secret": "your_client_secret",
216
- "redirect_uris": ["your_redirect_uri"]
217
- }
218
- }
219
 
220
- import google_auth_oauthlib.flow
221
- import googleapiclient.discovery
222
- import google.auth.transport.requests
223
 
 
224
  def get_google_auth_url():
225
  client_config = GOOGLE_OAUTH_CONFIG["web"]
226
  flow = google_auth_oauthlib.flow.Flow.from_client_config(
@@ -253,76 +190,16 @@ def exchange_code_for_credentials(auth_code):
253
  except Exception as e:
254
  return None, f"Error during token exchange: {e}"
255
 
256
- # -------------------- Playwright Setup --------------------
257
- def install_playwright_dependencies():
258
- os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
259
- os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
260
  try:
261
- subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
262
  except Exception as e:
263
- st.error(f"Error installing Playwright: {e}")
264
-
265
- # Initialize Playwright dependencies
266
- install_playwright_dependencies()
267
-
268
- # -------------------- Logging Setup --------------------
269
- logging.basicConfig(
270
- filename='advanced_download_log.txt',
271
- level=logging.INFO,
272
- format='%(asctime)s - %(levelname)s - %(message)s'
273
- )
274
- logger = logging.getLogger()
275
-
276
- # -------------------- Shared Utils --------------------
277
- USER_AGENTS = [
278
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
279
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
280
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
281
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
282
- ]
283
-
284
- def get_random_user_agent():
285
- return random.choice(USER_AGENTS)
286
-
287
- def sizeof_fmt(num, suffix='B'):
288
- for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
289
- if abs(num) < 1024.0:
290
- return f"{num:3.1f}{unit}{suffix}"
291
- num /= 1024.0
292
- return f"{num:.1f}Y{suffix}"
293
- # ---------- Human-like Interactions -------------
294
- async def human_like_scroll(page):
295
- scroll_height = await page.evaluate('document.body.scrollHeight')
296
- viewport_height = await page.evaluate('window.innerHeight')
297
- current_scroll = 0
298
- while current_scroll < scroll_height:
299
- await page.evaluate(f'window.scrollTo(0, {current_scroll})')
300
- await asyncio.sleep(random.uniform(0.5, 1.5))
301
- current_scroll += viewport_height * random.uniform(0.5, 1.5)
302
- scroll_height = await page.evaluate('document.body.scrollHeight')
303
-
304
- async def human_like_interactions(page):
305
- await page.mouse.move(random.randint(0, 1000), random.randint(0, 1000))
306
- await asyncio.sleep(random.uniform(0.5, 1.5))
307
- await page.mouse.click(random.randint(0, 1000), random.randint(0, 1000))
308
- await asyncio.sleep(random.uniform(0.5, 1.5))
309
- await page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
310
- await asyncio.sleep(random.uniform(0.5, 1.5))
311
-
312
- # ---------- NLP Helpers -------------
313
- def nlp_preprocess(query: str) -> str:
314
- doc = nlp_model(query)
315
- tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
316
- processed = " ".join(tokens)
317
- return processed if processed.strip() else query
318
-
319
- def nlp_extract_entities(text: str):
320
- doc = nlp_model(text)
321
- return [(ent.text, ent.label_) for ent in doc.ents]
322
-
323
- # ---------- AI-enhanced Query Preprocessing -------------
324
- def ai_preprocess_query(query: str) -> str:
325
- return query
326
  class DownloadManager:
327
  def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
328
  self.use_proxy = use_proxy
@@ -336,9 +213,20 @@ class DownloadManager:
336
 
337
  async def __aenter__(self):
338
  self.playwright = await async_playwright().start()
339
- opts = {"headless": True}
340
  if self.use_proxy and self.proxy:
341
  opts["proxy"] = {"server": self.proxy}
 
342
  self.browser = await self.playwright.chromium.launch(**opts)
343
  self.context = await self.browser.new_context(user_agent=get_random_user_agent())
344
  self.page = await self.context.new_page()
@@ -391,10 +279,6 @@ class DownloadManager:
391
  response = await page.goto(url, wait_until='networkidle', timeout=30000)
392
  if response and response.headers.get('location'):
393
  return response.headers['location']
394
- content_type = response.headers.get('content-type', '')
395
- if 'text/html' not in content_type.lower():
396
- return url
397
- content = await page.content()
398
  return page.url
399
  except Exception as e:
400
  logger.error(f"Error extracting real download URL: {e}")
@@ -432,6 +316,23 @@ class DownloadManager:
432
 
433
  for a in soup.find_all('a', href=True):
434
  href = a['href'].strip()
435
  if any(href.lower().endswith(ext) for ext in all_exts):
436
  file_url = href if href.startswith('http') else (
437
  f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
@@ -449,6 +350,7 @@ class DownloadManager:
449
  'metadata': meta
450
  })
451
 
 
452
  elif ("drive.google.com" in href) or ("docs.google.com" in href):
453
  file_id = None
454
  for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
@@ -477,7 +379,15 @@ class DownloadManager:
477
  except Exception as e:
478
  logger.error(f"Error processing Google Drive link: {e}")
479
 
480
- return found_files
481
  except Exception as e:
482
  logger.error(f"Error extracting files from {url}: {e}")
483
  return []
@@ -531,75 +441,27 @@ class DownloadManager:
531
  logger.error(f"Error downloading {file_url}: {e}")
532
  return None
533
 
534
- async def search_bing(self):
535
- if not self.query:
536
- return [], []
537
-
538
- search_query = self.query
539
- if "filetype:pdf" not in search_query.lower():
540
- search_query += " filetype:pdf"
541
-
542
- search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
543
-
544
- try:
545
- await self.page.goto(search_url, timeout=30000)
546
- await self.page.wait_for_selector('li.b_algo', timeout=30000)
547
- await human_like_scroll(self.page)
548
-
549
- results = []
550
- elements = await self.page.query_selector_all('li.b_algo')
551
-
552
- for element in elements:
553
- link = await element.query_selector('h2 a')
554
- if link:
555
- url = await link.get_attribute('href')
556
- if url:
557
- results.append(url)
558
-
559
- return results[:self.num_results]
560
-
561
- except Exception as e:
562
- logger.error(f"Bing search error: {e}")
563
- return []
564
-
565
- async def get_sublinks(self, url, limit=100):
566
- try:
567
- await self.page.goto(url, timeout=30000)
568
- content = await self.page.content()
569
- soup = BeautifulSoup(content, 'html.parser')
570
-
571
- parsed_base = urlparse(url)
572
- base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
573
-
574
- links = set()
575
- for a in soup.find_all('a', href=True):
576
- href = a['href'].strip()
577
- if href.startswith('http'):
578
- links.add(href)
579
- elif href.startswith('/'):
580
- links.add(f"{base_url}{href}")
581
-
582
- return list(links)[:limit]
583
-
584
- except Exception as e:
585
- logger.error(f"Error getting sublinks: {e}")
586
- return []
587
-
588
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
589
  if not custom_ext_list:
590
  custom_ext_list = []
591
 
592
  progress_text = st.empty()
593
  progress_bar = st.progress(0)
 
594
 
595
  try:
596
  # Search main page
597
  progress_text.text("Analyzing main page...")
598
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
 
 
599
 
600
  # Get and search sublinks
601
  progress_text.text("Getting sublinks...")
602
  sublinks = await self.get_sublinks(url, sublink_limit)
603
 
604
  if not sublinks:
605
  progress_bar.progress(1.0)
@@ -607,32 +469,67 @@ class DownloadManager:
607
 
608
  # Process sublinks
609
  all_files = main_files
610
- total_links = len(sublinks)
611
 
612
  for i, sublink in enumerate(sublinks, 1):
 
613
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
614
- progress_bar.progress(i/total_links)
615
 
616
  sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
617
  all_files.extend(sub_files)
618
 
 
 
 
619
  # Make results unique
620
  seen_urls = set()
621
  unique_files = []
 
622
  for f in all_files:
623
  if f['url'] not in seen_urls:
624
  seen_urls.add(f['url'])
625
  unique_files.append(f)
626
 
627
- progress_text.text(f"Found {len(unique_files)} unique files")
 
 
628
  progress_bar.progress(1.0)
629
 
630
  return unique_files
631
 
632
  except Exception as e:
633
  logger.error(f"Deep search error: {e}")
 
634
  return []
635
 
636
  def main():
637
  if 'initialized' not in st.session_state:
638
  st.session_state.initialized = True
@@ -642,6 +539,7 @@ def main():
642
 
643
  st.title("Advanced File Downloader")
644
 
 
645
  with st.sidebar:
646
  st.header("Settings")
647
  mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
@@ -651,9 +549,28 @@ def main():
651
  "Custom File Extensions",
652
  placeholder=".csv, .txt, .epub"
653
  )
654
  use_proxy = st.checkbox("Use Proxy")
655
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
656
 
 
657
  if mode == "Manual URL":
658
  st.header("Manual URL Mode")
659
  url = st.text_input("Enter URL", placeholder="https://example.com")
@@ -662,74 +579,99 @@ def main():
662
  if url:
663
  async def run_deep_search():
664
  async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
665
- with st.spinner("Searching for files..."):
666
- files = await dm.deep_search(
667
- url=url,
668
- custom_ext_list=custom_extensions.split(',') if custom_extensions else []
669
- )
670
- st.session_state.discovered_files = files
671
- st.session_state.current_url = url
672
- return files
673
 
674
  files = asyncio.run(run_deep_search())
675
  if files:
676
  st.success(f"Found {len(files)} files!")
677
 
678
- with st.expander("Found Files", expanded=True):
679
- for i, file in enumerate(files):
680
- col1, col2 = st.columns([3, 1])
681
- with col1:
682
- st.write(f"{i+1}. {file['filename']}")
683
- with col2:
684
- st.write(f"Size: {file['size']}")
685
 
686
- # Download section
687
- st.subheader("Download Files")
688
  selected_files = st.multiselect(
689
  "Select files to download",
690
  range(len(files)),
 
691
  format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
692
  )
693
 
694
  if selected_files:
695
- col1, col2 = st.columns([3, 1])
696
  with col1:
697
  download_dir = st.text_input("Download Directory", value="./downloads")
698
  with col2:
699
- if st.button("Download Selected", use_container_width=True):
700
- async def download_files():
701
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
702
- paths = []
703
- progress_text = st.empty()
704
- progress_bar = st.progress(0)
705
 
706
- for i, idx in enumerate(selected_files):
707
- progress = (i + 1) / len(selected_files)
708
- progress_text.text(f"Downloading {files[idx]['filename']}...")
709
- progress_bar.progress(progress)
710
-
711
- path = await dm.download_file(
712
- files[idx],
713
- download_dir,
714
- url
715
- )
716
- if path:
717
- paths.append(path)
718
 
719
- progress_text.empty()
720
- progress_bar.empty()
721
- return paths
722
 
723
- downloaded = asyncio.run(download_files())
724
- if downloaded:
725
- st.success(f"Successfully downloaded {len(downloaded)} files to {download_dir}")
726
- # Create zip file if multiple files were downloaded
727
- if len(downloaded) > 1:
728
- zip_path = os.path.join(download_dir, "downloads.zip")
729
- with zipfile.ZipFile(zip_path, 'w') as zipf:
730
- for file in downloaded:
731
- zipf.write(file, os.path.basename(file))
732
- st.success(f"Created zip file: {zip_path}")
733
  else:
734
  st.warning("No files found.")
735
 
@@ -753,64 +695,18 @@ def main():
753
  st.success(f"Found {len(urls)} results!")
754
  for i, url in enumerate(urls, 1):
755
  with st.expander(f"Result {i}: {url}", expanded=i==1):
756
- if st.button(f"Deep Search This Result {i}"):
757
  files = await dm.deep_search(
758
  url=url,
759
- custom_ext_list=custom_extensions.split(',') if custom_extensions else []
 
760
  )
 
761
  if files:
762
  st.session_state.discovered_files = files
763
  st.session_state.current_url = url
764
  st.success(f"Found {len(files)} files!")
765
-
766
- with st.expander("Found Files", expanded=True):
767
- for j, file in enumerate(files):
768
- col1, col2 = st.columns([3, 1])
769
- with col1:
770
- st.write(f"{j+1}. {file['filename']}")
771
- with col2:
772
- st.write(f"Size: {file['size']}")
773
-
774
- selected_files = st.multiselect(
775
- "Select files to download",
776
- range(len(files)),
777
- format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
778
- )
779
-
780
- if selected_files:
781
- col1, col2 = st.columns([3, 1])
782
- with col1:
783
- download_dir = st.text_input("Download Directory", value="./downloads")
784
- with col2:
785
- if st.button("Download Selected Files"):
786
- progress_text = st.empty()
787
- progress_bar = st.progress(0)
788
-
789
- paths = []
790
- for k, idx in enumerate(selected_files):
791
- progress = (k + 1) / len(selected_files)
792
- progress_text.text(f"Downloading {files[idx]['filename']}...")
793
- progress_bar.progress(progress)
794
-
795
- path = await dm.download_file(
796
- files[idx],
797
- download_dir,
798
- url
799
- )
800
- if path:
801
- paths.append(path)
802
-
803
- progress_text.empty()
804
- progress_bar.empty()
805
-
806
- if paths:
807
- st.success(f"Successfully downloaded {len(paths)} files to {download_dir}")
808
- if len(paths) > 1:
809
- zip_path = os.path.join(download_dir, "downloads.zip")
810
- with zipfile.ZipFile(zip_path, 'w') as zipf:
811
- for file in paths:
812
- zipf.write(file, os.path.basename(file))
813
- st.success(f"Created zip file: {zip_path}")
814
  else:
815
  st.warning("No files found on this page.")
816
  else:
@@ -828,9 +724,20 @@ def main():
828
  if st.button("Summarize"):
829
  if pdf_url:
830
  with st.spinner("Generating summary..."):
831
- summary = summarize_pdf_url(pdf_url)
832
- st.write("Summary:")
833
- st.write(summary)
834
 
835
  if __name__ == "__main__":
836
  try:
 
1
  import streamlit as st
2
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
3
 
4
+ # Core imports
5
  import os
6
  import subprocess
7
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 
18
  import tempfile
19
  import mimetypes
20
  import requests
21
+ import datetime
22
+ import spacy
23
+ import spacy.cli
24
+ from spacy.language import Language
25
+ import google_auth_oauthlib.flow
26
+ import googleapiclient.discovery
27
+ import google.auth.transport.requests
28
+
29
+ # Google OAuth Configuration
30
+ GOOGLE_OAUTH_CONFIG = {
31
+ "web": {
32
+ "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
33
+ "project_id": "huggingface-449214",
34
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
35
+ "token_uri": "https://oauth2.googleapis.com/token",
36
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
37
+ "client_secret": "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",
38
+ "redirect_uris": ["https://euler314-craw-web.hf.space/"]
39
+ }
40
+ }
41
+
42
+ # Playwright Setup
43
  def install_playwright_dependencies():
44
  os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
45
  os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
 
77
  except Exception as e:
78
  print(f"Error: {e}")
79
 
80
+ # Initialize Playwright
81
  install_playwright_dependencies()
82
 
83
+ # Model Loading
84
  @st.cache_resource
85
  def load_models():
86
  try:
87
+ # Try to load spaCy model
88
  try:
89
  nlp = spacy.load("en_core_web_sm")
90
  except OSError:
 
92
  spacy.cli.download("en_core_web_sm")
93
  nlp = spacy.load("en_core_web_sm")
94
 
95
+ # Load SentenceTransformer with offline handling
96
  try:
97
  from sentence_transformers import SentenceTransformer
98
+ model_name = 'all-MiniLM-L6-v2'
99
+ cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
100
+ if os.path.exists(os.path.join(cache_dir, model_name)):
101
+ semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
102
+ else:
103
+ st.warning(f"Downloading SentenceTransformer model {model_name}...")
104
+ semantic_model = SentenceTransformer(model_name)
105
  except Exception as e:
106
  st.error(f"Error loading SentenceTransformer: {e}")
107
  semantic_model = None
108
 
109
+ # Load Transformers pipeline with offline handling
110
  try:
111
+ from transformers import pipeline
112
+ model_name = "facebook/bart-large-cnn"
113
+ cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
114
+ if os.path.exists(os.path.join(cache_dir, model_name)):
115
+ summarizer = pipeline("summarization", model=model_name)
116
+ else:
117
+ st.warning(f"Downloading Transformer model {model_name}...")
118
+ summarizer = pipeline("summarization")
119
  except Exception as e:
120
  st.error(f"Error loading Transformers: {e}")
121
  summarizer = None
 
126
  st.error(f"Error loading models: {e}")
127
  return None, None, None
128
 
129
+ # Initialize models
130
+ with st.spinner("Loading models..."):
131
+ nlp_model, semantic_model, summarizer = load_models()
132
 
133
+ # Utility Functions
134
+ def get_random_user_agent():
135
+ USER_AGENTS = [
136
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
137
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
138
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
139
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
140
+ ]
141
+ return random.choice(USER_AGENTS)
142
 
143
+ def sizeof_fmt(num, suffix='B'):
144
+ for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
145
+ if abs(num) < 1024.0:
146
+ return f"{num:3.1f}{unit}{suffix}"
147
+ num /= 1024.0
148
+ return f"{num:.1f}Y{suffix}"
149
 
150
+ def create_zip_file(file_paths, output_dir):
151
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
152
+ zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
153
+
154
+ with zipfile.ZipFile(zip_path, 'w') as zipf:
155
+ for file_path in file_paths:
156
+ zipf.write(file_path, os.path.basename(file_path))
157
+
158
+ return zip_path
159
 
160
+ # Google Drive Functions
161
  def get_google_auth_url():
162
  client_config = GOOGLE_OAUTH_CONFIG["web"]
163
  flow = google_auth_oauthlib.flow.Flow.from_client_config(
 
190
  except Exception as e:
191
  return None, f"Error during token exchange: {e}"
192
 
193
+ def google_drive_upload(zip_path: str, credentials):
 
194
  try:
195
+ drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
196
+ file_metadata = {'name': os.path.basename(zip_path)}
197
+ media = googleapiclient.http.MediaFileUpload(zip_path, resumable=True)
198
+ created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
199
+ return created.get("id", "")
200
  except Exception as e:
201
+ return f"Error uploading to Drive: {str(e)}"
202
+ # DownloadManager Class
203
  class DownloadManager:
204
  def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
205
  self.use_proxy = use_proxy
 
213
 
214
  async def __aenter__(self):
215
  self.playwright = await async_playwright().start()
216
+ opts = {
217
+ "headless": True,
218
+ "args": [
219
+ '--no-sandbox',
220
+ '--disable-setuid-sandbox',
221
+ '--disable-dev-shm-usage',
222
+ '--disable-gpu',
223
+ '--no-zygote',
224
+ '--single-process'
225
+ ]
226
+ }
227
  if self.use_proxy and self.proxy:
228
  opts["proxy"] = {"server": self.proxy}
229
+
230
  self.browser = await self.playwright.chromium.launch(**opts)
231
  self.context = await self.browser.new_context(user_agent=get_random_user_agent())
232
  self.page = await self.context.new_page()
 
279
  response = await page.goto(url, wait_until='networkidle', timeout=30000)
280
  if response and response.headers.get('location'):
281
  return response.headers['location']
282
  return page.url
283
  except Exception as e:
284
  logger.error(f"Error extracting real download URL: {e}")
 
316
 
317
  for a in soup.find_all('a', href=True):
318
  href = a['href'].strip()
319
+
320
+ # Handle PHP scripts and redirects
321
+ if '.php' in href.lower() or 'download' in href.lower():
322
+ full_url = href if href.startswith('http') else (
323
+ f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
324
+ )
325
+ real_url = await self.extract_real_download_url(full_url)
326
+ if real_url and real_url != full_url:
327
+ found_files.append({
328
+ 'url': real_url,
329
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
330
+ 'size': await self.get_file_size(real_url),
331
+ 'metadata': {}
332
+ })
333
+ continue
334
+
335
+ # Handle direct file links
336
  if any(href.lower().endswith(ext) for ext in all_exts):
337
  file_url = href if href.startswith('http') else (
338
  f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
 
350
  'metadata': meta
351
  })
352
 
353
+ # Handle Google Drive links
354
  elif ("drive.google.com" in href) or ("docs.google.com" in href):
355
  file_id = None
356
  for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
 
379
  except Exception as e:
380
  logger.error(f"Error processing Google Drive link: {e}")
381
 
382
+ # Make results unique based on URLs
383
+ seen_urls = set()
384
+ unique_files = []
385
+ for f in found_files:
386
+ if f['url'] not in seen_urls:
387
+ seen_urls.add(f['url'])
388
+ unique_files.append(f)
389
+
390
+ return unique_files
391
  except Exception as e:
392
  logger.error(f"Error extracting files from {url}: {e}")
393
  return []
 
441
  logger.error(f"Error downloading {file_url}: {e}")
442
  return None
443
 
444
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
445
  if not custom_ext_list:
446
  custom_ext_list = []
447
 
448
  progress_text = st.empty()
449
  progress_bar = st.progress(0)
450
+ file_count_text = st.empty()
451
 
452
  try:
453
  # Search main page
454
  progress_text.text("Analyzing main page...")
455
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
456
+ initial_count = len(main_files)
457
+ file_count_text.text(f"Found {initial_count} files on main page")
458
 
459
  # Get and search sublinks
460
  progress_text.text("Getting sublinks...")
461
  sublinks = await self.get_sublinks(url, sublink_limit)
462
+ total_links = len(sublinks)
463
+
464
+ progress_text.text(f"Found {total_links} sublinks to process")
465
 
466
  if not sublinks:
467
  progress_bar.progress(1.0)
 
469
 
470
  # Process sublinks
471
  all_files = main_files
 
472
 
473
  for i, sublink in enumerate(sublinks, 1):
474
+ progress = i/total_links
475
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
476
+ progress_bar.progress(progress)
477
 
478
  sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
479
  all_files.extend(sub_files)
480
 
481
+ # Update count in real-time
482
+ file_count_text.text(f"Found {len(all_files)} total files")
483
+
484
  # Make results unique
485
  seen_urls = set()
486
  unique_files = []
487
+
488
  for f in all_files:
489
  if f['url'] not in seen_urls:
490
  seen_urls.add(f['url'])
491
  unique_files.append(f)
492
 
493
+ final_count = len(unique_files)
494
+ progress_text.text(f"Deep search complete!")
495
+ file_count_text.text(f"Found {final_count} unique files")
496
  progress_bar.progress(1.0)
497
 
498
  return unique_files
499
 
500
  except Exception as e:
501
  logger.error(f"Deep search error: {e}")
502
+ progress_text.text(f"Error during deep search: {str(e)}")
503
  return []
504
+ finally:
505
+ # Clean up progress indicators after a delay
506
+ await asyncio.sleep(2)
507
+ if not st.session_state.get('keep_progress', False):
508
+ progress_text.empty()
509
+ progress_bar.empty()
510
 
511
+ async def get_sublinks(self, url, limit=100):
512
+ try:
513
+ await self.page.goto(url, timeout=30000)
514
+ content = await self.page.content()
515
+ soup = BeautifulSoup(content, 'html.parser')
516
+
517
+ parsed_base = urlparse(url)
518
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
519
+
520
+ links = set()
521
+ for a in soup.find_all('a', href=True):
522
+ href = a['href'].strip()
523
+ if href.startswith('http'):
524
+ links.add(href)
525
+ elif href.startswith('/'):
526
+ links.add(f"{base_url}{href}")
527
+
528
+ return list(links)[:limit]
529
+
530
+ except Exception as e:
531
+ logger.error(f"Error getting sublinks: {e}")
532
+ return []
533
  def main():
534
  if 'initialized' not in st.session_state:
535
  st.session_state.initialized = True
 
539
 
540
  st.title("Advanced File Downloader")
541
 
542
+ # Sidebar settings
543
  with st.sidebar:
544
  st.header("Settings")
545
  mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
 
549
  "Custom File Extensions",
550
  placeholder=".csv, .txt, .epub"
551
  )
552
+ max_sublinks = st.number_input(
553
+ "Maximum Sublinks to Process",
554
+ min_value=1,
555
+ max_value=10000,
556
+ value=100,
557
+ help="Maximum number of sublinks to process from the main page"
558
+ )
559
  use_proxy = st.checkbox("Use Proxy")
560
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
561
 
562
+ # Google Drive Integration
563
+ with st.expander("Google Drive Integration"):
564
+ if st.button("Start Google Sign-In"):
565
+ auth_url = get_google_auth_url()
566
+ st.markdown(f"[Click here to authorize]({auth_url})")
567
+
568
+ auth_code = st.text_input("Enter authorization code")
569
+ if st.button("Complete Sign-In") and auth_code:
570
+ creds, msg = exchange_code_for_credentials(auth_code)
571
+ st.session_state.google_creds = creds
572
+ st.write(msg)
573
+
574
  if mode == "Manual URL":
575
  st.header("Manual URL Mode")
576
  url = st.text_input("Enter URL", placeholder="https://example.com")
 
579
  if url:
580
  async def run_deep_search():
581
  async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
582
+ files = await dm.deep_search(
583
+ url=url,
584
+ custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
585
+ sublink_limit=max_sublinks
586
+ )
587
+ st.session_state.discovered_files = files
588
+ st.session_state.current_url = url
589
+ return files
590
 
591
  files = asyncio.run(run_deep_search())
592
  if files:
593
  st.success(f"Found {len(files)} files!")
594
 
595
+ # Select All/Clear Selection buttons
596
+ col1, col2 = st.columns([1, 4])
597
+ with col1:
598
+ if st.button("Select All"):
599
+ st.session_state.selected_files = list(range(len(files)))
600
+ if st.button("Clear Selection"):
601
+ st.session_state.selected_files = []
602
 
603
+ # File selection
 
604
  selected_files = st.multiselect(
605
  "Select files to download",
606
  range(len(files)),
607
+ default=st.session_state.get('selected_files', []),
608
  format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
609
  )
610
 
611
  if selected_files:
612
+ col1, col2, col3, col4 = st.columns(4)
613
  with col1:
614
  download_dir = st.text_input("Download Directory", value="./downloads")
615
  with col2:
616
+ create_zip = st.checkbox("Create ZIP file", value=True)
617
+ with col3:
618
+ delete_after = st.checkbox("Delete after creating ZIP")
619
+ with col4:
620
+ upload_to_drive = st.checkbox("Upload to Google Drive")
621
+
622
+ if st.button("Download Selected"):
623
+ if not os.path.exists(download_dir):
624
+ os.makedirs(download_dir)
625
+
626
+ async def download_files():
627
+ downloaded_paths = []
628
+ progress_bar = st.progress(0)
629
+ status_text = st.empty()
630
+
631
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
632
+ for i, idx in enumerate(selected_files):
633
+ progress = (i + 1) / len(selected_files)
634
+ file_info = files[idx]
635
 
636
+ status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
637
+ progress_bar.progress(progress)
638
 
639
+ path = await dm.download_file(
640
+ file_info,
641
+ download_dir,
642
+ url
643
+ )
644
+ if path:
645
+ downloaded_paths.append(path)
646
+
647
+ status_text.empty()
648
+ progress_bar.empty()
649
+ return downloaded_paths
650
+
651
+ downloaded = asyncio.run(download_files())
652
 
653
+ if downloaded:
654
+ st.success(f"Successfully downloaded {len(downloaded)} files")
655
+
656
+ if create_zip or upload_to_drive:
657
+ zip_path = create_zip_file(downloaded, download_dir)
658
+ st.success(f"Created ZIP file: {zip_path}")
659
+
660
+ if upload_to_drive and st.session_state.get('google_creds'):
661
+ with st.spinner("Uploading to Google Drive..."):
662
+ drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
663
+ if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
664
+ st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
665
+ else:
666
+ st.error(drive_id)
667
+
668
+ if delete_after:
669
+ for path in downloaded:
670
+ try:
671
+ os.remove(path)
672
+ except Exception as e:
673
+ st.warning(f"Could not delete {path}: {e}")
674
+ st.info("Deleted original files after ZIP creation")
675
  else:
676
  st.warning("No files found.")
677
 
 
695
  st.success(f"Found {len(urls)} results!")
696
  for i, url in enumerate(urls, 1):
697
  with st.expander(f"Result {i}: {url}", expanded=i==1):
698
+ if st.button(f"Deep Search Result {i}"):
699
  files = await dm.deep_search(
700
  url=url,
701
+ custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
702
+ sublink_limit=max_sublinks
703
  )
704
+ # Reuse the same file handling logic as Manual URL mode
705
  if files:
706
  st.session_state.discovered_files = files
707
  st.session_state.current_url = url
708
  st.success(f"Found {len(files)} files!")
709
+ # Add file selection and download UI here (same as Manual URL mode)
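The placeholder comment above defers the selection and download UI for Bing results to the Manual URL branch. As a rough sketch (not part of this commit), the shared widget could be factored into a small helper; render_file_selection is a hypothetical name, and the arguments mirror the Manual URL mode multiselect shown earlier in this diff:

import streamlit as st

def render_file_selection(files):
    # Mirror the Manual URL mode multiselect: label each entry with filename and size,
    # and seed the default from any previous "Select All" state in session_state.
    return st.multiselect(
        "Select files to download",
        range(len(files)),
        default=st.session_state.get('selected_files', []),
        format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
    )

Both branches could then call render_file_selection(files) before the download/ZIP options; the commit itself leaves this branch empty.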
710
  else:
711
  st.warning("No files found on this page.")
712
  else:
 
724
  if st.button("Summarize"):
725
  if pdf_url:
726
  with st.spinner("Generating summary..."):
727
+ try:
728
+ response = requests.get(pdf_url, stream=True)
729
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
730
+ with open(temp_pdf.name, "wb") as f:
731
+ f.write(response.content)
732
+ reader = PdfReader(temp_pdf.name)
733
+ text = " ".join([page.extract_text() or "" for page in reader.pages])
734
+ os.remove(temp_pdf.name)
735
+ limited_text = text[:3000]
736
+ summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
737
+ st.write("Summary:")
738
+ st.write(summary[0]['summary_text'])
739
+ except Exception as e:
740
+ st.error(f"Error summarizing PDF: {e}")
741
 
742
  if __name__ == "__main__":
743
  try: