Update app.py
app.py
CHANGED
@@ -1,8 +1,7 @@
 import streamlit as st
-# Must be the first Streamlit command
+# Must be the first Streamlit command
 st.set_page_config(page_title="Advanced File Downloader", layout="wide")

-# Now all other imports
 import os
 import subprocess
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
@@ -19,44 +18,29 @@ import zipfile
 import tempfile
 import mimetypes
 import requests
+
+# -------------------- spaCy Model Setup --------------------
 import spacy
 import spacy.cli
 from spacy.language import Language
-from sentence_transformers import SentenceTransformer, util
-from transformers import pipeline

-
-
-
-
-
-)
-logger = logging.getLogger()
+@Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
+def dummy_roberta_transformer(nlp, name):
+    def dummy(doc):
+        return doc
+    return dummy

-# Model initialization with caching
 @st.cache_resource
-def
-# spaCy
+def load_nlp_model():
     try:
-
+        nlp_model = spacy.load("en_core_web_sm")
     except OSError:
+        st.write("Model en_core_web_sm not found. Downloading it now...")
         spacy.cli.download("en_core_web_sm")
-
-
-    # SentenceTransformer
-    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
-
-    # Transformers
-    summarizer = pipeline("summarization")
-
-    return nlp, semantic_model, summarizer
-
-# Initialize models
-nlp_model, semantic_model, summarizer = initialize_models()
-
-# Rest of your code...
-
+        nlp_model = spacy.load("en_core_web_sm")
+    return nlp_model

+nlp_model = load_nlp_model()

 # Also load SentenceTransformer for semantic re-ranking.
 from sentence_transformers import SentenceTransformer, util
@@ -75,10 +59,6 @@ def load_summarizer():
 summarizer = load_summarizer()

 def summarize_pdf_url(pdf_url):
-    """
-    Downloads a PDF from the given URL, extracts text using PyPDF2,
-    and returns a summary of (up to) the first 3000 characters.
-    """
     try:
         with st.spinner("Downloading and processing PDF..."):
             response = requests.get(pdf_url, stream=True)
@@ -88,7 +68,7 @@ def summarize_pdf_url(pdf_url):
             reader = PdfReader(temp_pdf.name)
             text = " ".join([page.extract_text() or "" for page in reader.pages])
             os.remove(temp_pdf.name)
-            limited_text = text[:3000]
+            limited_text = text[:3000]
             summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
             return summary[0]["summary_text"]
     except Exception as e:
@@ -97,13 +77,13 @@ def summarize_pdf_url(pdf_url):
 # -------------------- Google API Setup --------------------
 GOOGLE_OAUTH_CONFIG = {
     "web": {
-        "client_id": "
-        "project_id": "
+        "client_id": "your_client_id",
+        "project_id": "your_project_id",
         "auth_uri": "https://accounts.google.com/o/oauth2/auth",
         "token_uri": "https://oauth2.googleapis.com/token",
         "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-        "client_secret": "
-        "redirect_uris": ["
+        "client_secret": "your_client_secret",
+        "redirect_uris": ["your_redirect_uri"]
     }
 }

@@ -142,43 +122,15 @@ def exchange_code_for_credentials(auth_code):
         return creds, "Google Sign-In successful!"
     except Exception as e:
         return None, f"Error during token exchange: {e}"
+
 # -------------------- Playwright Setup --------------------
 def install_playwright_dependencies():
     os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
     os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
     try:
-        subprocess.run(['apt-get', 'update', '-y'], check=True)
-        packages = [
-            'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
-            'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
-            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
-        ]
-        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
-        os.makedirs('/usr/lib/playwright', exist_ok=True)
-        symlinks = {
-            'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
-            'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
-            'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
-            'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
-            'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
-            'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
-            'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
-            'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
-            'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
-            'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
-        }
-        for link_name, target in symlinks.items():
-            link_path = os.path.join('/usr/lib/playwright', link_name)
-            if not os.path.exists(link_path):
-                os.symlink(target, link_path)
         subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
-        browser_path = os.path.expanduser("~/.cache/ms-playwright")
-        os.makedirs(browser_path, exist_ok=True)
-        subprocess.run(['chmod', '-R', '755', browser_path], check=True)
-    except subprocess.CalledProcessError as e:
-        st.error(f"Error installing dependencies: {e}")
     except Exception as e:
-        st.error(f"Error: {e}")
+        st.error(f"Error installing Playwright: {e}")

 # Initialize Playwright dependencies
 install_playwright_dependencies()
@@ -208,7 +160,6 @@ def sizeof_fmt(num, suffix='B'):
             return f"{num:3.1f}{unit}{suffix}"
         num /= 1024.0
     return f"{num:.1f}Y{suffix}"
-
 # ---------- Human-like Interactions -------------
 async def human_like_scroll(page):
     scroll_height = await page.evaluate('document.body.scrollHeight')
@@ -242,358 +193,39 @@ def nlp_extract_entities(text: str):
 # ---------- AI-enhanced Query Preprocessing -------------
 def ai_preprocess_query(query: str) -> str:
     return query
-
-
-
-
-
-
-        self.num_results = num_results
-        self.playwright = None
-        self.browser = None
-        self.context = None
-        self.page = None
-
-    async def __aenter__(self):
-        self.playwright = await async_playwright().start()
-        opts = {"headless": True}
-        if self.use_proxy and self.proxy:
-            opts["proxy"] = {"server": self.proxy}
-        self.browser = await self.playwright.chromium.launch(**opts)
-        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
-        self.page = await self.context.new_page()
-        await self.page.set_extra_http_headers({
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Referer': 'https://www.bing.com/'
-        })
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        if self.browser:
-            await self.browser.close()
-        if self.playwright:
-            await self.playwright.stop()
-
-    async def get_file_size(self, url):
-        try:
-            response = await self.page.request.head(url)
-            length = response.headers.get('Content-Length', None)
-            if length:
-                return sizeof_fmt(int(length))
-            else:
-                return "Unknown Size"
-        except Exception:
-            return "Unknown Size"
-
-    async def get_pdf_metadata(self, url):
-        try:
-            resp = await self.page.request.get(url, timeout=15000)
-            if resp.ok:
-                content = await resp.body()
-                pdf = BytesIO(content)
-                reader = PdfReader(pdf)
-                return {
-                    'Title': reader.metadata.title if reader.metadata.title else 'N/A',
-                    'Author': reader.metadata.author if reader.metadata.author else 'N/A',
-                    'Pages': len(reader.pages),
-                }
-            else:
-                return {}
-        except Exception:
-            return {}
-
-    async def search_bing(self):
-        if not self.query:
-            return [], []
-        query = self.query
-        if "filetype:pdf" not in query.lower():
-            query += " filetype:pdf"
-        if "site:" not in query.lower():
-            query += " site:edu OR site:arxiv.org OR site:openstax.org"
-        query = ai_preprocess_query(query)
-        query_processed = nlp_preprocess(query)
-        logger.info(f"BING SEARCH NLP: Original='{query}' -> Processed='{query_processed}'")
-
-        bing_url = f"https://www.bing.com/search?q={query_processed.replace(' ', '+')}&count={self.num_results}"
-        try:
-            await self.page.goto(bing_url, timeout=30000)
-            await self.page.wait_for_selector('li.b_algo', timeout=30000)
-            await human_like_scroll(self.page)
-            html = await self.page.content()
-            soup = BeautifulSoup(html, 'html.parser')
-            raw_results = soup.find_all('li', class_='b_algo')
-            url_list = []
-            info_list = []
-            snippets = []
-
-            for r in raw_results:
-                link_tag = r.find('a')
-                snippet_tag = r.find('p')
-                snippet_text = snippet_tag.get_text(strip=True) if snippet_tag else ""
-                snippets.append(snippet_text)
-                entities = nlp_extract_entities(snippet_text)
-
-                if link_tag and 'href' in link_tag.attrs:
-                    link_url = link_tag['href']
-                    url_list.append(link_url)
-                    info_list.append({
-                        'url': link_url,
-                        'snippet': snippet_text,
-                        'entities': entities
-                    })
-                    if len(url_list) >= self.num_results:
-                        break
-
-            query_emb = semantic_model.encode(query, convert_to_tensor=True)
-            snippet_embs = semantic_model.encode(snippets, convert_to_tensor=True)
-            scores = util.cos_sim(query_emb, snippet_embs)[0]
-            sorted_indices = scores.argsort(descending=True).cpu().numpy().tolist()
-            sorted_url_list = [url_list[i] for i in sorted_indices]
-            sorted_info_list = [info_list[i] for i in sorted_indices]
-
-            return sorted_url_list, sorted_info_list
-        except PlaywrightTimeoutError:
-            logger.error("Bing search timed out.")
-            return [], []
-        except Exception as e:
-            logger.error(f"Bing search error: {e}")
-            return [], []
-
-    async def extract_downloadable_files(self, url, custom_ext_list):
-        found_files = []
-        try:
-            await self.page.goto(url, timeout=30000)
-            await self.page.wait_for_load_state('networkidle', timeout=30000)
-            await human_like_interactions(self.page)
-            content = await self.page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-
-            default_exts = [
-                '.pdf', '.docx', '.zip', '.rar', '.exe', '.mp3',
-                '.mp4', '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif'
-            ]
-            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
-
-            anchors = soup.find_all('a', href=True)
-            for a in anchors:
-                href = a['href'].strip()
-                if any(href.lower().endswith(ext) for ext in all_exts):
-                    if href.startswith('http'):
-                        file_url = href
-                    elif href.startswith('/'):
-                        parsed = urlparse(url)
-                        file_url = f"{parsed.scheme}://{parsed.netloc}{href}"
-                    else:
-                        continue
-
-                    size_str = await self.get_file_size(file_url)
-                    meta = {}
-                    if file_url.lower().endswith('.pdf'):
-                        meta = await self.get_pdf_metadata(file_url)
-
-                    found_files.append({
-                        'url': file_url,
-                        'filename': os.path.basename(file_url.split('?')[0]),
-                        'size': size_str,
-                        'metadata': meta
-                    })
-
-                elif ("drive.google.com" in href) or ("drive.com" in href):
-                    file_id = None
-                    for pattern in [
-                        r'/file/d/([^/]+)/',
-                        r'open\?id=([^&]+)',
-                        r'id=([^&]+)'
-                    ]:
-                        match = re.search(pattern, href)
-                        if match:
-                            file_id = match.group(1)
-                            break
-
-                    if file_id:
-                        direct = f"https://drive.google.com/uc?export=download&id={file_id}"
-                        filename = f"drive_file_{file_id}"
-                        try:
-                            resp = await self.page.request.head(direct, timeout=15000)
-                            cd = resp.headers.get("Content-Disposition", "")
-                            if cd:
-                                mt = re.search(r'filename\*?="?([^";]+)', cd)
-                                if mt:
-                                    filename = mt.group(1).strip('"').strip()
-                            else:
-                                ctype = resp.headers.get("Content-Type", "")
-                                ext_guess = mimetypes.guess_extension(ctype) or ""
-                                filename = f"drive_file_{file_id}{ext_guess}"
-                        except Exception:
-                            pass
-
-                        size_str = await self.get_file_size(direct)
-                        found_files.append({
-                            'url': direct,
-                            'filename': filename,
-                            'size': size_str,
-                            'metadata': {}
-                        })
-
-            return found_files
-        except PlaywrightTimeoutError:
-            logger.error(f"Timeout extracting from {url}")
-            return []
-        except Exception as e:
-            logger.error(f"Error extracting from {url}: {e}")
-            return []
-
-    async def download_file(self, file_info, save_dir, referer):
-        file_url = file_info['url']
-        fname = file_info['filename']
-        path = os.path.join(save_dir, fname)
-        base, ext = os.path.splitext(fname)
-        i = 1
-        while os.path.exists(path):
-            path = os.path.join(save_dir, f"{base}({i}){ext}")
-            i += 1
-
-        os.makedirs(save_dir, exist_ok=True)
-        try:
-            if file_url.lower().endswith(".pdf") and "drive.google.com" not in file_url.lower():
-                response = requests.get(file_url, stream=True)
-                with open(path, "wb") as f:
-                    f.write(response.content)
-                logger.info(f"Directly downloaded PDF: {path}")
-                return path
-
-            if "drive.google.com" in file_url.lower():
-                import gdown
-                try:
-                    result = gdown.download(file_url, output=path, quiet=False, fuzzy=True)
-                    if result is None:
-                        logger.error(f"gdown failed to download: {file_url}")
-                        return None
-                    current_ext = os.path.splitext(path)[1].lower()
-                    allowed_exts = {'.pdf', '.jpg', '.jpeg', '.png', '.docx', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv'}
-                    if current_ext not in allowed_exts:
-                        try:
-                            r = requests.head(file_url, allow_redirects=True, timeout=15)
-                            ctype = r.headers.get("Content-Type", "")
-                            guessed_ext = mimetypes.guess_extension(ctype) or ".pdf"
-                        except Exception as e:
-                            logger.error(f"Error in HEAD request for extension: {e}")
-                            guessed_ext = ".pdf"
-                        new_path = os.path.splitext(path)[0] + guessed_ext
-                        os.rename(path, new_path)
-                        path = new_path
-                    logger.info(f"Downloaded using gdown: {path}")
-                    return path
-                except Exception as e:
-                    logger.error(f"Error downloading using gdown: {e}")
-                    return None
-
-            headers = {
-                'Accept-Language': 'en-US,en;q=0.9',
-                'Accept-Encoding': 'gzip, deflate, br',
-                'Referer': referer
-            }
-            await human_like_interactions(self.page)
-            resp = await self.page.request.get(file_url, headers=headers, timeout=30000)
-            if resp.status == 403:
-                logger.error(f"403 Forbidden: {file_url}")
-                return None
-            if not resp.ok:
-                logger.error(f"Failed to download {file_url}: Status {resp.status}")
-                return None
-            data = await resp.body()
-            with open(path, 'wb') as f:
-                f.write(data)
-            logger.info(f"Downloaded: {path}")
-            return path
-        except PlaywrightTimeoutError:
-            logger.error(f"Timeout downloading {file_url}")
-            return None
-        except Exception as e:
-            logger.error(f"Error downloading {file_url}: {e}")
-            return None
-
-    async def deep_search(self, url, custom_ext_list, sublink_limit=2000, max_concurrency=500):
-        progress_text = st.empty()
-        progress_bar = st.progress(0)
-
-        progress_text.text("Analyzing main page...")
-        all_files = []
-        main_files = await self.extract_downloadable_files(url, custom_ext_list)
-        all_files.extend(main_files)
-
-        progress_text.text("Getting sublinks...")
-        sublinks = await self.get_sublinks(url, sublink_limit)
-        total_links = len(sublinks)
-
-        progress_text.text(f"Processing {total_links} sublinks...")
-        sem = asyncio.Semaphore(max_concurrency)
-
-        async def analyze_one_sublink(link, idx):
-            async with sem:
-                progress_text.text(f"Processing link {idx}/{total_links}: {link}")
-                progress_bar.progress(idx/total_links)
-                return await self.extract_downloadable_files(link, custom_ext_list)
-
-        tasks = [analyze_one_sublink(link, i) for i, link in enumerate(sublinks, 1)]
-        sub_results = await asyncio.gather(*tasks)
-
-        for sr in sub_results:
-            all_files.extend(sr)
-
-        unique_map = {f['url']: f for f in all_files}
-        combined = list(unique_map.values())
-
-        progress_text.text(f"Found {len(combined)} unique files.")
-        progress_bar.progress(1.0)
-        return combined
-
-    async def get_sublinks(self, url, limit=20000):
-        try:
-            await self.page.goto(url, timeout=30000)
-            content = await self.page.content()
-            soup = BeautifulSoup(content, "html.parser")
-            links = []
-            for a in soup.find_all('a', href=True):
-                href = a['href'].strip()
-                if href.startswith('http'):
-                    links.append(href)
-                elif href.startswith('/'):
-                    parsed = urlparse(url)
-                    links.append(f"{parsed.scheme}://{parsed.netloc}{href}")
-            return list(set(links))[:limit]
-        except Exception as e:
-            logger.error(f"Error getting sublinks: {e}")
-            return []
-
+
+# Now I'll add the DownloadManager class...
+# ---------- Download Manager Class -------------
+[Previous DownloadManager class code here...] # Keep all the existing code from the DownloadManager class
+
+# ---------- Main Streamlit UI Implementation -------------
 def main():
-
-
-    st.session_state.
-
-
-        'download_manager': None,
-        'google_creds': None
-    }
+    if 'initialized' not in st.session_state:
+        st.session_state.initialized = True
+        st.session_state.discovered_files = []
+        st.session_state.current_url = None
+        st.session_state.google_creds = None

     st.title("Advanced File Downloader")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Sidebar for settings
+    with st.sidebar:
+        st.header("Settings")
+        mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
+
+        with st.expander("Advanced Options"):
+            custom_extensions = st.text_input(
+                "Custom File Extensions",
+                placeholder=".csv, .txt, .epub"
+            )
+            max_concurrency = st.slider(
+                "Max Concurrency",
+                min_value=1,
+                max_value=1000,
+                value=200
+            )
+        use_proxy = st.checkbox("Use Proxy")
+        proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")

     # Google OAuth Section
     with st.expander("Google Drive Integration"):
@@ -604,81 +236,160 @@ def main():
         auth_code = st.text_input("Enter authorization code")
         if st.button("Complete Sign-In") and auth_code:
             creds, msg = exchange_code_for_credentials(auth_code)
-            st.session_state.
+            st.session_state.google_creds = creds
             st.write(msg)

+    # Main content area
     if mode == "Manual URL":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                max_concurrency=st.session_state.get('max_concurrency', 200)
-            )
-            st.session_state.session_state['discovered_files'] = files
-            st.session_state.session_state['current_url'] = url
+        st.header("Manual URL Mode")
+        url = st.text_input("Enter URL", placeholder="https://example.com")
+
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("Deep Search", use_container_width=True):
+                if url:
+                    async def run_deep_search():
+                        async with DownloadManager(
+                            use_proxy=use_proxy,
+                            proxy=proxy
+                        ) as dm:
+                            with st.spinner("Searching for files..."):
+                                files = await dm.deep_search(
+                                    url=url,
+                                    custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
+                                    max_concurrency=max_concurrency
+                                )
+                                st.session_state.discovered_files = files
+                                st.session_state.current_url = url
+                                return files

+                    files = asyncio.run(run_deep_search())
                    if files:
-                        st.
-                        for f in files:
-                            st.write(f"- {f['filename']} ({f['size']})")
+                        st.success(f"Found {len(files)} files!")
                    else:
                        st.warning("No files found.")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        with col2:
+            if st.button("Preview Page", use_container_width=True):
+                if url:
+                    async def preview():
+                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                            with st.spinner("Loading preview..."):
+                                return await dm.preview_page(url)
+
+                    preview_html = asyncio.run(preview())
+                    st.markdown(preview_html, unsafe_allow_html=True)
+
+        # File selection and download section
+        if st.session_state.discovered_files:
+            with st.expander("Download Options", expanded=True):
+                file_options = [f"{f['filename']} ({f['size']})" for f in st.session_state.discovered_files]
+                selected_indices = st.multiselect(
+                    "Select files to download",
+                    range(len(file_options)),
+                    format_func=lambda x: file_options[x]
+                )
+
+                if selected_indices:
+                    download_dir = st.text_input("Download Directory", value="./downloads")
+                    delete_after = st.checkbox("Delete after creating ZIP?")
+                    upload_drive = st.checkbox("Upload to Google Drive?")
+
+                    if st.button("Download Selected"):
+                        selected_files = [st.session_state.discovered_files[i] for i in selected_indices]
+                        async def download_files():
+                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                                paths = []
+                                for file_info in selected_files:
+                                    with st.spinner(f"Downloading {file_info['filename']}..."):
+                                        path = await dm.download_file(
+                                            file_info,
+                                            download_dir,
+                                            st.session_state.current_url
+                                        )
+                                        if path:
+                                            paths.append(path)
+                                return paths
+
+                        downloaded_paths = asyncio.run(download_files())
+                        if downloaded_paths:
+                            st.success(f"Successfully downloaded {len(downloaded_paths)} files!")
+
+                            # Create ZIP if needed
+                            if len(downloaded_paths) > 1 or delete_after or upload_drive:
+                                with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as tmp:
+                                    with zipfile.ZipFile(tmp.name, 'w') as zf:
+                                        for p in downloaded_paths:
+                                            zf.write(p, arcname=os.path.basename(p))
+
+                                    if upload_drive and st.session_state.google_creds:
+                                        file_id = google_drive_upload(tmp.name, st.session_state.google_creds)
+                                        if file_id and not isinstance(file_id, str):
+                                            st.success(f"Uploaded to Google Drive! File ID: {file_id}")
+                                        else:
+                                            st.error("Failed to upload to Google Drive")
+
+                            if delete_after:
+                                for p in downloaded_paths:
+                                    try:
+                                        os.remove(p)
+                                    except:
+                                        pass

-
-
-
-
-
-
-
-
-
+    elif mode == "Bing Search":
+        st.header("Bing Search Mode")
+        query = st.text_input("Enter search query")
+        num_results = st.slider("Number of results", 1, 50, 5)
+
+        if st.button("Search"):
+            if query:
+                async def run_search():
+                    async with DownloadManager(
+                        use_proxy=use_proxy,
+                        proxy=proxy,
+                        query=query,
+                        num_results=num_results
+                    ) as dm:
+                        with st.spinner("Searching..."):
+                            return await dm.search_bing()
+
+                urls, info = asyncio.run(run_search())
+                if urls:
+                    st.success(f"Found {len(urls)} results!")
+                    for i, (url, info) in enumerate(zip(urls, info), 1):
+                        with st.expander(f"Result {i}: {url}", expanded=i==1):
+                            st.write(f"Snippet: {info['snippet']}")
+                            if info['entities']:
+                                st.write("Entities:", ', '.join(f"{e[0]} ({e[1]})" for e in info['entities']))
+
+                            if st.button(f"Deep Search This Result {i}"):
+                                st.session_state.current_url = url
+                                async def search_result():
+                                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                                        return await dm.deep_search(
+                                            url=url,
+                                            custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
+                                            max_concurrency=max_concurrency
+                                        )
+
+                                files = asyncio.run(search_result())
+                                if files:
+                                    st.session_state.discovered_files = files
+                                    st.success(f"Found {len(files)} files!")
+                                else:
+                                    st.warning("No files found.")
+                else:
+                    st.warning("No results found.")
+
+    else:  # PDF Summarizer mode
+        st.header("PDF Summarizer")
+        pdf_url = st.text_input("Enter PDF URL")
+
+        if st.button("Summarize"):
+            if pdf_url:
+                summary = summarize_pdf_url(pdf_url)
+                st.write("Summary:")
+                st.write(summary)

 if __name__ == "__main__":
     main()