euler314 committed
Commit 4b9057c · verified · 1 Parent(s): 697d57f

Upload 9 files

Files changed (8)
  1. .dockerignore +19 -0
  2. Dockerfile +92 -0
  3. Privacy Policy.txt +34 -0
  4. README.md +5 -5
  5. Terms of Service.txt +37 -0
  6. app.py +674 -0
  7. app_hf.py +5 -0
  8. requirements.txt +13 -0
.dockerignore ADDED
@@ -0,0 +1,19 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.log
+ .pytest_cache
+ .env
+ .venv
Dockerfile ADDED
@@ -0,0 +1,92 @@
+ FROM ubuntu:22.04
+
+ USER root
+
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV PIP_ROOT_USER_ACTION=ignore
+ ENV HOME=/home/user
+ ENV PLAYWRIGHT_BROWSERS_PATH=${HOME}/.cache/ms-playwright
+ ENV LD_LIBRARY_PATH=/usr/lib/playwright:/usr/lib/x86_64-linux-gnu
+ ENV GRADIO_NODE_PORT=disabled
+
+ RUN useradd -m -d /home/user user && \
+     mkdir -p ${HOME}/.cache/ms-playwright && \
+     mkdir -p /usr/lib/playwright && \
+     chown -R user:user ${HOME}/.cache && \
+     chmod -R 755 ${HOME}/.cache
+
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+         python3.11 \
+         python3-pip \
+         python3.11-dev \
+         wget \
+         unzip \
+         ca-certificates \
+         libnss3 \
+         libnss3-tools \
+         libnspr4 \
+         libatk1.0-0 \
+         libatk-bridge2.0-0 \
+         libatspi2.0-0 \
+         libcups2 \
+         libxcomposite1 \
+         libxdamage1 \
+         libxrandr2 \
+         libxkbcommon0 \
+         libx11-xcb1 \
+         libxcursor1 \
+         libxi6 \
+         libxss1 \
+         libxtst6 \
+         libasound2 \
+         libx11-6 \
+         libxcb1 \
+         libxext6 \
+         libxfixes3 \
+         libxrender1 \
+         libdbus-1-3 \
+         libdrm2 \
+         libpango-1.0-0 \
+         fonts-liberation \
+         fonts-noto-color-emoji \
+         gcc && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN ln -s /usr/lib/x86_64-linux-gnu/libnss3.so /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libnssutil3.so /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libsmime3.so /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libnspr4.so /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libatk-1.0.so.0 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libcups.so.2 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libatspi.so.0 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libXcomposite.so.1 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libXdamage.so.1 /usr/lib/playwright/
+
+ WORKDIR /app
+
+ COPY requirements.txt ./
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip3 install playwright==1.30.0
+
+ RUN cd ${HOME}/.cache/ms-playwright && \
+     wget -q https://playwright.azureedge.net/builds/chromium/1045/chromium-linux.zip && \
+     unzip chromium-linux.zip && \
+     rm chromium-linux.zip && \
+     chmod -R 755 ${HOME}/.cache/ms-playwright
+
+ COPY . .
+
+ RUN chown -R user:user /app && \
+     chmod -R 755 /app && \
+     chmod -R 755 ${HOME}/.cache/ms-playwright && \
+     chmod -R 755 /usr/lib/playwright
+
+ USER user
+
+ CMD ["python3", "app.py"]
Privacy Policy.txt ADDED
@@ -0,0 +1,34 @@
+ Effective Date: 1/28/2025
+
+ Welcome to Craw_Web ("we," "our," or "us"). Your privacy is important to us. This Privacy Policy explains how we collect, use, disclose, and safeguard your information when you use our platform.
+
+ 1. Information We Collect
+ Personal Information: Name, email address, phone number, or any other personal data you provide while using Craw_Web.
+ Usage Data: IP address, browser type, operating system, device information, and details about your interactions with Craw_Web, such as the pages visited or actions performed.
+ 2. How We Use Your Information
+ We use the information we collect for the following purposes:
+
+ To operate and maintain Craw_Web.
+ To personalize your experience and deliver content and features based on your preferences.
+ To communicate with you, including responding to inquiries or providing updates about Craw_Web.
+ To improve Craw_Web by analyzing usage trends and gathering feedback.
+ 3. Sharing Your Information
+ We do not sell or rent your personal information to third parties. We may share information in the following circumstances:
+
+ With service providers who assist in operating Craw_Web (e.g., hosting, analytics).
+ To comply with legal obligations, such as responding to a court order or regulatory request.
+ To protect the rights, safety, and security of Craw_Web and its users.
+ 4. Data Security
+ We implement reasonable security measures to protect your data. However, no method of transmission over the internet is entirely secure. Therefore, we cannot guarantee absolute security.
+
+ 5. Your Rights
+ You may have rights under applicable data protection laws, including:
+
+ The right to access your data.
+ The right to request corrections or deletions of your data.
+ The right to opt out of certain types of data processing.
+ 6. Changes to This Policy
+ We may update this Privacy Policy from time to time. The revised policy will be effective as of the updated date and available on Craw_Web.
+
+ 7. Contact Us
+ If you have any questions about this Privacy Policy, you can contact us at: [email protected]
README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  title: Craw Web
- emoji: 🌍
- colorFrom: blue
+ emoji: 🔥
+ colorFrom: pink
  colorTo: yellow
  sdk: streamlit
- sdk_version: 1.42.0
+ sdk_version: 5.16.0
  app_file: app.py
  pinned: false
  license: mit
- short_description: an application for web scraping
+ short_description: 'a application to craw the web '
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Terms of Service.txt ADDED
@@ -0,0 +1,37 @@
+ Effective Date: 1/28/2025
+
+ Welcome to Craw_Web. By accessing or using our platform, you agree to these Terms of Service ("Terms"). If you do not agree, please refrain from using Craw_Web.
+
+ 1. Use of Craw_Web
+ You agree to use Craw_Web only for lawful purposes and in compliance with these Terms.
+ You must not attempt to harm, disrupt, or misuse Craw_Web, including engaging in unauthorized access or data scraping.
+ 2. User Accounts
+ You may be required to create an account to access certain features of Craw_Web.
+ You are responsible for safeguarding your account credentials and ensuring their confidentiality.
+ 3. Intellectual Property
+ All content, trademarks, and materials on Craw_Web are owned by us or our licensors.
+ You may not reproduce, distribute, or use any content without prior written permission.
+ 4. Prohibited Activities
+ You agree not to:
+
+ Engage in activities that violate any laws or regulations.
+ Upload or distribute harmful or malicious content.
+ Use Craw_Web to spam, harass, or harm others.
+ 5. Limitation of Liability
+ We are not liable for any damages or losses arising from:
+
+ Your use of Craw_Web.
+ Technical issues or interruptions in service.
+ Unauthorized access to your account or data.
+ 6. Termination
+ We may suspend or terminate your access to Craw_Web at any time for any reason, including violations of these Terms.
+
+ 7. Changes to Terms
+ We may update these Terms from time to time. Continued use of Craw_Web after changes are made constitutes acceptance of the revised Terms.
+
+ 8. Governing Law
+ These Terms are governed by the laws of [Your Jurisdiction], without regard to conflict of law principles.
+
+ 9. Contact Us
+ If you have any questions about these Terms, you can contact us at: [email protected]
+
app.py ADDED
@@ -0,0 +1,674 @@
+ import os
+ import subprocess
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
+ import asyncio
+ import logging
+ from urllib.parse import urlparse
+ import re
+ from pathlib import Path
+ from io import BytesIO
+ import random
+ import streamlit as st
+ from bs4 import BeautifulSoup
+ from PyPDF2 import PdfReader
+ import zipfile
+ import tempfile
+ import mimetypes
+ import requests
+
+ # -------------------- spaCy Model Setup --------------------
+ import spacy
+ import spacy.cli
+ from spacy.language import Language
+
+ # Register a dummy factory under the exact key that the transformer model expects.
+ @Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
+ def dummy_roberta_transformer(nlp, name):
+     # This dummy component simply passes the Doc through.
+     def dummy(doc):
+         return doc
+     return dummy
+
+ # Try to load the transformer-based model.
+ @st.cache_resource
+ def load_nlp_model():
+     try:
+         nlp_model = spacy.load("en_core_web_trf")
+     except OSError:
+         st.write("Model en_core_web_trf not found. Downloading it now...")
+         spacy.cli.download("en_core_web_trf")
+         try:
+             nlp_model = spacy.load("en_core_web_trf")
+         except Exception as e:
+             st.error(f"Error loading model after download: {e}")
+             st.write("Falling back to en_core_web_sm...")
+             spacy.cli.download("en_core_web_sm")
+             nlp_model = spacy.load("en_core_web_sm")
+     return nlp_model
+
+ nlp_model = load_nlp_model()
+
+ # Also load SentenceTransformer for semantic re-ranking.
+ from sentence_transformers import SentenceTransformer, util
+ @st.cache_resource
+ def load_semantic_model():
+     return SentenceTransformer('all-MiniLM-L6-v2')
+
+ semantic_model = load_semantic_model()
+
+ # -------------------- Transformers Summarization Setup --------------------
+ from transformers import pipeline
+ @st.cache_resource
+ def load_summarizer():
+     return pipeline("summarization")
+
+ summarizer = load_summarizer()
+
+ def summarize_pdf_url(pdf_url):
+     """
+     Downloads a PDF from the given URL, extracts text using PyPDF2,
+     and returns a summary of (up to) the first 3000 characters.
+     """
+     try:
+         with st.spinner("Downloading and processing PDF..."):
+             response = requests.get(pdf_url, stream=True)
+             temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+             with open(temp_pdf.name, "wb") as f:
+                 f.write(response.content)
+             reader = PdfReader(temp_pdf.name)
+             text = " ".join([page.extract_text() or "" for page in reader.pages])
+             os.remove(temp_pdf.name)
+             limited_text = text[:3000]  # Limit text for summarization
+             summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
+             return summary[0]["summary_text"]
+     except Exception as e:
+         return f"Error summarizing PDF: {e}"
+
+ # -------------------- Google API Setup --------------------
+ GOOGLE_OAUTH_CONFIG = {
+     "web": {
+         "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
+         "project_id": "huggingface-449214",
+         "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+         "token_uri": "https://oauth2.googleapis.com/token",
+         "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+         "client_secret": "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",
+         "redirect_uris": ["https://euler314-craw-web.hf.space/"]
+     }
+ }
+
+ import google_auth_oauthlib.flow
+ import googleapiclient.discovery
+ import google.auth.transport.requests
+
+ def get_google_auth_url():
+     client_config = GOOGLE_OAUTH_CONFIG["web"]
+     flow = google_auth_oauthlib.flow.Flow.from_client_config(
+         {"web": client_config},
+         scopes=["https://www.googleapis.com/auth/drive.file"]
+     )
+     flow.redirect_uri = client_config["redirect_uris"][0]
+     authorization_url, _ = flow.authorization_url(
+         access_type="offline",
+         include_granted_scopes="true",
+         prompt="consent"
+     )
+     return authorization_url
+
+ def exchange_code_for_credentials(auth_code):
+     if not auth_code.strip():
+         return None, "No code provided."
+     try:
+         client_config = GOOGLE_OAUTH_CONFIG["web"]
+         flow = google_auth_oauthlib.flow.Flow.from_client_config(
+             {"web": client_config},
+             scopes=["https://www.googleapis.com/auth/drive.file"]
+         )
+         flow.redirect_uri = client_config["redirect_uris"][0]
+         flow.fetch_token(code=auth_code.strip())
+         creds = flow.credentials
+         if not creds or not creds.valid:
+             return None, "Could not validate credentials. Check code and try again."
+         return creds, "Google Sign-In successful!"
+     except Exception as e:
+         return None, f"Error during token exchange: {e}"
+ # -------------------- Playwright Setup --------------------
+ def install_playwright_dependencies():
+     os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
+     os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
+     try:
+         subprocess.run(['apt-get', 'update', '-y'], check=True)
+         packages = [
+             'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
+             'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
+             'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
+         ]
+         subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
+         os.makedirs('/usr/lib/playwright', exist_ok=True)
+         symlinks = {
+             'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
+             'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
+             'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
+             'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
+             'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
+             'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
+             'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
+             'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
+             'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
+             'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
+         }
+         for link_name, target in symlinks.items():
+             link_path = os.path.join('/usr/lib/playwright', link_name)
+             if not os.path.exists(link_path):
+                 os.symlink(target, link_path)
+         subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
+         browser_path = os.path.expanduser("~/.cache/ms-playwright")
+         os.makedirs(browser_path, exist_ok=True)
+         subprocess.run(['chmod', '-R', '755', browser_path], check=True)
+     except subprocess.CalledProcessError as e:
+         st.error(f"Error installing dependencies: {e}")
+     except Exception as e:
+         st.error(f"Error: {e}")
+
+ # Initialize Playwright dependencies
+ install_playwright_dependencies()
+
+ # -------------------- Logging Setup --------------------
+ logging.basicConfig(
+     filename='advanced_download_log.txt',
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger()
+
+ # -------------------- Shared Utils --------------------
+ USER_AGENTS = [
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
+     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
+ ]
+
+ def get_random_user_agent():
+     return random.choice(USER_AGENTS)
+
+ def sizeof_fmt(num, suffix='B'):
+     for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+         if abs(num) < 1024.0:
+             return f"{num:3.1f}{unit}{suffix}"
+         num /= 1024.0
+     return f"{num:.1f}Y{suffix}"
+
+ # ---------- Human-like Interactions -------------
+ async def human_like_scroll(page):
+     scroll_height = await page.evaluate('document.body.scrollHeight')
+     viewport_height = await page.evaluate('window.innerHeight')
+     current_scroll = 0
+     while current_scroll < scroll_height:
+         await page.evaluate(f'window.scrollTo(0, {current_scroll})')
+         await asyncio.sleep(random.uniform(0.5, 1.5))
+         current_scroll += viewport_height * random.uniform(0.5, 1.5)
+         scroll_height = await page.evaluate('document.body.scrollHeight')
+
+ async def human_like_interactions(page):
+     await page.mouse.move(random.randint(0, 1000), random.randint(0, 1000))
+     await asyncio.sleep(random.uniform(0.5, 1.5))
+     await page.mouse.click(random.randint(0, 1000), random.randint(0, 1000))
+     await asyncio.sleep(random.uniform(0.5, 1.5))
+     await page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
+     await asyncio.sleep(random.uniform(0.5, 1.5))
+
+ # ---------- NLP Helpers -------------
+ def nlp_preprocess(query: str) -> str:
+     doc = nlp_model(query)
+     tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
+     processed = " ".join(tokens)
+     return processed if processed.strip() else query
+
+ def nlp_extract_entities(text: str):
+     doc = nlp_model(text)
+     return [(ent.text, ent.label_) for ent in doc.ents]
+
+ # ---------- AI-enhanced Query Preprocessing -------------
+ def ai_preprocess_query(query: str) -> str:
+     return query
+ # ---------- Download Manager -------------
+ class DownloadManager:
+     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
+         self.use_proxy = use_proxy
+         self.proxy = proxy
+         self.query = query
+         self.num_results = num_results
+         self.playwright = None
+         self.browser = None
+         self.context = None
+         self.page = None
+
+     async def __aenter__(self):
+         self.playwright = await async_playwright().start()
+         opts = {"headless": True}
+         if self.use_proxy and self.proxy:
+             opts["proxy"] = {"server": self.proxy}
+         self.browser = await self.playwright.chromium.launch(**opts)
+         self.context = await self.browser.new_context(user_agent=get_random_user_agent())
+         self.page = await self.context.new_page()
+         await self.page.set_extra_http_headers({
+             'Accept-Language': 'en-US,en;q=0.9',
+             'Accept-Encoding': 'gzip, deflate, br',
+             'Referer': 'https://www.bing.com/'
+         })
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         if self.browser:
+             await self.browser.close()
+         if self.playwright:
+             await self.playwright.stop()
+
+     async def get_file_size(self, url):
+         try:
+             response = await self.page.request.head(url)
+             length = response.headers.get('Content-Length', None)
+             if length:
+                 return sizeof_fmt(int(length))
+             else:
+                 return "Unknown Size"
+         except Exception:
+             return "Unknown Size"
+
+     async def get_pdf_metadata(self, url):
+         try:
+             resp = await self.page.request.get(url, timeout=15000)
+             if resp.ok:
+                 content = await resp.body()
+                 pdf = BytesIO(content)
+                 reader = PdfReader(pdf)
+                 return {
+                     'Title': reader.metadata.title if reader.metadata.title else 'N/A',
+                     'Author': reader.metadata.author if reader.metadata.author else 'N/A',
+                     'Pages': len(reader.pages),
+                 }
+             else:
+                 return {}
+         except Exception:
+             return {}
+
+     async def search_bing(self):
+         if not self.query:
+             return [], []
+         query = self.query
+         if "filetype:pdf" not in query.lower():
+             query += " filetype:pdf"
+         if "site:" not in query.lower():
+             query += " site:edu OR site:arxiv.org OR site:openstax.org"
+         query = ai_preprocess_query(query)
+         query_processed = nlp_preprocess(query)
+         logger.info(f"BING SEARCH NLP: Original='{query}' -> Processed='{query_processed}'")
+
+         bing_url = f"https://www.bing.com/search?q={query_processed.replace(' ', '+')}&count={self.num_results}"
+         try:
+             await self.page.goto(bing_url, timeout=30000)
+             await self.page.wait_for_selector('li.b_algo', timeout=30000)
+             await human_like_scroll(self.page)
+             html = await self.page.content()
+             soup = BeautifulSoup(html, 'html.parser')
+             raw_results = soup.find_all('li', class_='b_algo')
+             url_list = []
+             info_list = []
+             snippets = []
+
+             for r in raw_results:
+                 link_tag = r.find('a')
+                 snippet_tag = r.find('p')
+                 snippet_text = snippet_tag.get_text(strip=True) if snippet_tag else ""
+                 snippets.append(snippet_text)
+                 entities = nlp_extract_entities(snippet_text)
+
+                 if link_tag and 'href' in link_tag.attrs:
+                     link_url = link_tag['href']
+                     url_list.append(link_url)
+                     info_list.append({
+                         'url': link_url,
+                         'snippet': snippet_text,
+                         'entities': entities
+                     })
+                     if len(url_list) >= self.num_results:
+                         break
+
+             query_emb = semantic_model.encode(query, convert_to_tensor=True)
+             snippet_embs = semantic_model.encode(snippets, convert_to_tensor=True)
+             scores = util.cos_sim(query_emb, snippet_embs)[0]
+             sorted_indices = scores.argsort(descending=True).cpu().numpy().tolist()
+             sorted_url_list = [url_list[i] for i in sorted_indices]
+             sorted_info_list = [info_list[i] for i in sorted_indices]
+
+             return sorted_url_list, sorted_info_list
+         except PlaywrightTimeoutError:
+             logger.error("Bing search timed out.")
+             return [], []
+         except Exception as e:
+             logger.error(f"Bing search error: {e}")
+             return [], []
+
+     async def extract_downloadable_files(self, url, custom_ext_list):
+         found_files = []
+         try:
+             await self.page.goto(url, timeout=30000)
+             await self.page.wait_for_load_state('networkidle', timeout=30000)
+             await human_like_interactions(self.page)
+             content = await self.page.content()
+             soup = BeautifulSoup(content, 'html.parser')
+
+             default_exts = [
+                 '.pdf', '.docx', '.zip', '.rar', '.exe', '.mp3',
+                 '.mp4', '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif'
+             ]
+             all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
+
+             anchors = soup.find_all('a', href=True)
+             for a in anchors:
+                 href = a['href'].strip()
+                 if any(href.lower().endswith(ext) for ext in all_exts):
+                     if href.startswith('http'):
+                         file_url = href
+                     elif href.startswith('/'):
+                         parsed = urlparse(url)
+                         file_url = f"{parsed.scheme}://{parsed.netloc}{href}"
+                     else:
+                         continue
+
+                     size_str = await self.get_file_size(file_url)
+                     meta = {}
+                     if file_url.lower().endswith('.pdf'):
+                         meta = await self.get_pdf_metadata(file_url)
+
+                     found_files.append({
+                         'url': file_url,
+                         'filename': os.path.basename(file_url.split('?')[0]),
+                         'size': size_str,
+                         'metadata': meta
+                     })
+
+                 elif ("drive.google.com" in href) or ("drive.com" in href):
+                     file_id = None
+                     for pattern in [
+                         r'/file/d/([^/]+)/',
+                         r'open\?id=([^&]+)',
+                         r'id=([^&]+)'
+                     ]:
+                         match = re.search(pattern, href)
+                         if match:
+                             file_id = match.group(1)
+                             break
+
+                     if file_id:
+                         direct = f"https://drive.google.com/uc?export=download&id={file_id}"
+                         filename = f"drive_file_{file_id}"
+                         try:
+                             resp = await self.page.request.head(direct, timeout=15000)
+                             cd = resp.headers.get("Content-Disposition", "")
+                             if cd:
+                                 mt = re.search(r'filename\*?="?([^";]+)', cd)
+                                 if mt:
+                                     filename = mt.group(1).strip('"').strip()
+                             else:
+                                 ctype = resp.headers.get("Content-Type", "")
+                                 ext_guess = mimetypes.guess_extension(ctype) or ""
+                                 filename = f"drive_file_{file_id}{ext_guess}"
+                         except Exception:
+                             pass
+
+                         size_str = await self.get_file_size(direct)
+                         found_files.append({
+                             'url': direct,
+                             'filename': filename,
+                             'size': size_str,
+                             'metadata': {}
+                         })
+
+             return found_files
+         except PlaywrightTimeoutError:
+             logger.error(f"Timeout extracting from {url}")
+             return []
+         except Exception as e:
+             logger.error(f"Error extracting from {url}: {e}")
+             return []
+     async def download_file(self, file_info, save_dir, referer):
+         file_url = file_info['url']
+         fname = file_info['filename']
+         path = os.path.join(save_dir, fname)
+         base, ext = os.path.splitext(fname)
+         i = 1
+         while os.path.exists(path):
+             path = os.path.join(save_dir, f"{base}({i}){ext}")
+             i += 1
+
+         os.makedirs(save_dir, exist_ok=True)
+         try:
+             if file_url.lower().endswith(".pdf") and "drive.google.com" not in file_url.lower():
+                 response = requests.get(file_url, stream=True)
+                 with open(path, "wb") as f:
+                     f.write(response.content)
+                 logger.info(f"Directly downloaded PDF: {path}")
+                 return path
+
+             if "drive.google.com" in file_url.lower():
+                 import gdown
+                 try:
+                     result = gdown.download(file_url, output=path, quiet=False, fuzzy=True)
+                     if result is None:
+                         logger.error(f"gdown failed to download: {file_url}")
+                         return None
+                     current_ext = os.path.splitext(path)[1].lower()
+                     allowed_exts = {'.pdf', '.jpg', '.jpeg', '.png', '.docx', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv'}
+                     if current_ext not in allowed_exts:
+                         try:
+                             r = requests.head(file_url, allow_redirects=True, timeout=15)
+                             ctype = r.headers.get("Content-Type", "")
+                             guessed_ext = mimetypes.guess_extension(ctype) or ".pdf"
+                         except Exception as e:
+                             logger.error(f"Error in HEAD request for extension: {e}")
+                             guessed_ext = ".pdf"
+                         new_path = os.path.splitext(path)[0] + guessed_ext
+                         os.rename(path, new_path)
+                         path = new_path
+                     logger.info(f"Downloaded using gdown: {path}")
+                     return path
+                 except Exception as e:
+                     logger.error(f"Error downloading using gdown: {e}")
+                     return None
+
+             headers = {
+                 'Accept-Language': 'en-US,en;q=0.9',
+                 'Accept-Encoding': 'gzip, deflate, br',
+                 'Referer': referer
+             }
+             await human_like_interactions(self.page)
+             resp = await self.page.request.get(file_url, headers=headers, timeout=30000)
+             if resp.status == 403:
+                 logger.error(f"403 Forbidden: {file_url}")
+                 return None
+             if not resp.ok:
+                 logger.error(f"Failed to download {file_url}: Status {resp.status}")
+                 return None
+             data = await resp.body()
+             with open(path, 'wb') as f:
+                 f.write(data)
+             logger.info(f"Downloaded: {path}")
+             return path
+         except PlaywrightTimeoutError:
+             logger.error(f"Timeout downloading {file_url}")
+             return None
+         except Exception as e:
+             logger.error(f"Error downloading {file_url}: {e}")
+             return None
+
+     async def deep_search(self, url, custom_ext_list, sublink_limit=2000, max_concurrency=500):
+         progress_text = st.empty()
+         progress_bar = st.progress(0)
+
+         progress_text.text("Analyzing main page...")
+         all_files = []
+         main_files = await self.extract_downloadable_files(url, custom_ext_list)
+         all_files.extend(main_files)
+
+         progress_text.text("Getting sublinks...")
+         sublinks = await self.get_sublinks(url, sublink_limit)
+         total_links = len(sublinks)
+
+         progress_text.text(f"Processing {total_links} sublinks...")
+         sem = asyncio.Semaphore(max_concurrency)
+
+         async def analyze_one_sublink(link, idx):
+             async with sem:
+                 progress_text.text(f"Processing link {idx}/{total_links}: {link}")
+                 progress_bar.progress(idx/total_links)
+                 return await self.extract_downloadable_files(link, custom_ext_list)
+
+         tasks = [analyze_one_sublink(link, i) for i, link in enumerate(sublinks, 1)]
+         sub_results = await asyncio.gather(*tasks)
+
+         for sr in sub_results:
+             all_files.extend(sr)
+
+         unique_map = {f['url']: f for f in all_files}
+         combined = list(unique_map.values())
+
+         progress_text.text(f"Found {len(combined)} unique files.")
+         progress_bar.progress(1.0)
+         return combined
+
+     async def get_sublinks(self, url, limit=20000):
+         try:
+             await self.page.goto(url, timeout=30000)
+             content = await self.page.content()
+             soup = BeautifulSoup(content, "html.parser")
+             links = []
+             for a in soup.find_all('a', href=True):
+                 href = a['href'].strip()
+                 if href.startswith('http'):
+                     links.append(href)
+                 elif href.startswith('/'):
+                     parsed = urlparse(url)
+                     links.append(f"{parsed.scheme}://{parsed.netloc}{href}")
+             return list(set(links))[:limit]
+         except Exception as e:
+             logger.error(f"Error getting sublinks: {e}")
+             return []
+
+ def main():
+     st.set_page_config(page_title="Advanced File Downloader", layout="wide")
+
+     if 'session_state' not in st.session_state:
+         st.session_state.session_state = {
+             'discovered_files': [],
+             'current_url': None,
+             'download_manager': None,
+             'google_creds': None
+         }
+
+     st.title("Advanced File Downloader")
+
+     mode = st.sidebar.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
+
+     with st.sidebar.expander("Advanced Options"):
+         custom_extensions = st.text_input(
+             "Custom File Extensions",
+             placeholder=".csv, .txt, .epub"
+         )
+         max_concurrency = st.slider(
+             "Max Concurrency",
+             min_value=1,
+             max_value=1000,
+             value=200
+         )
+         use_proxy = st.checkbox("Use Proxy")
+         proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
+
+     # Google OAuth Section
+     with st.expander("Google Drive Integration"):
+         if st.button("Start Google Sign-In"):
+             auth_url = get_google_auth_url()
+             st.markdown(f"[Click here to authorize]({auth_url})")
+
+         auth_code = st.text_input("Enter authorization code")
+         if st.button("Complete Sign-In") and auth_code:
+             creds, msg = exchange_code_for_credentials(auth_code)
+             st.session_state.session_state['google_creds'] = creds
+             st.write(msg)
+
+     if mode == "Manual URL":
+         manual_url_mode()
+     elif mode == "Bing Search":
+         bing_search_mode()
+     else:
+         pdf_summarizer_mode()
+
+ def manual_url_mode():
+     st.header("Manual URL Mode")
+
+     url = st.text_input("Enter URL", placeholder="https://example.com")
+
+     if st.button("Deep Search"):
+         if url:
+             async def run_deep_search():
+                 async with DownloadManager(
+                     use_proxy=st.session_state.get('use_proxy', False),
+                     proxy=st.session_state.get('proxy', None)
+                 ) as dm:
+                     files = await dm.deep_search(
+                         url=url,
+                         custom_ext_list=st.session_state.get('custom_extensions', '').split(','),
+                         max_concurrency=st.session_state.get('max_concurrency', 200)
+                     )
+                     st.session_state.session_state['discovered_files'] = files
+                     st.session_state.session_state['current_url'] = url
+
+                     if files:
+                         st.write(f"Found {len(files)} files:")
+                         for f in files:
+                             st.write(f"- {f['filename']} ({f['size']})")
+                     else:
+                         st.warning("No files found.")
+
+             asyncio.run(run_deep_search())
+
+ def bing_search_mode():
+     st.header("Bing Search Mode")
+
+     query = st.text_input("Enter search query")
+     num_results = st.slider("Number of results", 1, 50, 5)
+
+     if st.button("Search"):
+         if query:
+             async def run_search():
+                 async with DownloadManager(
+                     use_proxy=st.session_state.get('use_proxy', False),
+                     proxy=st.session_state.get('proxy', None),
+                     query=query,
+                     num_results=num_results
+                 ) as dm:
+                     urls, info = await dm.search_bing()
+                     if urls:
+                         st.write("Search Results:")
+                         for i, (url, info) in enumerate(zip(urls, info), 1):
+                             st.write(f"{i}. {url}")
+                             st.write(f" Snippet: {info['snippet']}")
+                     else:
+                         st.warning("No results found.")
+
+             asyncio.run(run_search())
+
+ def pdf_summarizer_mode():
+     st.header("PDF Summarizer")
+
+     pdf_url = st.text_input("Enter PDF URL")
+
+     if st.button("Summarize"):
+         if pdf_url:
+             summary = summarize_pdf_url(pdf_url)
+             st.write("Summary:")
+             st.write(summary)
+
+ if __name__ == "__main__":
+     main()
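
Taken on its own, app.py wires the whole pipeline together at import time (spaCy model load, Playwright setup, Streamlit UI). A minimal sketch of driving the DownloadManager from a plain script instead of the Streamlit UI, assuming app.py is importable, its module-level setup succeeds in your environment, and Playwright's Chromium is available; the query and save_dir values below are illustrative only:

import asyncio
from app import DownloadManager

async def fetch_first_result(query: str, save_dir: str = "./downloads"):
    # Open a headless browser session, search Bing, and inspect the top hit.
    async with DownloadManager(query=query, num_results=3) as dm:
        urls, _info = await dm.search_bing()
        if not urls:
            return None
        # Collect downloadable links from the first result page.
        files = await dm.extract_downloadable_files(urls[0], custom_ext_list=[])
        if not files:
            return None
        # Download the first discovered file, using the result page as referer.
        return await dm.download_file(files[0], save_dir, referer=urls[0])

if __name__ == "__main__":
    print(asyncio.run(fetch_first_result("open access linear algebra textbook")))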
app_hf.py ADDED
@@ -0,0 +1,5 @@
+ import gradio as gr
+ from app import build_gradio_app
+
+ app = build_gradio_app()
+ app.launch(server_name="0.0.0.0")
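
Note that app_hf.py imports build_gradio_app from app, but the app.py uploaded in this commit defines a Streamlit main() and no such factory, so this entry point would raise an ImportError as committed. A minimal sketch of the kind of factory it appears to expect, assuming it were added to app.py and built on Gradio Blocks; the UI below only wraps the existing summarize_pdf_url helper and is illustrative, not part of the commit:

import gradio as gr

def build_gradio_app():
    # Hypothetical factory inside app.py: reuses the summarize_pdf_url helper
    # defined earlier in that module and returns a Blocks app for app_hf.py.
    with gr.Blocks(title="Craw_Web") as demo:
        pdf_url = gr.Textbox(label="PDF URL", placeholder="https://example.com/paper.pdf")
        summary = gr.Textbox(label="Summary", lines=8)
        gr.Button("Summarize").click(summarize_pdf_url, inputs=pdf_url, outputs=summary)
    return demo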
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ stablepy==0.6.0
+ gradio>=3.0.0
+ playwright>=1.35.0
+ spacy>=3.5.0
+ google-auth-oauthlib>=0.4.6
+ google-auth-httplib2>=0.1.0
+ google-api-python-client>=2.70.0
+ PyPDF2>=3.0.0
+ beautifulsoup4>=4.11.2
+ gdown
+ sentence-transformers
+ spacy-transformers
+ transformers