Upload 9 files

- .dockerignore +19 -0
- Dockerfile +92 -0
- Privacy Policy.txt +34 -0
- README.md +5 -5
- Terms of Service.txt +37 -0
- app.py +674 -0
- app_hf.py +5 -0
- requirements.txt +13 -0
.dockerignore
ADDED
@@ -0,0 +1,19 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.pytest_cache
+.env
+.venv
Dockerfile
ADDED
@@ -0,0 +1,92 @@
+FROM ubuntu:22.04
+
+USER root
+
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PIP_ROOT_USER_ACTION=ignore
+ENV HOME=/home/user
+ENV PLAYWRIGHT_BROWSERS_PATH=${HOME}/.cache/ms-playwright
+ENV LD_LIBRARY_PATH=/usr/lib/playwright:/usr/lib/x86_64-linux-gnu
+ENV GRADIO_NODE_PORT=disabled
+
+RUN useradd -m -d /home/user user && \
+    mkdir -p ${HOME}/.cache/ms-playwright && \
+    mkdir -p /usr/lib/playwright && \
+    chown -R user:user ${HOME}/.cache && \
+    chmod -R 755 ${HOME}/.cache
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        python3.11 \
+        python3-pip \
+        python3.11-dev \
+        wget \
+        unzip \
+        ca-certificates \
+        libnss3 \
+        libnss3-tools \
+        libnspr4 \
+        libatk1.0-0 \
+        libatk-bridge2.0-0 \
+        libatspi2.0-0 \
+        libcups2 \
+        libxcomposite1 \
+        libxdamage1 \
+        libxrandr2 \
+        libxkbcommon0 \
+        libx11-xcb1 \
+        libxcursor1 \
+        libxi6 \
+        libxss1 \
+        libxtst6 \
+        libasound2 \
+        libx11-6 \
+        libxcb1 \
+        libxext6 \
+        libxfixes3 \
+        libxrender1 \
+        libdbus-1-3 \
+        libdrm2 \
+        libpango-1.0-0 \
+        fonts-liberation \
+        fonts-noto-color-emoji \
+        gcc && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/lib/x86_64-linux-gnu/libnss3.so /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libnssutil3.so /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libsmime3.so /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libnspr4.so /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libatk-1.0.so.0 /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0 /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libcups.so.2 /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libatspi.so.0 /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libXcomposite.so.1 /usr/lib/playwright/ && \
+    ln -s /usr/lib/x86_64-linux-gnu/libXdamage.so.1 /usr/lib/playwright/
+
+WORKDIR /app
+
+COPY requirements.txt ./
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip3 install playwright==1.30.0
+
+RUN cd ${HOME}/.cache/ms-playwright && \
+    wget -q https://playwright.azureedge.net/builds/chromium/1045/chromium-linux.zip && \
+    unzip chromium-linux.zip && \
+    rm chromium-linux.zip && \
+    chmod -R 755 ${HOME}/.cache/ms-playwright
+
+COPY . .
+
+RUN chown -R user:user /app && \
+    chmod -R 755 /app && \
+    chmod -R 755 ${HOME}/.cache/ms-playwright && \
+    chmod -R 755 /usr/lib/playwright
+
+USER user
+
+CMD ["python3", "app.py"]
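Editor's note: because this Dockerfile fetches Chromium with wget/unzip instead of running `playwright install`, a small startup sanity check can catch a broken PLAYWRIGHT_BROWSERS_PATH before app.py tries to launch the browser. The sketch below is not part of this commit; the script name check_chromium.py and the chrome-linux/chrome layout (how the chromium-linux.zip build is assumed to unpack) are assumptions.

# check_chromium.py - hypothetical startup check, not included in this commit.
# Verifies the manually unpacked Chromium sits under PLAYWRIGHT_BROWSERS_PATH.
import os
from pathlib import Path

browsers_root = Path(os.environ.get("PLAYWRIGHT_BROWSERS_PATH",
                                    "~/.cache/ms-playwright")).expanduser()
# Assumption: chromium-linux.zip unpacks to a chrome-linux/ directory with the binary inside.
candidates = sorted(browsers_root.glob("**/chrome-linux/chrome"))
if not candidates:
    raise SystemExit(f"No Chromium binary found under {browsers_root}; "
                     "re-check the wget/unzip step in the Dockerfile.")
print(f"Chromium binary: {candidates[0]}")

Something like this could run as an extra CMD step or a RUN-time check during the image build.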
Privacy Policy.txt
ADDED
@@ -0,0 +1,34 @@
+Effective Date: 1/28/2025
+
+Welcome to Craw_Web ("we," "our," or "us"). Your privacy is important to us. This Privacy Policy explains how we collect, use, disclose, and safeguard your information when you use our platform.
+
+1. Information We Collect
+Personal Information: Name, email address, phone number, or any other personal data you provide while using Craw_Web.
+Usage Data: IP address, browser type, operating system, device information, and details about your interactions with Craw_Web, such as the pages visited or actions performed.
+2. How We Use Your Information
+We use the information we collect for the following purposes:
+
+To operate and maintain Craw_Web.
+To personalize your experience and deliver content and features based on your preferences.
+To communicate with you, including responding to inquiries or providing updates about Craw_Web.
+To improve Craw_Web by analyzing usage trends and gathering feedback.
+3. Sharing Your Information
+We do not sell or rent your personal information to third parties. We may share information in the following circumstances:
+
+With service providers who assist in operating Craw_Web (e.g., hosting, analytics).
+To comply with legal obligations, such as responding to a court order or regulatory request.
+To protect the rights, safety, and security of Craw_Web and its users.
+4. Data Security
+We implement reasonable security measures to protect your data. However, no method of transmission over the internet is entirely secure. Therefore, we cannot guarantee absolute security.
+
+5. Your Rights
+You may have rights under applicable data protection laws, including:
+
+The right to access your data.
+The right to request corrections or deletions of your data.
+The right to opt out of certain types of data processing.
+6. Changes to This Policy
+We may update this Privacy Policy from time to time. The revised policy will be effective as of the updated date and available on Craw_Web.
+
+7. Contact Us
+If you have any questions about this Privacy Policy, you can contact us at: [email protected]
README.md
CHANGED
@@ -1,14 +1,14 @@
 ---
 title: Craw Web
-emoji:
-colorFrom:
+emoji: 🔥
+colorFrom: pink
 colorTo: yellow
 sdk: streamlit
-sdk_version:
+sdk_version: 5.16.0
 app_file: app.py
 pinned: false
 license: mit
-short_description:
+short_description: 'a application to craw the web '
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Terms of Service.txt
ADDED
@@ -0,0 +1,37 @@
+Effective Date: 1/28/2025
+
+Welcome to Craw_Web. By accessing or using our platform, you agree to these Terms of Service ("Terms"). If you do not agree, please refrain from using Craw_Web.
+
+1. Use of Craw_Web
+You agree to use Craw_Web only for lawful purposes and in compliance with these Terms.
+You must not attempt to harm, disrupt, or misuse Craw_Web, including engaging in unauthorized access or data scraping.
+2. User Accounts
+You may be required to create an account to access certain features of Craw_Web.
+You are responsible for safeguarding your account credentials and ensuring their confidentiality.
+3. Intellectual Property
+All content, trademarks, and materials on Craw_Web are owned by us or our licensors.
+You may not reproduce, distribute, or use any content without prior written permission.
+4. Prohibited Activities
+You agree not to:
+
+Engage in activities that violate any laws or regulations.
+Upload or distribute harmful or malicious content.
+Use Craw_Web to spam, harass, or harm others.
+5. Limitation of Liability
+We are not liable for any damages or losses arising from:
+
+Your use of Craw_Web.
+Technical issues or interruptions in service.
+Unauthorized access to your account or data.
+6. Termination
+We may suspend or terminate your access to Craw_Web at any time for any reason, including violations of these Terms.
+
+7. Changes to Terms
+We may update these Terms from time to time. Continued use of Craw_Web after changes are made constitutes acceptance of the revised Terms.
+
+8. Governing Law
+These Terms are governed by the laws of [Your Jurisdiction], without regard to conflict of law principles.
+
+9. Contact Us
+If you have any questions about these Terms, you can contact us at: [email protected]
+
app.py
ADDED
@@ -0,0 +1,674 @@
+import os
+import subprocess
+from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
+import asyncio
+import logging
+from urllib.parse import urlparse
+import re
+from pathlib import Path
+from io import BytesIO
+import random
+import streamlit as st
+from bs4 import BeautifulSoup
+from PyPDF2 import PdfReader
+import zipfile
+import tempfile
+import mimetypes
+import requests
+
+# -------------------- spaCy Model Setup --------------------
+import spacy
+import spacy.cli
+from spacy.language import Language
+
+# Register a dummy factory under the exact key that the transformer model expects.
+@Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
+def dummy_roberta_transformer(nlp, name):
+    # This dummy component simply passes the Doc through.
+    def dummy(doc):
+        return doc
+    return dummy
+
+# Try to load the transformer-based model.
+@st.cache_resource
+def load_nlp_model():
+    try:
+        nlp_model = spacy.load("en_core_web_trf")
+    except OSError:
+        st.write("Model en_core_web_trf not found. Downloading it now...")
+        spacy.cli.download("en_core_web_trf")
+        try:
+            nlp_model = spacy.load("en_core_web_trf")
+        except Exception as e:
+            st.error(f"Error loading model after download: {e}")
+            st.write("Falling back to en_core_web_sm...")
+            spacy.cli.download("en_core_web_sm")
+            nlp_model = spacy.load("en_core_web_sm")
+    return nlp_model
+
+nlp_model = load_nlp_model()
+
+# Also load SentenceTransformer for semantic re-ranking.
+from sentence_transformers import SentenceTransformer, util
+@st.cache_resource
+def load_semantic_model():
+    return SentenceTransformer('all-MiniLM-L6-v2')
+
+semantic_model = load_semantic_model()
+
+# -------------------- Transformers Summarization Setup --------------------
+from transformers import pipeline
+@st.cache_resource
+def load_summarizer():
+    return pipeline("summarization")
+
+summarizer = load_summarizer()
+
+def summarize_pdf_url(pdf_url):
+    """
+    Downloads a PDF from the given URL, extracts text using PyPDF2,
+    and returns a summary of (up to) the first 3000 characters.
+    """
+    try:
+        with st.spinner("Downloading and processing PDF..."):
+            response = requests.get(pdf_url, stream=True)
+            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+            with open(temp_pdf.name, "wb") as f:
+                f.write(response.content)
+            reader = PdfReader(temp_pdf.name)
+            text = " ".join([page.extract_text() or "" for page in reader.pages])
+            os.remove(temp_pdf.name)
+            limited_text = text[:3000]  # Limit text for summarization
+            summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
+            return summary[0]["summary_text"]
+    except Exception as e:
+        return f"Error summarizing PDF: {e}"
+
+# -------------------- Google API Setup --------------------
+GOOGLE_OAUTH_CONFIG = {
+    "web": {
+        "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
+        "project_id": "huggingface-449214",
+        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+        "token_uri": "https://oauth2.googleapis.com/token",
+        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+        "client_secret": "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",
+        "redirect_uris": ["https://euler314-craw-web.hf.space/"]
+    }
+}
+
+import google_auth_oauthlib.flow
+import googleapiclient.discovery
+import google.auth.transport.requests
+
+def get_google_auth_url():
+    client_config = GOOGLE_OAUTH_CONFIG["web"]
+    flow = google_auth_oauthlib.flow.Flow.from_client_config(
+        {"web": client_config},
+        scopes=["https://www.googleapis.com/auth/drive.file"]
+    )
+    flow.redirect_uri = client_config["redirect_uris"][0]
+    authorization_url, _ = flow.authorization_url(
+        access_type="offline",
+        include_granted_scopes="true",
+        prompt="consent"
+    )
+    return authorization_url
+
+def exchange_code_for_credentials(auth_code):
+    if not auth_code.strip():
+        return None, "No code provided."
+    try:
+        client_config = GOOGLE_OAUTH_CONFIG["web"]
+        flow = google_auth_oauthlib.flow.Flow.from_client_config(
+            {"web": client_config},
+            scopes=["https://www.googleapis.com/auth/drive.file"]
+        )
+        flow.redirect_uri = client_config["redirect_uris"][0]
+        flow.fetch_token(code=auth_code.strip())
+        creds = flow.credentials
+        if not creds or not creds.valid:
+            return None, "Could not validate credentials. Check code and try again."
+        return creds, "Google Sign-In successful!"
+    except Exception as e:
+        return None, f"Error during token exchange: {e}"
+# -------------------- Playwright Setup --------------------
+def install_playwright_dependencies():
+    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
+    os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
+    try:
+        subprocess.run(['apt-get', 'update', '-y'], check=True)
+        packages = [
+            'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
+            'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
+            'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
+        ]
+        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
+        os.makedirs('/usr/lib/playwright', exist_ok=True)
+        symlinks = {
+            'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
+            'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
+            'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
+            'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
+            'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
+            'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
+            'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
+            'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
+            'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
+            'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
+        }
+        for link_name, target in symlinks.items():
+            link_path = os.path.join('/usr/lib/playwright', link_name)
+            if not os.path.exists(link_path):
+                os.symlink(target, link_path)
+        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
+        browser_path = os.path.expanduser("~/.cache/ms-playwright")
+        os.makedirs(browser_path, exist_ok=True)
+        subprocess.run(['chmod', '-R', '755', browser_path], check=True)
+    except subprocess.CalledProcessError as e:
+        st.error(f"Error installing dependencies: {e}")
+    except Exception as e:
+        st.error(f"Error: {e}")
+
+# Initialize Playwright dependencies
+install_playwright_dependencies()
+
+# -------------------- Logging Setup --------------------
+logging.basicConfig(
+    filename='advanced_download_log.txt',
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger()
+
+# -------------------- Shared Utils --------------------
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
+]
+
+def get_random_user_agent():
+    return random.choice(USER_AGENTS)
+
+def sizeof_fmt(num, suffix='B'):
+    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+        if abs(num) < 1024.0:
+            return f"{num:3.1f}{unit}{suffix}"
+        num /= 1024.0
+    return f"{num:.1f}Y{suffix}"
+
+# ---------- Human-like Interactions -------------
+async def human_like_scroll(page):
+    scroll_height = await page.evaluate('document.body.scrollHeight')
+    viewport_height = await page.evaluate('window.innerHeight')
+    current_scroll = 0
+    while current_scroll < scroll_height:
+        await page.evaluate(f'window.scrollTo(0, {current_scroll})')
+        await asyncio.sleep(random.uniform(0.5, 1.5))
+        current_scroll += viewport_height * random.uniform(0.5, 1.5)
+        scroll_height = await page.evaluate('document.body.scrollHeight')
+
+async def human_like_interactions(page):
+    await page.mouse.move(random.randint(0, 1000), random.randint(0, 1000))
+    await asyncio.sleep(random.uniform(0.5, 1.5))
+    await page.mouse.click(random.randint(0, 1000), random.randint(0, 1000))
+    await asyncio.sleep(random.uniform(0.5, 1.5))
+    await page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
+    await asyncio.sleep(random.uniform(0.5, 1.5))
+
+# ---------- NLP Helpers -------------
+def nlp_preprocess(query: str) -> str:
+    doc = nlp_model(query)
+    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
+    processed = " ".join(tokens)
+    return processed if processed.strip() else query
+
+def nlp_extract_entities(text: str):
+    doc = nlp_model(text)
+    return [(ent.text, ent.label_) for ent in doc.ents]
+
+# ---------- AI-enhanced Query Preprocessing -------------
+def ai_preprocess_query(query: str) -> str:
+    return query
+# ---------- Download Manager -------------
+class DownloadManager:
+    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
+        self.use_proxy = use_proxy
+        self.proxy = proxy
+        self.query = query
+        self.num_results = num_results
+        self.playwright = None
+        self.browser = None
+        self.context = None
+        self.page = None
+
+    async def __aenter__(self):
+        self.playwright = await async_playwright().start()
+        opts = {"headless": True}
+        if self.use_proxy and self.proxy:
+            opts["proxy"] = {"server": self.proxy}
+        self.browser = await self.playwright.chromium.launch(**opts)
+        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
+        self.page = await self.context.new_page()
+        await self.page.set_extra_http_headers({
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Referer': 'https://www.bing.com/'
+        })
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self.browser:
+            await self.browser.close()
+        if self.playwright:
+            await self.playwright.stop()
+
+    async def get_file_size(self, url):
+        try:
+            response = await self.page.request.head(url)
+            length = response.headers.get('Content-Length', None)
+            if length:
+                return sizeof_fmt(int(length))
+            else:
+                return "Unknown Size"
+        except Exception:
+            return "Unknown Size"
+
+    async def get_pdf_metadata(self, url):
+        try:
+            resp = await self.page.request.get(url, timeout=15000)
+            if resp.ok:
+                content = await resp.body()
+                pdf = BytesIO(content)
+                reader = PdfReader(pdf)
+                return {
+                    'Title': reader.metadata.title if reader.metadata.title else 'N/A',
+                    'Author': reader.metadata.author if reader.metadata.author else 'N/A',
+                    'Pages': len(reader.pages),
+                }
+            else:
+                return {}
+        except Exception:
+            return {}
+
+    async def search_bing(self):
+        if not self.query:
+            return [], []
+        query = self.query
+        if "filetype:pdf" not in query.lower():
+            query += " filetype:pdf"
+        if "site:" not in query.lower():
+            query += " site:edu OR site:arxiv.org OR site:openstax.org"
+        query = ai_preprocess_query(query)
+        query_processed = nlp_preprocess(query)
+        logger.info(f"BING SEARCH NLP: Original='{query}' -> Processed='{query_processed}'")
+
+        bing_url = f"https://www.bing.com/search?q={query_processed.replace(' ', '+')}&count={self.num_results}"
+        try:
+            await self.page.goto(bing_url, timeout=30000)
+            await self.page.wait_for_selector('li.b_algo', timeout=30000)
+            await human_like_scroll(self.page)
+            html = await self.page.content()
+            soup = BeautifulSoup(html, 'html.parser')
+            raw_results = soup.find_all('li', class_='b_algo')
+            url_list = []
+            info_list = []
+            snippets = []
+
+            for r in raw_results:
+                link_tag = r.find('a')
+                snippet_tag = r.find('p')
+                snippet_text = snippet_tag.get_text(strip=True) if snippet_tag else ""
+                snippets.append(snippet_text)
+                entities = nlp_extract_entities(snippet_text)
+
+                if link_tag and 'href' in link_tag.attrs:
+                    link_url = link_tag['href']
+                    url_list.append(link_url)
+                    info_list.append({
+                        'url': link_url,
+                        'snippet': snippet_text,
+                        'entities': entities
+                    })
+                    if len(url_list) >= self.num_results:
+                        break
+
+            query_emb = semantic_model.encode(query, convert_to_tensor=True)
+            snippet_embs = semantic_model.encode(snippets, convert_to_tensor=True)
+            scores = util.cos_sim(query_emb, snippet_embs)[0]
+            sorted_indices = scores.argsort(descending=True).cpu().numpy().tolist()
+            sorted_url_list = [url_list[i] for i in sorted_indices]
+            sorted_info_list = [info_list[i] for i in sorted_indices]
+
+            return sorted_url_list, sorted_info_list
+        except PlaywrightTimeoutError:
+            logger.error("Bing search timed out.")
+            return [], []
+        except Exception as e:
+            logger.error(f"Bing search error: {e}")
+            return [], []
+
+    async def extract_downloadable_files(self, url, custom_ext_list):
+        found_files = []
+        try:
+            await self.page.goto(url, timeout=30000)
+            await self.page.wait_for_load_state('networkidle', timeout=30000)
+            await human_like_interactions(self.page)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            default_exts = [
+                '.pdf', '.docx', '.zip', '.rar', '.exe', '.mp3',
+                '.mp4', '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif'
+            ]
+            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
+
+            anchors = soup.find_all('a', href=True)
+            for a in anchors:
+                href = a['href'].strip()
+                if any(href.lower().endswith(ext) for ext in all_exts):
+                    if href.startswith('http'):
+                        file_url = href
+                    elif href.startswith('/'):
+                        parsed = urlparse(url)
+                        file_url = f"{parsed.scheme}://{parsed.netloc}{href}"
+                    else:
+                        continue
+
+                    size_str = await self.get_file_size(file_url)
+                    meta = {}
+                    if file_url.lower().endswith('.pdf'):
+                        meta = await self.get_pdf_metadata(file_url)
+
+                    found_files.append({
+                        'url': file_url,
+                        'filename': os.path.basename(file_url.split('?')[0]),
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
+                elif ("drive.google.com" in href) or ("drive.com" in href):
+                    file_id = None
+                    for pattern in [
+                        r'/file/d/([^/]+)/',
+                        r'open\?id=([^&]+)',
+                        r'id=([^&]+)'
+                    ]:
+                        match = re.search(pattern, href)
+                        if match:
+                            file_id = match.group(1)
+                            break
+
+                    if file_id:
+                        direct = f"https://drive.google.com/uc?export=download&id={file_id}"
+                        filename = f"drive_file_{file_id}"
+                        try:
+                            resp = await self.page.request.head(direct, timeout=15000)
+                            cd = resp.headers.get("Content-Disposition", "")
+                            if cd:
+                                mt = re.search(r'filename\*?="?([^";]+)', cd)
+                                if mt:
+                                    filename = mt.group(1).strip('"').strip()
+                            else:
+                                ctype = resp.headers.get("Content-Type", "")
+                                ext_guess = mimetypes.guess_extension(ctype) or ""
+                                filename = f"drive_file_{file_id}{ext_guess}"
+                        except Exception:
+                            pass
+
+                        size_str = await self.get_file_size(direct)
+                        found_files.append({
+                            'url': direct,
+                            'filename': filename,
+                            'size': size_str,
+                            'metadata': {}
+                        })
+
+            return found_files
+        except PlaywrightTimeoutError:
+            logger.error(f"Timeout extracting from {url}")
+            return []
+        except Exception as e:
+            logger.error(f"Error extracting from {url}: {e}")
+            return []
+    async def download_file(self, file_info, save_dir, referer):
+        file_url = file_info['url']
+        fname = file_info['filename']
+        path = os.path.join(save_dir, fname)
+        base, ext = os.path.splitext(fname)
+        i = 1
+        while os.path.exists(path):
+            path = os.path.join(save_dir, f"{base}({i}){ext}")
+            i += 1
+
+        os.makedirs(save_dir, exist_ok=True)
+        try:
+            if file_url.lower().endswith(".pdf") and "drive.google.com" not in file_url.lower():
+                response = requests.get(file_url, stream=True)
+                with open(path, "wb") as f:
+                    f.write(response.content)
+                logger.info(f"Directly downloaded PDF: {path}")
+                return path
+
+            if "drive.google.com" in file_url.lower():
+                import gdown
+                try:
+                    result = gdown.download(file_url, output=path, quiet=False, fuzzy=True)
+                    if result is None:
+                        logger.error(f"gdown failed to download: {file_url}")
+                        return None
+                    current_ext = os.path.splitext(path)[1].lower()
+                    allowed_exts = {'.pdf', '.jpg', '.jpeg', '.png', '.docx', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv'}
+                    if current_ext not in allowed_exts:
+                        try:
+                            r = requests.head(file_url, allow_redirects=True, timeout=15)
+                            ctype = r.headers.get("Content-Type", "")
+                            guessed_ext = mimetypes.guess_extension(ctype) or ".pdf"
+                        except Exception as e:
+                            logger.error(f"Error in HEAD request for extension: {e}")
+                            guessed_ext = ".pdf"
+                        new_path = os.path.splitext(path)[0] + guessed_ext
+                        os.rename(path, new_path)
+                        path = new_path
+                    logger.info(f"Downloaded using gdown: {path}")
+                    return path
+                except Exception as e:
+                    logger.error(f"Error downloading using gdown: {e}")
+                    return None
+
+            headers = {
+                'Accept-Language': 'en-US,en;q=0.9',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'Referer': referer
+            }
+            await human_like_interactions(self.page)
+            resp = await self.page.request.get(file_url, headers=headers, timeout=30000)
+            if resp.status == 403:
+                logger.error(f"403 Forbidden: {file_url}")
+                return None
+            if not resp.ok:
+                logger.error(f"Failed to download {file_url}: Status {resp.status}")
+                return None
+            data = await resp.body()
+            with open(path, 'wb') as f:
+                f.write(data)
+            logger.info(f"Downloaded: {path}")
+            return path
+        except PlaywrightTimeoutError:
+            logger.error(f"Timeout downloading {file_url}")
+            return None
+        except Exception as e:
+            logger.error(f"Error downloading {file_url}: {e}")
+            return None
+
+    async def deep_search(self, url, custom_ext_list, sublink_limit=2000, max_concurrency=500):
+        progress_text = st.empty()
+        progress_bar = st.progress(0)
+
+        progress_text.text("Analyzing main page...")
+        all_files = []
+        main_files = await self.extract_downloadable_files(url, custom_ext_list)
+        all_files.extend(main_files)
+
+        progress_text.text("Getting sublinks...")
+        sublinks = await self.get_sublinks(url, sublink_limit)
+        total_links = len(sublinks)
+
+        progress_text.text(f"Processing {total_links} sublinks...")
+        sem = asyncio.Semaphore(max_concurrency)
+
+        async def analyze_one_sublink(link, idx):
+            async with sem:
+                progress_text.text(f"Processing link {idx}/{total_links}: {link}")
+                progress_bar.progress(idx/total_links)
+                return await self.extract_downloadable_files(link, custom_ext_list)
+
+        tasks = [analyze_one_sublink(link, i) for i, link in enumerate(sublinks, 1)]
+        sub_results = await asyncio.gather(*tasks)
+
+        for sr in sub_results:
+            all_files.extend(sr)
+
+        unique_map = {f['url']: f for f in all_files}
+        combined = list(unique_map.values())
+
+        progress_text.text(f"Found {len(combined)} unique files.")
+        progress_bar.progress(1.0)
+        return combined
+
+    async def get_sublinks(self, url, limit=20000):
+        try:
+            await self.page.goto(url, timeout=30000)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, "html.parser")
+            links = []
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+                if href.startswith('http'):
+                    links.append(href)
+                elif href.startswith('/'):
+                    parsed = urlparse(url)
+                    links.append(f"{parsed.scheme}://{parsed.netloc}{href}")
+            return list(set(links))[:limit]
+        except Exception as e:
+            logger.error(f"Error getting sublinks: {e}")
+            return []
+
+def main():
+    st.set_page_config(page_title="Advanced File Downloader", layout="wide")
+
+    if 'session_state' not in st.session_state:
+        st.session_state.session_state = {
+            'discovered_files': [],
+            'current_url': None,
+            'download_manager': None,
+            'google_creds': None
+        }
+
+    st.title("Advanced File Downloader")
+
+    mode = st.sidebar.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
+
+    with st.sidebar.expander("Advanced Options"):
+        custom_extensions = st.text_input(
+            "Custom File Extensions",
+            placeholder=".csv, .txt, .epub"
+        )
+        max_concurrency = st.slider(
+            "Max Concurrency",
+            min_value=1,
+            max_value=1000,
+            value=200
+        )
+        use_proxy = st.checkbox("Use Proxy")
+        proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
+
+    # Google OAuth Section
+    with st.expander("Google Drive Integration"):
+        if st.button("Start Google Sign-In"):
+            auth_url = get_google_auth_url()
+            st.markdown(f"[Click here to authorize]({auth_url})")
+
+        auth_code = st.text_input("Enter authorization code")
+        if st.button("Complete Sign-In") and auth_code:
+            creds, msg = exchange_code_for_credentials(auth_code)
+            st.session_state.session_state['google_creds'] = creds
+            st.write(msg)
+
+    if mode == "Manual URL":
+        manual_url_mode()
+    elif mode == "Bing Search":
+        bing_search_mode()
+    else:
+        pdf_summarizer_mode()
+
+def manual_url_mode():
+    st.header("Manual URL Mode")
+
+    url = st.text_input("Enter URL", placeholder="https://example.com")
+
+    if st.button("Deep Search"):
+        if url:
+            async def run_deep_search():
+                async with DownloadManager(
+                    use_proxy=st.session_state.get('use_proxy', False),
+                    proxy=st.session_state.get('proxy', None)
+                ) as dm:
+                    files = await dm.deep_search(
+                        url=url,
+                        custom_ext_list=st.session_state.get('custom_extensions', '').split(','),
+                        max_concurrency=st.session_state.get('max_concurrency', 200)
+                    )
+                    st.session_state.session_state['discovered_files'] = files
+                    st.session_state.session_state['current_url'] = url
+
+                    if files:
+                        st.write(f"Found {len(files)} files:")
+                        for f in files:
+                            st.write(f"- {f['filename']} ({f['size']})")
+                    else:
+                        st.warning("No files found.")
+
+            asyncio.run(run_deep_search())
+
+def bing_search_mode():
+    st.header("Bing Search Mode")
+
+    query = st.text_input("Enter search query")
+    num_results = st.slider("Number of results", 1, 50, 5)
+
+    if st.button("Search"):
+        if query:
+            async def run_search():
+                async with DownloadManager(
+                    use_proxy=st.session_state.get('use_proxy', False),
+                    proxy=st.session_state.get('proxy', None),
+                    query=query,
+                    num_results=num_results
+                ) as dm:
+                    urls, info = await dm.search_bing()
+                    if urls:
+                        st.write("Search Results:")
+                        for i, (url, info) in enumerate(zip(urls, info), 1):
+                            st.write(f"{i}. {url}")
+                            st.write(f"   Snippet: {info['snippet']}")
+                    else:
+                        st.warning("No results found.")
+
+            asyncio.run(run_search())
+
+def pdf_summarizer_mode():
+    st.header("PDF Summarizer")
+
+    pdf_url = st.text_input("Enter PDF URL")
+
+    if st.button("Summarize"):
+        if pdf_url:
+            summary = summarize_pdf_url(pdf_url)
+            st.write("Summary:")
+            st.write(summary)
+
+if __name__ == "__main__":
+    main()
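Editor's note: a minimal sketch of exercising the DownloadManager above outside the Streamlit UI, not part of this commit. It assumes the module-level setup in app.py (spaCy model download, apt installs in install_playwright_dependencies) succeeds when the module is imported, which realistically only holds inside the container built from the Dockerfile above; the file name demo_search.py and the query string are placeholders. It sticks to search_bing because deep_search writes to Streamlit progress widgets.

# demo_search.py - hypothetical usage sketch, not part of this commit.
import asyncio
from app import DownloadManager  # importing app runs its module-level setup

async def demo():
    async with DownloadManager(query="openstax physics textbook", num_results=3) as dm:
        urls, infos = await dm.search_bing()
        for url, info in zip(urls, infos):
            print(url, "-", info["snippet"][:80])

if __name__ == "__main__":
    asyncio.run(demo())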
app_hf.py
ADDED
@@ -0,0 +1,5 @@
+import gradio as gr
+from app import build_gradio_app
+
+app = build_gradio_app()
+app.launch(server_name="0.0.0.0")
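Editor's note: app.py as added in this commit does not define build_gradio_app, so this import would fail at runtime. A hypothetical sketch of the factory app_hf.py appears to expect is below; the name, layout, and wiring are assumptions, reusing summarize_pdf_url from app.py (its st.spinner call may only emit a warning outside a Streamlit run context).

# Hypothetical build_gradio_app() that could live in app.py; nothing in this commit defines it.
import gradio as gr

def build_gradio_app() -> gr.Blocks:
    with gr.Blocks(title="Craw_Web") as demo:
        gr.Markdown("## Craw_Web - PDF Summarizer")
        pdf_url = gr.Textbox(label="PDF URL", placeholder="https://example.com/paper.pdf")
        summary = gr.Textbox(label="Summary", lines=8)
        # summarize_pdf_url is defined earlier in app.py in this commit
        gr.Button("Summarize").click(fn=summarize_pdf_url, inputs=pdf_url, outputs=summary)
    return demo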
requirements.txt
ADDED
@@ -0,0 +1,13 @@
+stablepy==0.6.0
+gradio>=3.0.0
+playwright>=1.35.0
+spacy>=3.5.0
+google-auth-oauthlib>=0.4.6
+google-auth-httplib2>=0.1.0
+google-api-python-client>=2.70.0
+PyPDF2>=3.0.0
+beautifulsoup4>=4.11.2
+gdown
+sentence-transformers
+spacy-transformers
+transformers