euler314 committed
Commit 4b9057c · verified · 1 Parent(s): 697d57f

Upload 9 files

Files changed (8)
  1. .dockerignore +19 -0
  2. Dockerfile +92 -0
  3. Privacy Policy.txt +34 -0
  4. README.md +5 -5
  5. Terms of Service.txt +37 -0
  6. app.py +674 -0
  7. app_hf.py +5 -0
  8. requirements.txt +13 -0
.dockerignore ADDED
@@ -0,0 +1,19 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.log
+ .pytest_cache
+ .env
+ .venv
Dockerfile ADDED
@@ -0,0 +1,92 @@
+ FROM ubuntu:22.04
+
+ USER root
+
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV PIP_ROOT_USER_ACTION=ignore
+ ENV HOME=/home/user
+ ENV PLAYWRIGHT_BROWSERS_PATH=${HOME}/.cache/ms-playwright
+ ENV LD_LIBRARY_PATH=/usr/lib/playwright:/usr/lib/x86_64-linux-gnu
+ ENV GRADIO_NODE_PORT=disabled
+
+ RUN useradd -m -d /home/user user && \
+     mkdir -p ${HOME}/.cache/ms-playwright && \
+     mkdir -p /usr/lib/playwright && \
+     chown -R user:user ${HOME}/.cache && \
+     chmod -R 755 ${HOME}/.cache
+
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+         python3.11 \
+         python3-pip \
+         python3.11-dev \
+         wget \
+         unzip \
+         ca-certificates \
+         libnss3 \
+         libnss3-tools \
+         libnspr4 \
+         libatk1.0-0 \
+         libatk-bridge2.0-0 \
+         libatspi2.0-0 \
+         libcups2 \
+         libxcomposite1 \
+         libxdamage1 \
+         libxrandr2 \
+         libxkbcommon0 \
+         libx11-xcb1 \
+         libxcursor1 \
+         libxi6 \
+         libxss1 \
+         libxtst6 \
+         libasound2 \
+         libx11-6 \
+         libxcb1 \
+         libxext6 \
+         libxfixes3 \
+         libxrender1 \
+         libdbus-1-3 \
+         libdrm2 \
+         libpango-1.0-0 \
+         fonts-liberation \
+         fonts-noto-color-emoji \
+         gcc && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN ln -s /usr/lib/x86_64-linux-gnu/libnss3.so /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libnssutil3.so /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libsmime3.so /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libnspr4.so /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libatk-1.0.so.0 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libcups.so.2 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libatspi.so.0 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libXcomposite.so.1 /usr/lib/playwright/ && \
+     ln -s /usr/lib/x86_64-linux-gnu/libXdamage.so.1 /usr/lib/playwright/
+
+ WORKDIR /app
+
+ COPY requirements.txt ./
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip3 install playwright==1.30.0
+
+ RUN cd ${HOME}/.cache/ms-playwright && \
+     wget -q https://playwright.azureedge.net/builds/chromium/1045/chromium-linux.zip && \
+     unzip chromium-linux.zip && \
+     rm chromium-linux.zip && \
+     chmod -R 755 ${HOME}/.cache/ms-playwright
+
+ COPY . .
+
+ RUN chown -R user:user /app && \
+     chmod -R 755 /app && \
+     chmod -R 755 ${HOME}/.cache/ms-playwright && \
+     chmod -R 755 /usr/lib/playwright
+
+ USER user
+
+ CMD ["python3", "app.py"]
Privacy Policy.txt ADDED
@@ -0,0 +1,34 @@
+ Effective Date: 1/28/2025
+
+ Welcome to Craw_Web ("we," "our," or "us"). Your privacy is important to us. This Privacy Policy explains how we collect, use, disclose, and safeguard your information when you use our platform.
+
+ 1. Information We Collect
+ Personal Information: Name, email address, phone number, or any other personal data you provide while using Craw_Web.
+ Usage Data: IP address, browser type, operating system, device information, and details about your interactions with Craw_Web, such as the pages visited or actions performed.
+ 2. How We Use Your Information
+ We use the information we collect for the following purposes:
+
+ To operate and maintain Craw_Web.
+ To personalize your experience and deliver content and features based on your preferences.
+ To communicate with you, including responding to inquiries or providing updates about Craw_Web.
+ To improve Craw_Web by analyzing usage trends and gathering feedback.
+ 3. Sharing Your Information
+ We do not sell or rent your personal information to third parties. We may share information in the following circumstances:
+
+ With service providers who assist in operating Craw_Web (e.g., hosting, analytics).
+ To comply with legal obligations, such as responding to a court order or regulatory request.
+ To protect the rights, safety, and security of Craw_Web and its users.
+ 4. Data Security
+ We implement reasonable security measures to protect your data. However, no method of transmission over the internet is entirely secure. Therefore, we cannot guarantee absolute security.
+
+ 5. Your Rights
+ You may have rights under applicable data protection laws, including:
+
+ The right to access your data.
+ The right to request corrections or deletions of your data.
+ The right to opt out of certain types of data processing.
+ 6. Changes to This Policy
+ We may update this Privacy Policy from time to time. The revised policy will be effective as of the updated date and available on Craw_Web.
+
+ 7. Contact Us
+ If you have any questions about this Privacy Policy, you can contact us at: [email protected]
README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  title: Craw Web
- emoji: 🌍
- colorFrom: blue
+ emoji: 🔥
+ colorFrom: pink
  colorTo: yellow
  sdk: streamlit
- sdk_version: 1.42.0
+ sdk_version: 5.16.0
  app_file: app.py
  pinned: false
  license: mit
- short_description: an application for web scraping
+ short_description: 'a application to craw the web '
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Terms of Service.txt ADDED
@@ -0,0 +1,37 @@
+ Effective Date: 1/28/2025
+
+ Welcome to Craw_Web. By accessing or using our platform, you agree to these Terms of Service ("Terms"). If you do not agree, please refrain from using Craw_Web.
+
+ 1. Use of Craw_Web
+ You agree to use Craw_Web only for lawful purposes and in compliance with these Terms.
+ You must not attempt to harm, disrupt, or misuse Craw_Web, including engaging in unauthorized access or data scraping.
+ 2. User Accounts
+ You may be required to create an account to access certain features of Craw_Web.
+ You are responsible for safeguarding your account credentials and ensuring their confidentiality.
+ 3. Intellectual Property
+ All content, trademarks, and materials on Craw_Web are owned by us or our licensors.
+ You may not reproduce, distribute, or use any content without prior written permission.
+ 4. Prohibited Activities
+ You agree not to:
+
+ Engage in activities that violate any laws or regulations.
+ Upload or distribute harmful or malicious content.
+ Use Craw_Web to spam, harass, or harm others.
+ 5. Limitation of Liability
+ We are not liable for any damages or losses arising from:
+
+ Your use of Craw_Web.
+ Technical issues or interruptions in service.
+ Unauthorized access to your account or data.
+ 6. Termination
+ We may suspend or terminate your access to Craw_Web at any time for any reason, including violations of these Terms.
+
+ 7. Changes to Terms
+ We may update these Terms from time to time. Continued use of Craw_Web after changes are made constitutes acceptance of the revised Terms.
+
+ 8. Governing Law
+ These Terms are governed by the laws of [Your Jurisdiction], without regard to conflict of law principles.
+
+ 9. Contact Us
+ If you have any questions about these Terms, you can contact us at: [email protected]
+
app.py ADDED
@@ -0,0 +1,674 @@
+ import os
+ import subprocess
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
+ import asyncio
+ import logging
+ from urllib.parse import urlparse
+ import re
+ from pathlib import Path
+ from io import BytesIO
+ import random
+ import streamlit as st
+ from bs4 import BeautifulSoup
+ from PyPDF2 import PdfReader
+ import zipfile
+ import tempfile
+ import mimetypes
+ import requests
+
+ # -------------------- spaCy Model Setup --------------------
+ import spacy
+ import spacy.cli
+ from spacy.language import Language
+
+ # Register a dummy factory under the exact key that the transformer model expects.
+ @Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
+ def dummy_roberta_transformer(nlp, name):
+     # This dummy component simply passes the Doc through.
+     def dummy(doc):
+         return doc
+     return dummy
+
+ # Try to load the transformer-based model.
+ @st.cache_resource
+ def load_nlp_model():
+     try:
+         nlp_model = spacy.load("en_core_web_trf")
+     except OSError:
+         st.write("Model en_core_web_trf not found. Downloading it now...")
+         spacy.cli.download("en_core_web_trf")
+         try:
+             nlp_model = spacy.load("en_core_web_trf")
+         except Exception as e:
+             st.error(f"Error loading model after download: {e}")
+             st.write("Falling back to en_core_web_sm...")
+             spacy.cli.download("en_core_web_sm")
+             nlp_model = spacy.load("en_core_web_sm")
+     return nlp_model
+
+ nlp_model = load_nlp_model()
+
+ # Also load SentenceTransformer for semantic re-ranking.
+ from sentence_transformers import SentenceTransformer, util
+ @st.cache_resource
+ def load_semantic_model():
+     return SentenceTransformer('all-MiniLM-L6-v2')
+
+ semantic_model = load_semantic_model()
+
+ # -------------------- Transformers Summarization Setup --------------------
+ from transformers import pipeline
+ @st.cache_resource
+ def load_summarizer():
+     return pipeline("summarization")
+
+ summarizer = load_summarizer()
+
+ def summarize_pdf_url(pdf_url):
+     """
+     Downloads a PDF from the given URL, extracts text using PyPDF2,
+     and returns a summary of (up to) the first 3000 characters.
+     """
+     try:
+         with st.spinner("Downloading and processing PDF..."):
+             response = requests.get(pdf_url, stream=True)
+             temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+             with open(temp_pdf.name, "wb") as f:
+                 f.write(response.content)
+             reader = PdfReader(temp_pdf.name)
+             text = " ".join([page.extract_text() or "" for page in reader.pages])
+             os.remove(temp_pdf.name)
+             limited_text = text[:3000]  # Limit text for summarization
+             summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
+             return summary[0]["summary_text"]
+     except Exception as e:
+         return f"Error summarizing PDF: {e}"
+
+ # -------------------- Google API Setup --------------------
+ GOOGLE_OAUTH_CONFIG = {
+     "web": {
+         "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
+         "project_id": "huggingface-449214",
+         "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+         "token_uri": "https://oauth2.googleapis.com/token",
+         "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+         "client_secret": "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",
+         "redirect_uris": ["https://euler314-craw-web.hf.space/"]
+     }
+ }
+
+ import google_auth_oauthlib.flow
+ import googleapiclient.discovery
+ import google.auth.transport.requests
+
+ def get_google_auth_url():
+     client_config = GOOGLE_OAUTH_CONFIG["web"]
+     flow = google_auth_oauthlib.flow.Flow.from_client_config(
+         {"web": client_config},
+         scopes=["https://www.googleapis.com/auth/drive.file"]
+     )
+     flow.redirect_uri = client_config["redirect_uris"][0]
+     authorization_url, _ = flow.authorization_url(
+         access_type="offline",
+         include_granted_scopes="true",
+         prompt="consent"
+     )
+     return authorization_url
+
+ def exchange_code_for_credentials(auth_code):
+     if not auth_code.strip():
+         return None, "No code provided."
+     try:
+         client_config = GOOGLE_OAUTH_CONFIG["web"]
+         flow = google_auth_oauthlib.flow.Flow.from_client_config(
+             {"web": client_config},
+             scopes=["https://www.googleapis.com/auth/drive.file"]
+         )
+         flow.redirect_uri = client_config["redirect_uris"][0]
+         flow.fetch_token(code=auth_code.strip())
+         creds = flow.credentials
+         if not creds or not creds.valid:
+             return None, "Could not validate credentials. Check code and try again."
+         return creds, "Google Sign-In successful!"
+     except Exception as e:
+         return None, f"Error during token exchange: {e}"
+ # -------------------- Playwright Setup --------------------
+ def install_playwright_dependencies():
+     os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
+     os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
+     try:
+         subprocess.run(['apt-get', 'update', '-y'], check=True)
+         packages = [
+             'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
+             'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
+             'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
+         ]
+         subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
+         os.makedirs('/usr/lib/playwright', exist_ok=True)
+         symlinks = {
+             'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
+             'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
+             'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
+             'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
+             'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
+             'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
+             'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
+             'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
+             'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
+             'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
+         }
+         for link_name, target in symlinks.items():
+             link_path = os.path.join('/usr/lib/playwright', link_name)
+             if not os.path.exists(link_path):
+                 os.symlink(target, link_path)
+         subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
+         browser_path = os.path.expanduser("~/.cache/ms-playwright")
+         os.makedirs(browser_path, exist_ok=True)
+         subprocess.run(['chmod', '-R', '755', browser_path], check=True)
+     except subprocess.CalledProcessError as e:
+         st.error(f"Error installing dependencies: {e}")
+     except Exception as e:
+         st.error(f"Error: {e}")
+
+ # Initialize Playwright dependencies
+ install_playwright_dependencies()
+
+ # -------------------- Logging Setup --------------------
+ logging.basicConfig(
+     filename='advanced_download_log.txt',
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger()
+
+ # -------------------- Shared Utils --------------------
+ USER_AGENTS = [
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
+     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
+ ]
+
+ def get_random_user_agent():
+     return random.choice(USER_AGENTS)
+
+ def sizeof_fmt(num, suffix='B'):
+     for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+         if abs(num) < 1024.0:
+             return f"{num:3.1f}{unit}{suffix}"
+         num /= 1024.0
+     return f"{num:.1f}Y{suffix}"
+
+ # ---------- Human-like Interactions -------------
+ async def human_like_scroll(page):
+     scroll_height = await page.evaluate('document.body.scrollHeight')
+     viewport_height = await page.evaluate('window.innerHeight')
+     current_scroll = 0
+     while current_scroll < scroll_height:
+         await page.evaluate(f'window.scrollTo(0, {current_scroll})')
+         await asyncio.sleep(random.uniform(0.5, 1.5))
+         current_scroll += viewport_height * random.uniform(0.5, 1.5)
+         scroll_height = await page.evaluate('document.body.scrollHeight')
+
+ async def human_like_interactions(page):
+     await page.mouse.move(random.randint(0, 1000), random.randint(0, 1000))
+     await asyncio.sleep(random.uniform(0.5, 1.5))
+     await page.mouse.click(random.randint(0, 1000), random.randint(0, 1000))
+     await asyncio.sleep(random.uniform(0.5, 1.5))
+     await page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
+     await asyncio.sleep(random.uniform(0.5, 1.5))
+
+ # ---------- NLP Helpers -------------
+ def nlp_preprocess(query: str) -> str:
+     doc = nlp_model(query)
+     tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
+     processed = " ".join(tokens)
+     return processed if processed.strip() else query
+
+ def nlp_extract_entities(text: str):
+     doc = nlp_model(text)
+     return [(ent.text, ent.label_) for ent in doc.ents]
+
+ # ---------- AI-enhanced Query Preprocessing -------------
+ def ai_preprocess_query(query: str) -> str:
+     return query
+ # ---------- Download Manager -------------
+ class DownloadManager:
+     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
+         self.use_proxy = use_proxy
+         self.proxy = proxy
+         self.query = query
+         self.num_results = num_results
+         self.playwright = None
+         self.browser = None
+         self.context = None
+         self.page = None
+
+     async def __aenter__(self):
+         self.playwright = await async_playwright().start()
+         opts = {"headless": True}
+         if self.use_proxy and self.proxy:
+             opts["proxy"] = {"server": self.proxy}
+         self.browser = await self.playwright.chromium.launch(**opts)
+         self.context = await self.browser.new_context(user_agent=get_random_user_agent())
+         self.page = await self.context.new_page()
+         await self.page.set_extra_http_headers({
+             'Accept-Language': 'en-US,en;q=0.9',
+             'Accept-Encoding': 'gzip, deflate, br',
+             'Referer': 'https://www.bing.com/'
+         })
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         if self.browser:
+             await self.browser.close()
+         if self.playwright:
+             await self.playwright.stop()
+
+     async def get_file_size(self, url):
+         try:
+             response = await self.page.request.head(url)
+             length = response.headers.get('Content-Length', None)
+             if length:
+                 return sizeof_fmt(int(length))
+             else:
+                 return "Unknown Size"
+         except Exception:
+             return "Unknown Size"
+
+     async def get_pdf_metadata(self, url):
+         try:
+             resp = await self.page.request.get(url, timeout=15000)
+             if resp.ok:
+                 content = await resp.body()
+                 pdf = BytesIO(content)
+                 reader = PdfReader(pdf)
+                 return {
+                     'Title': reader.metadata.title if reader.metadata.title else 'N/A',
+                     'Author': reader.metadata.author if reader.metadata.author else 'N/A',
+                     'Pages': len(reader.pages),
+                 }
+             else:
+                 return {}
+         except Exception:
+             return {}
+
+     async def search_bing(self):
+         if not self.query:
+             return [], []
+         query = self.query
+         if "filetype:pdf" not in query.lower():
+             query += " filetype:pdf"
+         if "site:" not in query.lower():
+             query += " site:edu OR site:arxiv.org OR site:openstax.org"
+         query = ai_preprocess_query(query)
+         query_processed = nlp_preprocess(query)
+         logger.info(f"BING SEARCH NLP: Original='{query}' -> Processed='{query_processed}'")
+
+         bing_url = f"https://www.bing.com/search?q={query_processed.replace(' ', '+')}&count={self.num_results}"
+         try:
+             await self.page.goto(bing_url, timeout=30000)
+             await self.page.wait_for_selector('li.b_algo', timeout=30000)
+             await human_like_scroll(self.page)
+             html = await self.page.content()
+             soup = BeautifulSoup(html, 'html.parser')
+             raw_results = soup.find_all('li', class_='b_algo')
+             url_list = []
+             info_list = []
+             snippets = []
+
+             for r in raw_results:
+                 link_tag = r.find('a')
+                 snippet_tag = r.find('p')
+                 snippet_text = snippet_tag.get_text(strip=True) if snippet_tag else ""
+                 snippets.append(snippet_text)
+                 entities = nlp_extract_entities(snippet_text)
+
+                 if link_tag and 'href' in link_tag.attrs:
+                     link_url = link_tag['href']
+                     url_list.append(link_url)
+                     info_list.append({
+                         'url': link_url,
+                         'snippet': snippet_text,
+                         'entities': entities
+                     })
+                     if len(url_list) >= self.num_results:
+                         break
+
+             query_emb = semantic_model.encode(query, convert_to_tensor=True)
+             snippet_embs = semantic_model.encode(snippets, convert_to_tensor=True)
+             scores = util.cos_sim(query_emb, snippet_embs)[0]
+             sorted_indices = scores.argsort(descending=True).cpu().numpy().tolist()
+             sorted_url_list = [url_list[i] for i in sorted_indices]
+             sorted_info_list = [info_list[i] for i in sorted_indices]
+
+             return sorted_url_list, sorted_info_list
+         except PlaywrightTimeoutError:
+             logger.error("Bing search timed out.")
+             return [], []
+         except Exception as e:
+             logger.error(f"Bing search error: {e}")
+             return [], []
+
+     async def extract_downloadable_files(self, url, custom_ext_list):
+         found_files = []
+         try:
+             await self.page.goto(url, timeout=30000)
+             await self.page.wait_for_load_state('networkidle', timeout=30000)
+             await human_like_interactions(self.page)
+             content = await self.page.content()
+             soup = BeautifulSoup(content, 'html.parser')
+
+             default_exts = [
+                 '.pdf', '.docx', '.zip', '.rar', '.exe', '.mp3',
+                 '.mp4', '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif'
+             ]
+             all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
+
+             anchors = soup.find_all('a', href=True)
+             for a in anchors:
+                 href = a['href'].strip()
+                 if any(href.lower().endswith(ext) for ext in all_exts):
+                     if href.startswith('http'):
+                         file_url = href
+                     elif href.startswith('/'):
+                         parsed = urlparse(url)
+                         file_url = f"{parsed.scheme}://{parsed.netloc}{href}"
+                     else:
+                         continue
+
+                     size_str = await self.get_file_size(file_url)
+                     meta = {}
+                     if file_url.lower().endswith('.pdf'):
+                         meta = await self.get_pdf_metadata(file_url)
+
+                     found_files.append({
+                         'url': file_url,
+                         'filename': os.path.basename(file_url.split('?')[0]),
+                         'size': size_str,
+                         'metadata': meta
+                     })
+
+                 elif ("drive.google.com" in href) or ("drive.com" in href):
+                     file_id = None
+                     for pattern in [
+                         r'/file/d/([^/]+)/',
+                         r'open\?id=([^&]+)',
+                         r'id=([^&]+)'
+                     ]:
+                         match = re.search(pattern, href)
+                         if match:
+                             file_id = match.group(1)
+                             break
+
+                     if file_id:
+                         direct = f"https://drive.google.com/uc?export=download&id={file_id}"
+                         filename = f"drive_file_{file_id}"
+                         try:
+                             resp = await self.page.request.head(direct, timeout=15000)
+                             cd = resp.headers.get("Content-Disposition", "")
+                             if cd:
+                                 mt = re.search(r'filename\*?="?([^";]+)', cd)
+                                 if mt:
+                                     filename = mt.group(1).strip('"').strip()
+                             else:
+                                 ctype = resp.headers.get("Content-Type", "")
+                                 ext_guess = mimetypes.guess_extension(ctype) or ""
+                                 filename = f"drive_file_{file_id}{ext_guess}"
+                         except Exception:
+                             pass
+
+                         size_str = await self.get_file_size(direct)
+                         found_files.append({
+                             'url': direct,
+                             'filename': filename,
+                             'size': size_str,
+                             'metadata': {}
+                         })
+
+             return found_files
+         except PlaywrightTimeoutError:
+             logger.error(f"Timeout extracting from {url}")
+             return []
+         except Exception as e:
+             logger.error(f"Error extracting from {url}: {e}")
+             return []
+     async def download_file(self, file_info, save_dir, referer):
+         file_url = file_info['url']
+         fname = file_info['filename']
+         path = os.path.join(save_dir, fname)
+         base, ext = os.path.splitext(fname)
+         i = 1
+         while os.path.exists(path):
+             path = os.path.join(save_dir, f"{base}({i}){ext}")
+             i += 1
+
+         os.makedirs(save_dir, exist_ok=True)
+         try:
+             if file_url.lower().endswith(".pdf") and "drive.google.com" not in file_url.lower():
+                 response = requests.get(file_url, stream=True)
+                 with open(path, "wb") as f:
+                     f.write(response.content)
+                 logger.info(f"Directly downloaded PDF: {path}")
+                 return path
+
+             if "drive.google.com" in file_url.lower():
+                 import gdown
+                 try:
+                     result = gdown.download(file_url, output=path, quiet=False, fuzzy=True)
+                     if result is None:
+                         logger.error(f"gdown failed to download: {file_url}")
+                         return None
+                     current_ext = os.path.splitext(path)[1].lower()
+                     allowed_exts = {'.pdf', '.jpg', '.jpeg', '.png', '.docx', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv'}
+                     if current_ext not in allowed_exts:
+                         try:
+                             r = requests.head(file_url, allow_redirects=True, timeout=15)
+                             ctype = r.headers.get("Content-Type", "")
+                             guessed_ext = mimetypes.guess_extension(ctype) or ".pdf"
+                         except Exception as e:
+                             logger.error(f"Error in HEAD request for extension: {e}")
+                             guessed_ext = ".pdf"
+                         new_path = os.path.splitext(path)[0] + guessed_ext
+                         os.rename(path, new_path)
+                         path = new_path
+                     logger.info(f"Downloaded using gdown: {path}")
+                     return path
+                 except Exception as e:
+                     logger.error(f"Error downloading using gdown: {e}")
+                     return None
+
+             headers = {
+                 'Accept-Language': 'en-US,en;q=0.9',
+                 'Accept-Encoding': 'gzip, deflate, br',
+                 'Referer': referer
+             }
+             await human_like_interactions(self.page)
+             resp = await self.page.request.get(file_url, headers=headers, timeout=30000)
+             if resp.status == 403:
+                 logger.error(f"403 Forbidden: {file_url}")
+                 return None
+             if not resp.ok:
+                 logger.error(f"Failed to download {file_url}: Status {resp.status}")
+                 return None
+             data = await resp.body()
+             with open(path, 'wb') as f:
+                 f.write(data)
+             logger.info(f"Downloaded: {path}")
+             return path
+         except PlaywrightTimeoutError:
+             logger.error(f"Timeout downloading {file_url}")
+             return None
+         except Exception as e:
+             logger.error(f"Error downloading {file_url}: {e}")
+             return None
+
+     async def deep_search(self, url, custom_ext_list, sublink_limit=2000, max_concurrency=500):
+         progress_text = st.empty()
+         progress_bar = st.progress(0)
+
+         progress_text.text("Analyzing main page...")
+         all_files = []
+         main_files = await self.extract_downloadable_files(url, custom_ext_list)
+         all_files.extend(main_files)
+
+         progress_text.text("Getting sublinks...")
+         sublinks = await self.get_sublinks(url, sublink_limit)
+         total_links = len(sublinks)
+
+         progress_text.text(f"Processing {total_links} sublinks...")
+         sem = asyncio.Semaphore(max_concurrency)
+
+         async def analyze_one_sublink(link, idx):
+             async with sem:
+                 progress_text.text(f"Processing link {idx}/{total_links}: {link}")
+                 progress_bar.progress(idx/total_links)
+                 return await self.extract_downloadable_files(link, custom_ext_list)
+
+         tasks = [analyze_one_sublink(link, i) for i, link in enumerate(sublinks, 1)]
+         sub_results = await asyncio.gather(*tasks)
+
+         for sr in sub_results:
+             all_files.extend(sr)
+
+         unique_map = {f['url']: f for f in all_files}
+         combined = list(unique_map.values())
+
+         progress_text.text(f"Found {len(combined)} unique files.")
+         progress_bar.progress(1.0)
+         return combined
+
+     async def get_sublinks(self, url, limit=20000):
+         try:
+             await self.page.goto(url, timeout=30000)
+             content = await self.page.content()
+             soup = BeautifulSoup(content, "html.parser")
+             links = []
+             for a in soup.find_all('a', href=True):
+                 href = a['href'].strip()
+                 if href.startswith('http'):
+                     links.append(href)
+                 elif href.startswith('/'):
+                     parsed = urlparse(url)
+                     links.append(f"{parsed.scheme}://{parsed.netloc}{href}")
+             return list(set(links))[:limit]
+         except Exception as e:
+             logger.error(f"Error getting sublinks: {e}")
+             return []
+
+ def main():
+     st.set_page_config(page_title="Advanced File Downloader", layout="wide")
+
+     if 'session_state' not in st.session_state:
+         st.session_state.session_state = {
+             'discovered_files': [],
+             'current_url': None,
+             'download_manager': None,
+             'google_creds': None
+         }
+
+     st.title("Advanced File Downloader")
+
+     mode = st.sidebar.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
+
+     with st.sidebar.expander("Advanced Options"):
+         custom_extensions = st.text_input(
+             "Custom File Extensions",
+             placeholder=".csv, .txt, .epub"
+         )
+         max_concurrency = st.slider(
+             "Max Concurrency",
+             min_value=1,
+             max_value=1000,
+             value=200
+         )
+         use_proxy = st.checkbox("Use Proxy")
+         proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
+
+     # Google OAuth Section
+     with st.expander("Google Drive Integration"):
+         if st.button("Start Google Sign-In"):
+             auth_url = get_google_auth_url()
+             st.markdown(f"[Click here to authorize]({auth_url})")
+
+         auth_code = st.text_input("Enter authorization code")
+         if st.button("Complete Sign-In") and auth_code:
+             creds, msg = exchange_code_for_credentials(auth_code)
+             st.session_state.session_state['google_creds'] = creds
+             st.write(msg)
+
+     if mode == "Manual URL":
+         manual_url_mode()
+     elif mode == "Bing Search":
+         bing_search_mode()
+     else:
+         pdf_summarizer_mode()
+
+ def manual_url_mode():
+     st.header("Manual URL Mode")
+
+     url = st.text_input("Enter URL", placeholder="https://example.com")
+
+     if st.button("Deep Search"):
+         if url:
+             async def run_deep_search():
+                 async with DownloadManager(
+                     use_proxy=st.session_state.get('use_proxy', False),
+                     proxy=st.session_state.get('proxy', None)
+                 ) as dm:
+                     files = await dm.deep_search(
+                         url=url,
+                         custom_ext_list=st.session_state.get('custom_extensions', '').split(','),
+                         max_concurrency=st.session_state.get('max_concurrency', 200)
+                     )
+                     st.session_state.session_state['discovered_files'] = files
+                     st.session_state.session_state['current_url'] = url
+
+                     if files:
+                         st.write(f"Found {len(files)} files:")
+                         for f in files:
+                             st.write(f"- {f['filename']} ({f['size']})")
+                     else:
+                         st.warning("No files found.")
+
+             asyncio.run(run_deep_search())
+
+ def bing_search_mode():
+     st.header("Bing Search Mode")
+
+     query = st.text_input("Enter search query")
+     num_results = st.slider("Number of results", 1, 50, 5)
+
+     if st.button("Search"):
+         if query:
+             async def run_search():
+                 async with DownloadManager(
+                     use_proxy=st.session_state.get('use_proxy', False),
+                     proxy=st.session_state.get('proxy', None),
+                     query=query,
+                     num_results=num_results
+                 ) as dm:
+                     urls, info = await dm.search_bing()
+                     if urls:
+                         st.write("Search Results:")
+                         for i, (url, info) in enumerate(zip(urls, info), 1):
+                             st.write(f"{i}. {url}")
+                             st.write(f" Snippet: {info['snippet']}")
+                     else:
+                         st.warning("No results found.")
+
+             asyncio.run(run_search())
+
+ def pdf_summarizer_mode():
+     st.header("PDF Summarizer")
+
+     pdf_url = st.text_input("Enter PDF URL")
+
+     if st.button("Summarize"):
+         if pdf_url:
+             summary = summarize_pdf_url(pdf_url)
+             st.write("Summary:")
+             st.write(summary)
+
+ if __name__ == "__main__":
+     main()
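
Taken on its own, app.py wires the whole pipeline together at import time (spaCy model load, Playwright setup, Streamlit UI). A minimal sketch of driving the DownloadManager from a plain script instead of the Streamlit UI, assuming app.py is importable, its module-level setup succeeds in your environment, and Playwright's Chromium is available; the query and save_dir values below are illustrative only:

import asyncio
from app import DownloadManager

async def fetch_first_result(query: str, save_dir: str = "./downloads"):
    # Open a headless browser session, search Bing, and inspect the top hit.
    async with DownloadManager(query=query, num_results=3) as dm:
        urls, _info = await dm.search_bing()
        if not urls:
            return None
        # Collect downloadable links from the first result page.
        files = await dm.extract_downloadable_files(urls[0], custom_ext_list=[])
        if not files:
            return None
        # Download the first discovered file, using the result page as referer.
        return await dm.download_file(files[0], save_dir, referer=urls[0])

if __name__ == "__main__":
    print(asyncio.run(fetch_first_result("open access linear algebra textbook")))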
app_hf.py ADDED
@@ -0,0 +1,5 @@
+ import gradio as gr
+ from app import build_gradio_app
+
+ app = build_gradio_app()
+ app.launch(server_name="0.0.0.0")
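
Note that app_hf.py imports build_gradio_app from app, but the app.py uploaded in this commit defines a Streamlit main() and no such factory, so this entry point would raise an ImportError as committed. A minimal sketch of the kind of factory it appears to expect, assuming it were added to app.py and built on Gradio Blocks; the UI below only wraps the existing summarize_pdf_url helper and is illustrative, not part of the commit:

import gradio as gr

def build_gradio_app():
    # Hypothetical factory inside app.py: reuses the summarize_pdf_url helper
    # defined earlier in that module and returns a Blocks app for app_hf.py.
    with gr.Blocks(title="Craw_Web") as demo:
        pdf_url = gr.Textbox(label="PDF URL", placeholder="https://example.com/paper.pdf")
        summary = gr.Textbox(label="Summary", lines=8)
        gr.Button("Summarize").click(summarize_pdf_url, inputs=pdf_url, outputs=summary)
    return demo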
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ stablepy==0.6.0
+ gradio>=3.0.0
+ playwright>=1.35.0
+ spacy>=3.5.0
+ google-auth-oauthlib>=0.4.6
+ google-auth-httplib2>=0.1.0
+ google-api-python-client>=2.70.0
+ PyPDF2>=3.0.0
+ beautifulsoup4>=4.11.2
+ gdown
+ sentence-transformers
+ spacy-transformers
+ transformers