Update app.py
app.py (CHANGED; the original version is listed below with removed lines marked "-", the updated version follows with added lines marked "+")
@@ -1,77 +1,7 @@
import streamlit as st
st.set_page_config(page_title="Advanced File Downloader", layout="wide")

-#
-import spacy
-import spacy.cli
-import os
-
-@st.cache_resource
-def load_models():
-    try:
-        # Try to load spaCy model
-        try:
-            nlp = spacy.load("en_core_web_sm")
-        except OSError:
-            st.info("Downloading spaCy model...")
-            spacy.cli.download("en_core_web_sm")
-            nlp = spacy.load("en_core_web_sm")
-
-        # Load SentenceTransformer with offline handling
-        try:
-            from sentence_transformers import SentenceTransformer
-            model_name = 'all-MiniLM-L6-v2'
-            cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
-            if os.path.exists(os.path.join(cache_dir, model_name)):
-                semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
-            else:
-                st.warning(f"Downloading SentenceTransformer model {model_name}...")
-                semantic_model = SentenceTransformer(model_name)
-        except Exception as e:
-            st.error(f"Error loading SentenceTransformer: {e}")
-            st.info("Continuing without semantic search capability...")
-            semantic_model = None
-
-        # Load Transformers pipeline with offline handling
-        try:
-            from transformers import pipeline
-            model_name = "facebook/bart-large-cnn"
-            cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
-            if os.path.exists(os.path.join(cache_dir, model_name)):
-                summarizer = pipeline("summarization", model=model_name)
-            else:
-                st.warning(f"Downloading Transformer model {model_name}...")
-                summarizer = pipeline("summarization")
-        except Exception as e:
-            st.error(f"Error loading Transformers: {e}")
-            st.info("Continuing without summarization capability...")
-            summarizer = None
-
-        return nlp, semantic_model, summarizer
-
-    except Exception as e:
-        st.error(f"Error loading models: {e}")
-        return None, None, None
-
-# Initialize models with better error handling
-with st.spinner("Loading models..."):
-    nlp_model, semantic_model, summarizer = load_models()
-
-if nlp_model is None:
-    st.error("Failed to load essential NLP model. The application cannot continue.")
-    st.stop()
-else:
-    # Continue with available features based on which models loaded successfully
-    if semantic_model is None:
-        st.warning("Semantic search features will be disabled.")
-    if summarizer is None:
-        st.warning("PDF summarization features will be disabled.")
-
-
-# Rest of your imports and code here...
-
-# Rest of your code...
-
import os
import subprocess
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
@@ -88,7 +18,28 @@ import zipfile
import tempfile
import mimetypes
import requests
-
def install_playwright_dependencies():
    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
    os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
@@ -126,23 +77,14 @@ def install_playwright_dependencies():
    except Exception as e:
        print(f"Error: {e}")

install_playwright_dependencies()

-#
-import spacy
-import spacy.cli
-from spacy.language import Language
-
-@Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
-def dummy_roberta_transformer(nlp, name):
-    def dummy(doc):
-        return doc
-    return dummy
-
@st.cache_resource
def load_models():
    try:
-        #
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
@@ -150,18 +92,30 @@ def load_models():
            spacy.cli.download("en_core_web_sm")
            nlp = spacy.load("en_core_web_sm")

-        # Load SentenceTransformer
        try:
            from sentence_transformers import SentenceTransformer
-
        except Exception as e:
            st.error(f"Error loading SentenceTransformer: {e}")
            semantic_model = None

-        # Load Transformers pipeline with
        try:
-            from transformers import pipeline
-
        except Exception as e:
            st.error(f"Error loading Transformers: {e}")
            summarizer = None
@@ -172,55 +126,38 @@ def load_models():
        st.error(f"Error loading models: {e}")
        return None, None, None

-#
-
-
-def load_semantic_model():
-    return SentenceTransformer('all-MiniLM-L6-v2')
-
-semantic_model = load_semantic_model()
-
-# -------------------- Transformers Summarization Setup --------------------
-from transformers import pipeline
-@st.cache_resource
-def load_summarizer():
-    return pipeline("summarization")
-
-summarizer = load_summarizer()

- [... 9 removed lines ...]
-        os.remove(temp_pdf.name)
-        limited_text = text[:3000]
-        summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
-        return summary[0]["summary_text"]
-    except Exception as e:
-        return f"Error summarizing PDF: {e}"

- [... 6 removed lines ...]
-        "token_uri": "https://oauth2.googleapis.com/token",
-        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-        "client_secret": "your_client_secret",
-        "redirect_uris": ["your_redirect_uri"]
-    }
-}

- [... 3 removed lines ...]

def get_google_auth_url():
    client_config = GOOGLE_OAUTH_CONFIG["web"]
    flow = google_auth_oauthlib.flow.Flow.from_client_config(
@@ -253,76 +190,16 @@ def exchange_code_for_credentials(auth_code):
    except Exception as e:
        return None, f"Error during token exchange: {e}"

-
-def install_playwright_dependencies():
-    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
-    os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
    try:
-
    except Exception as e:
-
-
-# Initialize Playwright dependencies
-install_playwright_dependencies()
-
-# -------------------- Logging Setup --------------------
-logging.basicConfig(
-    filename='advanced_download_log.txt',
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger()
-
-# -------------------- Shared Utils --------------------
-USER_AGENTS = [
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
-    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
-]
-
-def get_random_user_agent():
-    return random.choice(USER_AGENTS)
-
-def sizeof_fmt(num, suffix='B'):
-    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
-        if abs(num) < 1024.0:
-            return f"{num:3.1f}{unit}{suffix}"
-        num /= 1024.0
-    return f"{num:.1f}Y{suffix}"
-# ---------- Human-like Interactions -------------
-async def human_like_scroll(page):
-    scroll_height = await page.evaluate('document.body.scrollHeight')
-    viewport_height = await page.evaluate('window.innerHeight')
-    current_scroll = 0
-    while current_scroll < scroll_height:
-        await page.evaluate(f'window.scrollTo(0, {current_scroll})')
-        await asyncio.sleep(random.uniform(0.5, 1.5))
-        current_scroll += viewport_height * random.uniform(0.5, 1.5)
-        scroll_height = await page.evaluate('document.body.scrollHeight')
-
-async def human_like_interactions(page):
-    await page.mouse.move(random.randint(0, 1000), random.randint(0, 1000))
-    await asyncio.sleep(random.uniform(0.5, 1.5))
-    await page.mouse.click(random.randint(0, 1000), random.randint(0, 1000))
-    await asyncio.sleep(random.uniform(0.5, 1.5))
-    await page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
-    await asyncio.sleep(random.uniform(0.5, 1.5))
-
-# ---------- NLP Helpers -------------
-def nlp_preprocess(query: str) -> str:
-    doc = nlp_model(query)
-    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
-    processed = " ".join(tokens)
-    return processed if processed.strip() else query
-
-def nlp_extract_entities(text: str):
-    doc = nlp_model(text)
-    return [(ent.text, ent.label_) for ent in doc.ents]
-
-# ---------- AI-enhanced Query Preprocessing -------------
-def ai_preprocess_query(query: str) -> str:
-    return query
class DownloadManager:
    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
        self.use_proxy = use_proxy
@@ -336,9 +213,20 @@ class DownloadManager:

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
-        opts = {
        if self.use_proxy and self.proxy:
            opts["proxy"] = {"server": self.proxy}
        self.browser = await self.playwright.chromium.launch(**opts)
        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
        self.page = await self.context.new_page()
@@ -391,10 +279,6 @@ class DownloadManager:
            response = await page.goto(url, wait_until='networkidle', timeout=30000)
            if response and response.headers.get('location'):
                return response.headers['location']
-            content_type = response.headers.get('content-type', '')
-            if 'text/html' not in content_type.lower():
-                return url
-            content = await page.content()
            return page.url
        except Exception as e:
            logger.error(f"Error extracting real download URL: {e}")
@@ -432,6 +316,23 @@ class DownloadManager:

            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if any(href.lower().endswith(ext) for ext in all_exts):
                    file_url = href if href.startswith('http') else (
                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
@@ -449,6 +350,7 @@ class DownloadManager:
                        'metadata': meta
                    })

                elif ("drive.google.com" in href) or ("docs.google.com" in href):
                    file_id = None
                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
@@ -477,7 +379,15 @@ class DownloadManager:
                    except Exception as e:
                        logger.error(f"Error processing Google Drive link: {e}")

-
        except Exception as e:
            logger.error(f"Error extracting files from {url}: {e}")
            return []
@@ -531,75 +441,27 @@ class DownloadManager:
            logger.error(f"Error downloading {file_url}: {e}")
            return None

-    async def search_bing(self):
-        if not self.query:
-            return [], []
-
-        search_query = self.query
-        if "filetype:pdf" not in search_query.lower():
-            search_query += " filetype:pdf"
-
-        search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
-
-        try:
-            await self.page.goto(search_url, timeout=30000)
-            await self.page.wait_for_selector('li.b_algo', timeout=30000)
-            await human_like_scroll(self.page)
-
-            results = []
-            elements = await self.page.query_selector_all('li.b_algo')
-
-            for element in elements:
-                link = await element.query_selector('h2 a')
-                if link:
-                    url = await link.get_attribute('href')
-                    if url:
-                        results.append(url)
-
-            return results[:self.num_results]
-
-        except Exception as e:
-            logger.error(f"Bing search error: {e}")
-            return []
-
-    async def get_sublinks(self, url, limit=100):
-        try:
-            await self.page.goto(url, timeout=30000)
-            content = await self.page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-
-            parsed_base = urlparse(url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-
-            links = set()
-            for a in soup.find_all('a', href=True):
-                href = a['href'].strip()
-                if href.startswith('http'):
-                    links.add(href)
-                elif href.startswith('/'):
-                    links.add(f"{base_url}{href}")
-
-            return list(links)[:limit]
-
-        except Exception as e:
-            logger.error(f"Error getting sublinks: {e}")
-            return []
-
    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
        if not custom_ext_list:
            custom_ext_list = []

        progress_text = st.empty()
        progress_bar = st.progress(0)

        try:
            # Search main page
            progress_text.text("Analyzing main page...")
            main_files = await self.extract_downloadable_files(url, custom_ext_list)

            # Get and search sublinks
            progress_text.text("Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)

            if not sublinks:
                progress_bar.progress(1.0)
@@ -607,32 +469,67 @@ class DownloadManager:

            # Process sublinks
            all_files = main_files
-            total_links = len(sublinks)

            for i, sublink in enumerate(sublinks, 1):
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
-                progress_bar.progress(

                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                all_files.extend(sub_files)

            # Make results unique
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)

-
            progress_bar.progress(1.0)

            return unique_files

        except Exception as e:
            logger.error(f"Deep search error: {e}")
            return []

def main():
    if 'initialized' not in st.session_state:
        st.session_state.initialized = True
@@ -642,6 +539,7 @@ def main():

    st.title("Advanced File Downloader")

    with st.sidebar:
        st.header("Settings")
        mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
@@ -651,9 +549,28 @@ def main():
            "Custom File Extensions",
            placeholder=".csv, .txt, .epub"
        )
        use_proxy = st.checkbox("Use Proxy")
        proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")

    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com")
@@ -662,74 +579,99 @@ def main():
        if url:
            async def run_deep_search():
                async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
- [... 8 removed lines ...]

            files = asyncio.run(run_deep_search())
            if files:
                st.success(f"Found {len(files)} files!")

- [... 7 removed lines ...]

-                #
-                st.subheader("Download Files")
                selected_files = st.multiselect(
                    "Select files to download",
                    range(len(files)),
                    format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
                )

                if selected_files:
-                    col1, col2 = st.columns(
                    with col1:
                        download_dir = st.text_input("Download Directory", value="./downloads")
                    with col2:
- [... 6 removed lines ...]

-
-
-                            progress_text.text(f"Downloading {files[idx]['filename']}...")
-                            progress_bar.progress(progress)
-
-                            path = await dm.download_file(
-                                files[idx],
-                                download_dir,
-                                url
-                            )
-                            if path:
-                                paths.append(path)

- [... 3 removed lines ...]

- [... 10 removed lines ...]
            else:
                st.warning("No files found.")

@@ -753,64 +695,18 @@ def main():
                st.success(f"Found {len(urls)} results!")
                for i, url in enumerate(urls, 1):
                    with st.expander(f"Result {i}: {url}", expanded=i==1):
-                        if st.button(f"Deep Search
                            files = await dm.deep_search(
                                url=url,
-                                custom_ext_list=custom_extensions.split(',') if custom_extensions else []
                            )
                            if files:
                                st.session_state.discovered_files = files
                                st.session_state.current_url = url
                                st.success(f"Found {len(files)} files!")
-
-                                with st.expander("Found Files", expanded=True):
-                                    for j, file in enumerate(files):
-                                        col1, col2 = st.columns([3, 1])
-                                        with col1:
-                                            st.write(f"{j+1}. {file['filename']}")
-                                        with col2:
-                                            st.write(f"Size: {file['size']}")
-
-                                selected_files = st.multiselect(
-                                    "Select files to download",
-                                    range(len(files)),
-                                    format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
-                                )
-
-                                if selected_files:
-                                    col1, col2 = st.columns([3, 1])
-                                    with col1:
-                                        download_dir = st.text_input("Download Directory", value="./downloads")
-                                    with col2:
-                                        if st.button("Download Selected Files"):
-                                            progress_text = st.empty()
-                                            progress_bar = st.progress(0)
-
-                                            paths = []
-                                            for k, idx in enumerate(selected_files):
-                                                progress = (k + 1) / len(selected_files)
-                                                progress_text.text(f"Downloading {files[idx]['filename']}...")
-                                                progress_bar.progress(progress)
-
-                                                path = await dm.download_file(
-                                                    files[idx],
-                                                    download_dir,
-                                                    url
-                                                )
-                                                if path:
-                                                    paths.append(path)
-
-                                            progress_text.empty()
-                                            progress_bar.empty()
-
-                                            if paths:
-                                                st.success(f"Successfully downloaded {len(paths)} files to {download_dir}")
-                                                if len(paths) > 1:
-                                                    zip_path = os.path.join(download_dir, "downloads.zip")
-                                                    with zipfile.ZipFile(zip_path, 'w') as zipf:
-                                                        for file in paths:
-                                                            zipf.write(file, os.path.basename(file))
-                                                    st.success(f"Created zip file: {zip_path}")
                            else:
                                st.warning("No files found on this page.")
                    else:
@@ -828,9 +724,20 @@ def main():
        if st.button("Summarize"):
            if pdf_url:
                with st.spinner("Generating summary..."):
- [... 3 removed lines ...]

if __name__ == "__main__":
    try:

(updated app.py: added lines are marked with "+")

import streamlit as st
st.set_page_config(page_title="Advanced File Downloader", layout="wide")

+# Core imports
import os
import subprocess
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

...

import tempfile
import mimetypes
import requests
+import datetime
+import spacy
+import spacy.cli
+from spacy.language import Language
+import google_auth_oauthlib.flow
+import googleapiclient.discovery
+import google.auth.transport.requests
+
+# Google OAuth Configuration
+GOOGLE_OAUTH_CONFIG = {
+    "web": {
+        "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
+        "project_id": "huggingface-449214",
+        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+        "token_uri": "https://oauth2.googleapis.com/token",
+        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+        "client_secret": "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",
+        "redirect_uris": ["https://euler314-craw-web.hf.space/"]
+    }
+}
+
+# Playwright Setup
def install_playwright_dependencies():
    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
    os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'

...

    except Exception as e:
        print(f"Error: {e}")

+# Initialize Playwright
install_playwright_dependencies()

+# Model Loading
@st.cache_resource
def load_models():
    try:
+        # Try to load spaCy model
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:

...

            spacy.cli.download("en_core_web_sm")
            nlp = spacy.load("en_core_web_sm")

+        # Load SentenceTransformer with offline handling
        try:
            from sentence_transformers import SentenceTransformer
+            model_name = 'all-MiniLM-L6-v2'
+            cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
+            if os.path.exists(os.path.join(cache_dir, model_name)):
+                semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
+            else:
+                st.warning(f"Downloading SentenceTransformer model {model_name}...")
+                semantic_model = SentenceTransformer(model_name)
        except Exception as e:
            st.error(f"Error loading SentenceTransformer: {e}")
            semantic_model = None

+        # Load Transformers pipeline with offline handling
        try:
+            from transformers import pipeline
+            model_name = "facebook/bart-large-cnn"
+            cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
+            if os.path.exists(os.path.join(cache_dir, model_name)):
+                summarizer = pipeline("summarization", model=model_name)
+            else:
+                st.warning(f"Downloading Transformer model {model_name}...")
+                summarizer = pipeline("summarization")
        except Exception as e:
            st.error(f"Error loading Transformers: {e}")
            summarizer = None
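A note on the cache check added above: it assumes the model folder sits directly at ~/.cache/torch/sentence_transformers/all-MiniLM-L6-v2 (and the Transformers equivalent under ~/.cache/huggingface/transformers). Recent library versions usually namespace their cache folders, so the os.path.exists test can report a miss even when the model is already cached. A minimal sketch of the same look-locally-then-download pattern, with the candidate paths treated as assumptions rather than guaranteed locations:

import os
from sentence_transformers import SentenceTransformer

def load_semantic_model(model_name='all-MiniLM-L6-v2'):
    # Candidate cache locations (assumptions; the actual layout depends on the
    # installed sentence-transformers / huggingface_hub version).
    candidates = [
        os.path.expanduser(f'~/.cache/torch/sentence_transformers/{model_name}'),
        os.path.expanduser(f'~/.cache/torch/sentence_transformers/sentence-transformers_{model_name}'),
    ]
    for path in candidates:
        if os.path.isdir(path):
            return SentenceTransformer(path)   # load from the local copy
    return SentenceTransformer(model_name)     # fall back to downloading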
        st.error(f"Error loading models: {e}")
        return None, None, None

+# Initialize models
+with st.spinner("Loading models..."):
+    nlp_model, semantic_model, summarizer = load_models()

+# Utility Functions
+def get_random_user_agent():
+    USER_AGENTS = [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
+    ]
+    return random.choice(USER_AGENTS)

+def sizeof_fmt(num, suffix='B'):
+    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+        if abs(num) < 1024.0:
+            return f"{num:3.1f}{unit}{suffix}"
+        num /= 1024.0
+    return f"{num:.1f}Y{suffix}"

+def create_zip_file(file_paths, output_dir):
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
+
+    with zipfile.ZipFile(zip_path, 'w') as zipf:
+        for file_path in file_paths:
+            zipf.write(file_path, os.path.basename(file_path))
+
+    return zip_path

+# Google Drive Functions
def get_google_auth_url():
    client_config = GOOGLE_OAUTH_CONFIG["web"]
    flow = google_auth_oauthlib.flow.Flow.from_client_config(

...

    except Exception as e:
        return None, f"Error during token exchange: {e}"

+def google_drive_upload(zip_path: str, credentials):
    try:
+        drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
+        file_metadata = {'name': os.path.basename(zip_path)}
+        media = googleapiclient.http.MediaFileUpload(zip_path, resumable=True)
+        created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
+        return created.get("id", "")
    except Exception as e:
+        return f"Error uploading to Drive: {str(e)}"
+
+# DownloadManager Class
class DownloadManager:
    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
        self.use_proxy = use_proxy
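One caveat in the hunk above: google_drive_upload calls googleapiclient.http.MediaFileUpload, but the new import list only brings in googleapiclient.discovery, so the http submodule may not be bound as an attribute at call time. A hedged standalone sketch of the zip-then-upload flow with that import made explicit; the paths and credentials in the usage comment are hypothetical:

import os
import googleapiclient.discovery
from googleapiclient.http import MediaFileUpload

def upload_zip_to_drive(zip_path, credentials):
    # Build the Drive v3 client from previously obtained OAuth credentials.
    drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
    media = MediaFileUpload(zip_path, resumable=True)
    created = drive_service.files().create(
        body={"name": os.path.basename(zip_path)},
        media_body=media,
        fields="id",
    ).execute()
    return created.get("id", "")

# Hypothetical usage after a download run:
# zip_path = create_zip_file(["./downloads/report.pdf"], "./downloads")
# file_id = upload_zip_to_drive(zip_path, st.session_state.google_creds)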

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
+        opts = {
+            "headless": True,
+            "args": [
+                '--no-sandbox',
+                '--disable-setuid-sandbox',
+                '--disable-dev-shm-usage',
+                '--disable-gpu',
+                '--no-zygote',
+                '--single-process'
+            ]
+        }
        if self.use_proxy and self.proxy:
            opts["proxy"] = {"server": self.proxy}
+
        self.browser = await self.playwright.chromium.launch(**opts)
        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
        self.page = await self.context.new_page()
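For what it is worth, the launch options added above can be exercised on their own to confirm that headless Chromium starts inside a constrained container. A minimal sketch, separate from the app, using an arbitrary test URL:

import asyncio
from playwright.async_api import async_playwright

async def smoke_test():
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
        )
        page = await browser.new_page()
        await page.goto("https://example.com", timeout=30000)  # arbitrary test URL
        print(await page.title())
        await browser.close()

if __name__ == "__main__":
    asyncio.run(smoke_test())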
            response = await page.goto(url, wait_until='networkidle', timeout=30000)
            if response and response.headers.get('location'):
                return response.headers['location']
            return page.url
        except Exception as e:
            logger.error(f"Error extracting real download URL: {e}")

...

            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
+
+                # Handle PHP scripts and redirects
+                if '.php' in href.lower() or 'download' in href.lower():
+                    full_url = href if href.startswith('http') else (
+                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
+                    )
+                    real_url = await self.extract_real_download_url(full_url)
+                    if real_url and real_url != full_url:
+                        found_files.append({
+                            'url': real_url,
+                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                            'size': await self.get_file_size(real_url),
+                            'metadata': {}
+                        })
+                        continue
+
+                # Handle direct file links
                if any(href.lower().endswith(ext) for ext in all_exts):
                    file_url = href if href.startswith('http') else (
                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"

...

                        'metadata': meta
                    })

+                # Handle Google Drive links
                elif ("drive.google.com" in href) or ("docs.google.com" in href):
                    file_id = None
                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:

...

                    except Exception as e:
                        logger.error(f"Error processing Google Drive link: {e}")

+            # Make results unique based on URLs
+            seen_urls = set()
+            unique_files = []
+            for f in found_files:
+                if f['url'] not in seen_urls:
+                    seen_urls.add(f['url'])
+                    unique_files.append(f)
+
+            return unique_files
        except Exception as e:
            logger.error(f"Error extracting files from {url}: {e}")
            return []

...

            logger.error(f"Error downloading {file_url}: {e}")
            return None

    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
        if not custom_ext_list:
            custom_ext_list = []

        progress_text = st.empty()
        progress_bar = st.progress(0)
+        file_count_text = st.empty()

        try:
            # Search main page
            progress_text.text("Analyzing main page...")
            main_files = await self.extract_downloadable_files(url, custom_ext_list)
+            initial_count = len(main_files)
+            file_count_text.text(f"Found {initial_count} files on main page")

            # Get and search sublinks
            progress_text.text("Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)
+            total_links = len(sublinks)
+
+            progress_text.text(f"Found {total_links} sublinks to process")

            if not sublinks:
                progress_bar.progress(1.0)

...

            # Process sublinks
            all_files = main_files

            for i, sublink in enumerate(sublinks, 1):
+                progress = i/total_links
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
+                progress_bar.progress(progress)

                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                all_files.extend(sub_files)

+                # Update count in real-time
+                file_count_text.text(f"Found {len(all_files)} total files")
+
            # Make results unique
            seen_urls = set()
            unique_files = []
+
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)

+            final_count = len(unique_files)
+            progress_text.text(f"Deep search complete!")
+            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)

            return unique_files

        except Exception as e:
            logger.error(f"Deep search error: {e}")
+            progress_text.text(f"Error during deep search: {str(e)}")
            return []
+        finally:
+            # Clean up progress indicators after a delay
+            await asyncio.sleep(2)
+            if not st.session_state.get('keep_progress', False):
+                progress_text.empty()
+                progress_bar.empty()

+    async def get_sublinks(self, url, limit=100):
+        try:
+            await self.page.goto(url, timeout=30000)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            links = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+                if href.startswith('http'):
+                    links.add(href)
+                elif href.startswith('/'):
+                    links.add(f"{base_url}{href}")
+
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks: {e}")
+            return []

def main():
    if 'initialized' not in st.session_state:
        st.session_state.initialized = True
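A side note on the get_sublinks helper added above: it only keeps hrefs that start with http or /, so document-relative links such as files/report.pdf are dropped. urllib.parse.urljoin handles absolute, root-relative, and document-relative forms alike; the sketch below shows that alternative normalization, which is not what this commit does:

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def extract_sublinks(html, page_url, limit=100):
    # Resolve every href against the page URL; urljoin copes with absolute,
    # root-relative and document-relative links alike.
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        if href.startswith(('javascript:', 'mailto:', '#')):
            continue  # skip non-navigational links
        absolute = urljoin(page_url, href)
        if urlparse(absolute).scheme in ('http', 'https'):
            links.add(absolute)
    return list(links)[:limit]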

    st.title("Advanced File Downloader")

+    # Sidebar settings
    with st.sidebar:
        st.header("Settings")
        mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])

...

            "Custom File Extensions",
            placeholder=".csv, .txt, .epub"
        )
+        max_sublinks = st.number_input(
+            "Maximum Sublinks to Process",
+            min_value=1,
+            max_value=10000,
+            value=100,
+            help="Maximum number of sublinks to process from the main page"
+        )
        use_proxy = st.checkbox("Use Proxy")
        proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")

+        # Google Drive Integration
+        with st.expander("Google Drive Integration"):
+            if st.button("Start Google Sign-In"):
+                auth_url = get_google_auth_url()
+                st.markdown(f"[Click here to authorize]({auth_url})")
+
+            auth_code = st.text_input("Enter authorization code")
+            if st.button("Complete Sign-In") and auth_code:
+                creds, msg = exchange_code_for_credentials(auth_code)
+                st.session_state.google_creds = creds
+                st.write(msg)
+
    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com")

...

        if url:
            async def run_deep_search():
                async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                    files = await dm.deep_search(
+                        url=url,
+                        custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
+                        sublink_limit=max_sublinks
+                    )
+                    st.session_state.discovered_files = files
+                    st.session_state.current_url = url
+                    return files

            files = asyncio.run(run_deep_search())
            if files:
                st.success(f"Found {len(files)} files!")

+                # Select All/Clear Selection buttons
+                col1, col2 = st.columns([1, 4])
+                with col1:
+                    if st.button("Select All"):
+                        st.session_state.selected_files = list(range(len(files)))
+                    if st.button("Clear Selection"):
+                        st.session_state.selected_files = []

+                # File selection
                selected_files = st.multiselect(
                    "Select files to download",
                    range(len(files)),
+                    default=st.session_state.get('selected_files', []),
                    format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
                )

                if selected_files:
+                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        download_dir = st.text_input("Download Directory", value="./downloads")
                    with col2:
+                        create_zip = st.checkbox("Create ZIP file", value=True)
+                    with col3:
+                        delete_after = st.checkbox("Delete after creating ZIP")
+                    with col4:
+                        upload_to_drive = st.checkbox("Upload to Google Drive")
+
+                    if st.button("Download Selected"):
+                        if not os.path.exists(download_dir):
+                            os.makedirs(download_dir)
+
+                        async def download_files():
+                            downloaded_paths = []
+                            progress_bar = st.progress(0)
+                            status_text = st.empty()
+
+                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                                for i, idx in enumerate(selected_files):
+                                    progress = (i + 1) / len(selected_files)
+                                    file_info = files[idx]

+                                    status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
+                                    progress_bar.progress(progress)

+                                    path = await dm.download_file(
+                                        file_info,
+                                        download_dir,
+                                        url
+                                    )
+                                    if path:
+                                        downloaded_paths.append(path)
+
+                            status_text.empty()
+                            progress_bar.empty()
+                            return downloaded_paths
+
+                        downloaded = asyncio.run(download_files())

+                        if downloaded:
+                            st.success(f"Successfully downloaded {len(downloaded)} files")
+
+                            if create_zip or upload_to_drive:
+                                zip_path = create_zip_file(downloaded, download_dir)
+                                st.success(f"Created ZIP file: {zip_path}")
+
+                                if upload_to_drive and st.session_state.get('google_creds'):
+                                    with st.spinner("Uploading to Google Drive..."):
+                                        drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
+                                        if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
+                                            st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
+                                        else:
+                                            st.error(drive_id)
+
+                                if delete_after:
+                                    for path in downloaded:
+                                        try:
+                                            os.remove(path)
+                                        except Exception as e:
+                                            st.warning(f"Could not delete {path}: {e}")
+                                    st.info("Deleted original files after ZIP creation")
            else:
                st.warning("No files found.")

...

                st.success(f"Found {len(urls)} results!")
                for i, url in enumerate(urls, 1):
                    with st.expander(f"Result {i}: {url}", expanded=i==1):
+                        if st.button(f"Deep Search Result {i}"):
                            files = await dm.deep_search(
                                url=url,
+                                custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
+                                sublink_limit=max_sublinks
                            )
+                            # Reuse the same file handling logic as Manual URL mode
                            if files:
                                st.session_state.discovered_files = files
                                st.session_state.current_url = url
                                st.success(f"Found {len(files)} files!")
+                                # Add file selection and download UI here (same as Manual URL mode)
                            else:
                                st.warning("No files found on this page.")
                    else:

...

        if st.button("Summarize"):
            if pdf_url:
                with st.spinner("Generating summary..."):
+                    try:
+                        response = requests.get(pdf_url, stream=True)
+                        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+                        with open(temp_pdf.name, "wb") as f:
+                            f.write(response.content)
+                        reader = PdfReader(temp_pdf.name)
+                        text = " ".join([page.extract_text() or "" for page in reader.pages])
+                        os.remove(temp_pdf.name)
+                        limited_text = text[:3000]
+                        summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
+                        st.write("Summary:")
+                        st.write(summary[0]['summary_text'])
+                    except Exception as e:
+                        st.error(f"Error summarizing PDF: {e}")

if __name__ == "__main__":
    try: