Update app.py
app.py CHANGED
@@ -217,6 +217,7 @@ class DownloadManager:
        self.browser = None
        self.context = None
        self.page = None

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
@@ -250,6 +251,63 @@ class DownloadManager:
        if self.playwright:
            await self.playwright.stop()

    async def get_file_size(self, url):
        try:
            async with self.context.new_page() as page:
@@ -280,40 +338,32 @@ class DownloadManager:
        except Exception:
            return {}

-    async def extract_real_download_url(self, url):
-        try:
-            async with self.context.new_page() as page:
-                response = await page.goto(url, wait_until='networkidle', timeout=30000)
-                if response and response.headers.get('location'):
-                    return response.headers['location']
-                return page.url
-        except Exception as e:
-            logger.error(f"Error extracting real download URL: {e}")
-            return url
-
    async def extract_downloadable_files(self, url, custom_ext_list):
        found_files = []
        try:
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
@@ -321,70 +371,73 @@ class DownloadManager:
            parsed_base = urlparse(final_url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            for a in soup.find_all('a', href=True):

-                    size_str = await self.get_file_size(file_url)
-                    meta = {}
-                    if file_url.lower().endswith('.pdf'):
-                        meta = await self.get_pdf_metadata(file_url)
-
-                    found_files.append({
-                        'url': file_url,
-                        'filename': os.path.basename(file_url.split('?')[0]),
-                        'size': size_str,
-                        'metadata': meta
-                    })
-
-                # Handle Google Drive links
-                elif ("drive.google.com" in href) or ("docs.google.com" in href):
-                    file_id = None
-                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
-                        match = re.search(pattern, href)
-                        if match:
-                            file_id = match.group(1)
-                            break
-
-                    if file_id:
-                        direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-                        filename = file_id
-                        try:
-                            response = await self.page.request.head(direct_url, timeout=15000)
-                            cd = response.headers.get("Content-Disposition", "")
-                            if cd:
-                                mt = re.search(r'filename\*?="?([^";]+)', cd)
-                                if mt:
-                                    filename = mt.group(1).strip('"').strip()
-
                            found_files.append({
-                                'url':
-                                'filename':
-                                'size': await self.get_file_size(
                                'metadata': {}
                            })

            # Make results unique based on URLs
            seen_urls = set()
@@ -395,11 +448,11 @@ class DownloadManager:
                unique_files.append(f)

            return unique_files
        except Exception as e:
            logger.error(f"Error extracting files from {url}: {e}")
            return []
-
-    async def download_file(self, file_info, save_dir, referer):
        file_url = file_info['url']
        fname = file_info['filename']
        path = os.path.join(save_dir, fname)
@@ -412,11 +465,14 @@ class DownloadManager:
        os.makedirs(save_dir, exist_ok=True)

        try:
                import gdown
                try:
                    st.write(f"Downloading from Google Drive: {fname}")
-                    output = gdown.download(
                    if output:
                        return path
                    return None
@@ -433,7 +489,7 @@ class DownloadManager:
                    'Referer': referer
                }

-                response = await page.request.get(

                if response.status == 200:
                    content = await response.body()
@@ -441,61 +497,86 @@ class DownloadManager:
                        f.write(content)
                    return path
                else:
-                    logger.error(f"Download failed with status {response.status}: {
                    return None

        except Exception as e:
            logger.error(f"Error downloading {file_url}: {e}")
            return None

-    async def search_bing(self):
-        if not self.query:
-            return [], []
-
-        search_query = self.query
-        if "filetype:pdf" not in search_query.lower():
-            search_query += " filetype:pdf"
-
-        search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
-
-        try:
-            await self.page.goto(search_url, timeout=30000)
-            await self.page.wait_for_selector('li.b_algo', timeout=30000)
-
-            results = []
-            elements = await self.page.query_selector_all('li.b_algo')
-
-            for element in elements:
-                link = await element.query_selector('h2 a')
-                if link:
-                    url = await link.get_attribute('href')
-                    if url:
-                        results.append(url)
-
-            return results[:self.num_results]
-
-        except Exception as e:
-            logger.error(f"Bing search error: {e}")
-            return []
-
    async def get_sublinks(self, url, limit=100):
        try:
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

-            parsed_base = urlparse(
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

            links = set()
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
-                if href.startswith('
                    links.add(href)

        except Exception as e:
            logger.error(f"Error getting sublinks: {e}")
@@ -510,15 +591,21 @@ class DownloadManager:
        file_count_text = st.empty()

        try:
            # Search main page
            progress_text.text("Analyzing main page...")
-            main_files = await self.extract_downloadable_files(
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")

            # Get and search sublinks
            progress_text.text("Getting sublinks...")
-            sublinks = await self.get_sublinks(
            total_links = len(sublinks)

            progress_text.text(f"Found {total_links} sublinks to process")
@@ -542,7 +629,14 @@ class DownloadManager:

                    # Set timeout for this sublink
                    async with async_timeout.timeout(timeout):
                        return sub_files
                except asyncio.TimeoutError:
                    logger.warning(f"Timeout processing sublink: {sublink}")
@@ -603,160 +697,169 @@ def main():
    st.title("Advanced File Downloader")

    if mode == "Manual URL":
        st.header("Manual URL Mode")
        url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")

-                    col1, col2 = st.columns([1, 4])
-                    with col1:
-                        if st.button("Select All", key="select_all_btn"):
-                            st.session_state.selected_files = list(range(len(files)))
-                            st.experimental_rerun()
-                        if st.button("Clear Selection", key="clear_selection_btn"):
-                            st.session_state.selected_files = []
-                            st.experimental_rerun()
-
-                    # File selection
-                    selected_files = st.multiselect(
-                        "Select files to download",
-                        options=list(range(len(files))),
-                        default=st.session_state.selected_files,
-                        format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
-                        key="file_multiselect"
-                    )
-
-                    # Update session state
-                    st.session_state.selected_files = selected_files

                    with col1:

-                            downloaded_paths = []
-                            progress_bar = st.progress(0)
-                            status_text = st.empty()

-                                progress_bar.empty()
-                                return downloaded_paths
-
-                            downloaded = asyncio.run(download_files())
-
-                            if downloaded:
-                                st.success(f"Successfully downloaded {len(downloaded)} files")
-
-                                if create_zip or upload_to_drive:
-                                    zip_path = create_zip_file(downloaded, download_dir)
-                                    st.success(f"Created ZIP file: {zip_path}")
-
-                                    if upload_to_drive and st.session_state.get('google_creds'):
-                                        with st.spinner("Uploading to Google Drive..."):
-                                            drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
-                                            if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                                st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                                            else:
-                                                st.error(drive_id)
-
-                                    if delete_after:
-                                        for path in downloaded:
-                                            try:
-                                                os.remove(path)
-                                            except Exception as e:
-                                                st.warning(f"Could not delete {path}: {e}")
-                                        st.info("Deleted original files after ZIP creation")
-                            else:
-                                st.warning("No files found.")

    # Display current files if they exist in session state
    elif st.session_state.discovered_files:
@@ -217,6 +217,7 @@ class DownloadManager:
        self.browser = None
        self.context = None
        self.page = None
+        self.base_domains = set() # Store base domains and their variations

    async def __aenter__(self):
        self.playwright = await async_playwright().start()
@@ -250,6 +251,63 @@ class DownloadManager:
        if self.playwright:
            await self.playwright.stop()

+    def get_base_domain(self, url):
+        """Extract base domain and add variations to self.base_domains"""
+        parsed = urlparse(url)
+        domain = parsed.netloc.split(':')[0] # Remove port if present
+
+        # Add the main domain and possible variations
+        base_parts = domain.split('.')
+        if len(base_parts) > 2:
+            main_domain = '.'.join(base_parts[-2:])
+            self.base_domains.add(main_domain)
+            # Add variations like files.domain.com for domain.com
+            self.base_domains.add(domain)
+
+            # Handle www and non-www versions
+            if base_parts[0] == 'www':
+                self.base_domains.add('.'.join(base_parts[1:]))
+            else:
+                self.base_domains.add(f"www.{domain}")
+        else:
+            self.base_domains.add(domain)
+
+        return domain
+
+    def is_related_domain(self, url):
+        """Check if URL belongs to any of the known domain variations"""
+        parsed = urlparse(url)
+        domain = parsed.netloc.split(':')[0]
+
+        # Check if this domain or any of its parts match our base domains
+        parts = domain.split('.')
+        for i in range(len(parts) - 1):
+            check_domain = '.'.join(parts[i:])
+            if check_domain in self.base_domains:
+                return True
+        return False
+
+    async def get_real_url(self, url):
+        """Follow redirects and get the final URL"""
+        try:
+            async with self.context.new_page() as page:
+                response = await page.goto(url, wait_until='networkidle', timeout=30000)
+                final_url = page.url
+
+                # Check for meta refresh redirects
+                content = await page.content()
+                soup = BeautifulSoup(content, 'html.parser')
+                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
+                if meta_refresh:
+                    content = meta_refresh.get('content', '')
+                    if 'url=' in content.lower():
+                        final_url = content.split('url=')[-1].strip("'").strip('"')
+
+                return final_url, response.headers if response else {}
+        except Exception as e:
+            logger.error(f"Error getting real URL for {url}: {e}")
+            return url, {}
+
    async def get_file_size(self, url):
        try:
            async with self.context.new_page() as page:
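For reference, the suffix-matching idea behind is_related_domain above can be checked in isolation. A minimal standalone sketch that mirrors the method rather than calling it, with made-up hostnames:

from urllib.parse import urlparse

def is_related(url, base_domains):
    # A URL counts as related if any dot-suffix of its hostname is a known base domain.
    host = urlparse(url).netloc.split(':')[0]
    parts = host.split('.')
    return any('.'.join(parts[i:]) in base_domains for i in range(len(parts) - 1))

known = {'example.com', 'www.example.com'}
print(is_related('https://files.example.com/a.pdf', known))  # True  (suffix example.com matches)
print(is_related('https://example.org/a.pdf', known))        # False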
@@ -280,40 +338,32 @@ class DownloadManager:
        except Exception:
            return {}

    async def extract_downloadable_files(self, url, custom_ext_list):
        found_files = []
        try:
+            # Follow redirects and get the final URL
+            final_url, headers = await self.get_real_url(url)
+
+            # Add this domain to our known domains
+            self.get_base_domain(final_url)
+
+            # Check if the URL itself is a file
+            content_type = headers.get('content-type', '').lower()
+            if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
+                found_files.append({
+                    'url': final_url,
+                    'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
+                    'size': await self.get_file_size(final_url),
+                    'metadata': {}
+                })
+                return found_files
+
+            # Load the page
+            await self.page.goto(final_url, timeout=30000, wait_until='networkidle')
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

+            # Define extensions to look for
            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
@@ -321,70 +371,73 @@
            parsed_base = urlparse(final_url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

+            # Find all links including those in scripts and other elements
+            links = set()
+            # Regular links
            for a in soup.find_all('a', href=True):
+                links.add(a['href'])
+            # Script-embedded links
+            scripts = soup.find_all('script')
+            for script in scripts:
+                if script.string:
+                    urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
+                    links.update(urls)
+
+            for href in links:
+                href = href.strip()

+                # Skip empty or javascript links
+                if not href or href.startswith(('javascript:', '#', 'mailto:')):
+                    continue
+
+                # Handle both direct file links and PHP/script downloads
+                if '.php' in href.lower() or 'download' in href.lower() or 'visit' in href.lower():
+                    try:
+                        # Convert to absolute URL if needed
+                        if not href.startswith(('http://', 'https://')):
+                            if href.startswith('/'):
+                                href = base_url + href
+                            else:
+                                href = base_url + '/' + href
+
+                        # Follow the link to get the real file
+                        real_url, real_headers = await self.get_real_url(href)
+
+                        # Check if it leads to a file
+                        content_type = real_headers.get('content-type', '').lower()
+                        if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
                            found_files.append({
+                                'url': real_url,
+                                'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                                'size': await self.get_file_size(real_url),
                                'metadata': {}
                            })
+                    except Exception as e:
+                        logger.error(f"Error processing PHP/script link {href}: {e}")
+                        continue
+
+                # Handle direct file links
+                elif any(href.lower().endswith(ext) for ext in all_exts):
+                    # Convert to absolute URL if needed
+                    if not href.startswith(('http://', 'https://')):
+                        if href.startswith('/'):
+                            href = base_url + href
+                        else:
+                            href = base_url + '/' + href
+
+                    # Verify if it's from a related domain
+                    if self.is_related_domain(href):
+                        size_str = await self.get_file_size(href)
+                        meta = {}
+                        if href.lower().endswith('.pdf'):
+                            meta = await self.get_pdf_metadata(href)
+
+                        found_files.append({
+                            'url': href,
+                            'filename': os.path.basename(href.split('?')[0]),
+                            'size': size_str,
+                            'metadata': meta
+                        })

            # Make results unique based on URLs
            seen_urls = set()
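The inline-script scrape above leans on a single regular expression; it can be exercised on its own with made-up script text to see what it does and does not catch (only quoted values directly following href=, url=, link= or src= are captured):

import re

pattern = r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']'
script_text = 'link.href="/docs/manual.pdf"; img.src=\'/img/logo.png\'; var u = getUrl();'
print(re.findall(pattern, script_text))  # ['/docs/manual.pdf', '/img/logo.png']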
@@ -395,11 +448,11 @@
                unique_files.append(f)

            return unique_files
+
        except Exception as e:
            logger.error(f"Error extracting files from {url}: {e}")
            return []
+    async def download_file(self, file_info, save_dir, referer):
        file_url = file_info['url']
        fname = file_info['filename']
        path = os.path.join(save_dir, fname)
@@ -412,11 +465,14 @@
        os.makedirs(save_dir, exist_ok=True)

        try:
+            # Get the real URL first
+            real_url, _ = await self.get_real_url(file_url)
+
+            if "drive.google.com" in real_url:
                import gdown
                try:
                    st.write(f"Downloading from Google Drive: {fname}")
+                    output = gdown.download(real_url, path, quiet=False)
                    if output:
                        return path
                    return None
@@ -433,7 +489,7 @@
                    'Referer': referer
                }

+                response = await page.request.get(real_url, headers=headers, timeout=30000)

                if response.status == 200:
                    content = await response.body()
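The Google Drive branch above hands the resolved link to gdown. As a standalone illustration of the same call pattern (FILE_ID is a placeholder and the output path is arbitrary):

import gdown

url = "https://drive.google.com/uc?export=download&id=FILE_ID"
output = "./downloads/file.bin"
path = gdown.download(url, output, quiet=False)  # returns the output path, or None on failure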
@@ -441,61 +497,86 @@
                        f.write(content)
                    return path
                else:
+                    logger.error(f"Download failed with status {response.status}: {real_url}")
                    return None

        except Exception as e:
            logger.error(f"Error downloading {file_url}: {e}")
            return None

    async def get_sublinks(self, url, limit=100):
        try:
+            # Get the real URL first
+            real_url, _ = await self.get_real_url(url)
+            await self.page.goto(real_url, timeout=30000)
+
+            # Wait for dynamic content
+            await self.page.wait_for_load_state('networkidle')
+
            content = await self.page.content()
            soup = BeautifulSoup(content, 'html.parser')

+            parsed_base = urlparse(real_url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+            current_path = os.path.dirname(parsed_base.path)

            links = set()
+
+            # Find links from various sources
+            # 1. Regular links
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
+                if href and not href.startswith(('javascript:', '#', 'mailto:')):
                    links.add(href)
+
+            # 2. Script-embedded links
+            scripts = soup.find_all('script')
+            for script in scripts:
+                if script.string:
+                    urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
+                    links.update(urls)
+
+            # 3. Form actions
+            forms = soup.find_all('form', action=True)
+            for form in forms:
+                links.add(form['action'])
+
+            # Process and clean links
+            clean_links = set()
+            for href in links:
+                try:
+                    # Skip empty links
+                    if not href.strip():
+                        continue
+
+                    # Convert to absolute URL
+                    if href.startswith('http'):
+                        full_url = href
+                    elif href.startswith('//'):
+                        full_url = parsed_base.scheme + ':' + href
+                    elif href.startswith('/'):
+                        full_url = base_url + href
+                    else:
+                        # Handle relative paths
+                        if current_path and current_path != '/':
+                            full_url = base_url + current_path + '/' + href
+                        else:
+                            full_url = base_url + '/' + href
+
+                    # Clean the URL
+                    full_url = full_url.split('#')[0] # Remove fragments

+                    # Only add if it's a related domain
+                    if self.is_related_domain(full_url):
+                        clean_links.add(full_url)
+
+                except Exception as e:
+                    logger.error(f"Error processing link {href}: {e}")
+                    continue
+
+            # Sort links for consistency
+            sorted_links = sorted(list(clean_links))
+            return sorted_links[:limit]

        except Exception as e:
            logger.error(f"Error getting sublinks: {e}")
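A side note on the relative-URL handling above: the manual base_url/current_path concatenation covers the common cases, while urllib.parse.urljoin handles the same cases plus ../ segments and protocol-relative links in one call, so it is a candidate for a later cleanup. A small comparison with invented paths:

from urllib.parse import urljoin

page = "https://example.com/files/index.php"
for href in ("report.pdf", "/downloads/a.zip", "//cdn.example.com/b.mp3", "../old/c.doc"):
    print(urljoin(page, href))
# https://example.com/files/report.pdf
# https://example.com/downloads/a.zip
# https://cdn.example.com/b.mp3
# https://example.com/old/c.doc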
@@ -510,15 +591,21 @@
        file_count_text = st.empty()

        try:
+            # Initialize base domains with the original URL
+            self.get_base_domain(url)
+
+            # Get the real initial URL
+            real_url, _ = await self.get_real_url(url)
+
            # Search main page
            progress_text.text("Analyzing main page...")
+            main_files = await self.extract_downloadable_files(real_url, custom_ext_list)
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")

            # Get and search sublinks
            progress_text.text("Getting sublinks...")
+            sublinks = await self.get_sublinks(real_url, limit=sublink_limit)
            total_links = len(sublinks)

            progress_text.text(f"Found {total_links} sublinks to process")
@@ -542,7 +629,14 @@

                    # Set timeout for this sublink
                    async with async_timeout.timeout(timeout):
+                        # Get real URL before processing
+                        real_sublink, _ = await self.get_real_url(sublink)
+                        sub_files = await self.extract_downloadable_files(real_sublink, custom_ext_list)
+
+                        if sub_files:
+                            logger.info(f"Found {len(sub_files)} files at {real_sublink}")
+                            st.write(f"Found {len(sub_files)} files at {real_sublink}")
+
                        return sub_files
                except asyncio.TimeoutError:
                    logger.warning(f"Timeout processing sublink: {sublink}")
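The per-sublink bound above comes from the async-timeout package; the pattern can be seen in a self-contained form (the 0.5 s and 2 s values are arbitrary):

import asyncio
import async_timeout

async def slow_task():
    await asyncio.sleep(2)
    return "done"

async def main():
    try:
        # Bound the whole await, exactly as the sublink loop does.
        async with async_timeout.timeout(0.5):
            return await slow_task()
    except asyncio.TimeoutError:
        return "timed out"

print(asyncio.run(main()))  # timed out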
@@ -603,160 +697,169 @@ def main():

    st.title("Advanced File Downloader")

+    # Sidebar
+    with st.sidebar:
+        # Mode Selection
+        mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
+
+        # Advanced Options
+        with st.expander("Advanced Options", expanded=True):
+            custom_extensions = st.text_input(
+                "Custom File Extensions",
+                placeholder=".csv, .txt, .epub",
+                key="custom_ext_input"
+            )
+            max_sublinks = st.number_input(
+                "Maximum Sublinks to Process",
+                min_value=1,
+                max_value=10000,
+                value=100,
+                step=50,
+                help="Maximum number of sublinks to process from the main page",
+                key="max_sublinks_input"
+            )
+            sublink_timeout = st.number_input(
+                "Search Timeout (seconds per sublink)",
+                min_value=1,
+                max_value=3000,
+                value=30,
+                step=5,
+                help="Maximum time to spend searching each sublink",
+                key="timeout_input"
+            )
+            use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
+            proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")

+        # Google Drive Integration
+        with st.expander("Google Drive Integration", expanded=False):
+            if st.button("Start Google Sign-In", key="google_signin_btn"):
+                auth_url = get_google_auth_url()
+                st.markdown(f"[Click here to authorize]({auth_url})")
+
+            auth_code = st.text_input("Enter authorization code", key="auth_code_input")
+            if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
+                creds, msg = exchange_code_for_credentials(auth_code)
+                st.session_state.google_creds = creds
+                st.write(msg)
+
# Main content area
|
746 |
if mode == "Manual URL":
|
747 |
st.header("Manual URL Mode")
|
748 |
url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
|
749 |
|
750 |
+
col1, col2 = st.columns([3, 1])
|
751 |
+
with col1:
|
752 |
+
if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
|
753 |
+
if url:
|
754 |
+
async def run_deep_search():
|
755 |
+
try:
|
756 |
+
async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
|
757 |
+
files = await dm.deep_search(
|
758 |
+
url=url,
|
759 |
+
custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
|
760 |
+
sublink_limit=int(max_sublinks),
|
761 |
+
timeout=int(sublink_timeout)
|
762 |
+
)
|
763 |
+
if files:
|
764 |
+
st.session_state.discovered_files = files
|
765 |
+
st.session_state.current_url = url
|
766 |
+
return files
|
767 |
+
except Exception as e:
|
768 |
+
st.error(f"Error during deep search: {str(e)}")
|
769 |
+
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
770 |
|
771 |
+
files = asyncio.run(run_deep_search())
|
772 |
+
if files:
|
773 |
+
st.success(f"Found {len(files)} files!")
|
774 |
+
|
775 |
+
# Select All/Clear Selection buttons
|
776 |
+
col1, col2 = st.columns([1, 4])
|
777 |
with col1:
|
778 |
+
if st.button("Select All", key="select_all_btn"):
|
779 |
+
st.session_state.selected_files = list(range(len(files)))
|
780 |
+
st.experimental_rerun()
|
781 |
+
if st.button("Clear Selection", key="clear_selection_btn"):
|
782 |
+
st.session_state.selected_files = []
|
783 |
+
st.experimental_rerun()
|
784 |
+
|
785 |
+
# File selection
|
786 |
+
selected_files = st.multiselect(
|
787 |
+
"Select files to download",
|
788 |
+
options=list(range(len(files))),
|
789 |
+
default=st.session_state.selected_files,
|
790 |
+
format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
|
791 |
+
key="file_multiselect"
|
792 |
+
)
|
793 |
+
|
794 |
+
# Update session state
|
795 |
+
st.session_state.selected_files = selected_files
|
796 |
|
797 |
+
if selected_files:
|
798 |
+
col1, col2, col3, col4 = st.columns(4)
|
799 |
+
with col1:
|
800 |
+
download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
|
801 |
+
with col2:
|
802 |
+
create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
|
803 |
+
with col3:
|
804 |
+
delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
|
805 |
+
with col4:
|
806 |
+
upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
|
807 |
+
|
808 |
+
if st.button("Download Selected", key="download_btn"):
|
809 |
+
if not os.path.exists(download_dir):
|
810 |
+
os.makedirs(download_dir)
|
811 |
+
|
812 |
+
async def download_files():
|
813 |
+
downloaded_paths = []
|
814 |
+
progress_bar = st.progress(0)
|
815 |
+
status_text = st.empty()
|
816 |
+
|
817 |
+
async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
|
818 |
+
for i, idx in enumerate(selected_files):
|
819 |
+
progress = (i + 1) / len(selected_files)
|
820 |
+
file_info = files[idx]
|
821 |
+
|
822 |
+
status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
|
823 |
+
progress_bar.progress(progress)
|
824 |
+
|
825 |
+
path = await dm.download_file(
|
826 |
+
file_info,
|
827 |
+
download_dir,
|
828 |
+
url
|
829 |
+
)
|
830 |
+
if path:
|
831 |
+
downloaded_paths.append(path)
|
832 |
+
|
833 |
+
status_text.empty()
|
834 |
+
progress_bar.empty()
|
835 |
+
return downloaded_paths
|
836 |
|
837 |
+
downloaded = asyncio.run(download_files())
|
|
|
|
|
|
|
838 |
|
839 |
+
if downloaded:
|
840 |
+
st.success(f"Successfully downloaded {len(downloaded)} files")
|
841 |
+
|
842 |
+
if create_zip or upload_to_drive:
|
843 |
+
zip_path = create_zip_file(downloaded, download_dir)
|
844 |
+
st.success(f"Created ZIP file: {zip_path}")
|
845 |
|
846 |
+
if upload_to_drive and st.session_state.get('google_creds'):
|
847 |
+
with st.spinner("Uploading to Google Drive..."):
|
848 |
+
drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
|
849 |
+
if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
|
850 |
+
st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
|
851 |
+
else:
|
852 |
+
st.error(drive_id)
|
853 |
|
854 |
+
if delete_after:
|
855 |
+
for path in downloaded:
|
856 |
+
try:
|
857 |
+
os.remove(path)
|
858 |
+
except Exception as e:
|
859 |
+
st.warning(f"Could not delete {path}: {e}")
|
860 |
+
st.info("Deleted original files after ZIP creation")
|
861 |
+
else:
|
862 |
+
st.warning("No files found.")
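create_zip_file and google_drive_upload are defined elsewhere in app.py and are not touched by this diff. Purely for orientation, a minimal stand-in consistent with how create_zip_file is called above (a list of downloaded paths plus the download directory, returning the archive path) could look like this; the real helper may differ:

import os
import zipfile

def create_zip_file(file_paths, output_dir):
    # Hypothetical sketch of the helper used above, not the actual app.py implementation.
    zip_path = os.path.join(output_dir, "downloads.zip")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for p in file_paths:
            zf.write(p, arcname=os.path.basename(p))
    return zip_path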

    # Display current files if they exist in session state
    elif st.session_state.discovered_files: