Update app.py
app.py (CHANGED)
@@ -26,6 +26,18 @@ import google_auth_oauthlib.flow
 import googleapiclient.discovery
 import google.auth.transport.requests
 from async_timeout import timeout as async_timeout
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+import schedule
+import threading
+import time
+import hashlib
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+from sklearn.cluster import KMeans
+import numpy as np
+
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     filename='advanced_download_log.txt',
@@ -33,7 +45,7 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
-
+
 GOOGLE_OAUTH_CONFIG = {
     "web": {
         "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
@@ -49,49 +61,22 @@ GOOGLE_OAUTH_CONFIG = {
 # Playwright Setup
 def install_playwright_dependencies():
     os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
-
-
-
-
-
-
-
-
-        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
-        os.makedirs('/usr/lib/playwright', exist_ok=True)
-        symlinks = {
-            'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
-            'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
-            'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
-            'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
-            'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
-            'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
-            'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
-            'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
-            'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
-            'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
-        }
-        for link_name, target in symlinks.items():
-            link_path = os.path.join('/usr/lib/playwright', link_name)
-            if not os.path.exists(link_path):
-                os.symlink(target, link_path)
-        subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
-        browser_path = os.path.expanduser("~/.cache/ms-playwright")
-        os.makedirs(browser_path, exist_ok=True)
-        subprocess.run(['chmod', '-R', '755', browser_path], check=True)
-    except subprocess.CalledProcessError as e:
-        print(f"Error installing dependencies: {e}")
-    except Exception as e:
-        print(f"Error: {e}")
+    subprocess.run(['apt-get', 'update', '-y'], check=True)
+    packages = [
+        'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
+        'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
+        'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
+    ]
+    subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
+    subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
 
-# Initialize Playwright
 install_playwright_dependencies()
 
 # Model Loading
 @st.cache_resource
 def load_models():
     try:
-        #
+        # Load spaCy model
         try:
             nlp = spacy.load("en_core_web_sm")
         except OSError:
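Note: the rewritten install_playwright_dependencies() shells out to apt-get unconditionally, which only succeeds as root on a Debian/Ubuntu base image. A minimal guarded variant could look like the sketch below (the permission and availability checks are assumptions, not part of this commit):

import os
import shutil
import subprocess

def install_playwright_dependencies_safe():
    """Sketch: only attempt apt-get when it exists and we are root."""
    os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
    if shutil.which('apt-get') and os.geteuid() == 0:
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends',
                        'libnss3', 'libnspr4', 'libatk1.0-0', 'libatk-bridge2.0-0',
                        'libcups2', 'libxcomposite1', 'libxdamage1', 'libgbm1'], check=True)
    # Downloading the browser itself does not require root.
    subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)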
@@ -99,43 +84,26 @@ def load_models():
             spacy.cli.download("en_core_web_sm")
             nlp = spacy.load("en_core_web_sm")
 
-        # Load SentenceTransformer
+        # Load SentenceTransformer
         try:
-
-            model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
-            cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
-            if os.path.exists(os.path.join(cache_dir, model_name)):
-                semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
-            else:
-                st.warning(f"Downloading SentenceTransformer model {model_name}...")
-                semantic_model = SentenceTransformer(model_name)
+            semantic_model = SentenceTransformer('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')
         except Exception as e:
             st.error(f"Error loading SentenceTransformer: {e}")
             semantic_model = None
 
-        # Load Transformers pipeline
+        # Load Transformers pipeline
         try:
-
-            model_name = "facebook/bart-large-cnn"
-            cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
-            if os.path.exists(os.path.join(cache_dir, model_name)):
-                summarizer = pipeline("summarization", model=model_name)
-            else:
-                st.warning(f"Downloading Transformer model {model_name}...")
-                summarizer = pipeline("summarization")
+            summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         except Exception as e:
             st.error(f"Error loading Transformers: {e}")
             summarizer = None
 
         return nlp, semantic_model, summarizer
-
     except Exception as e:
         st.error(f"Error loading models: {e}")
         return None, None, None
 
-
-with st.spinner("Loading models..."):
-    nlp_model, semantic_model, summarizer = load_models()
+nlp_model, semantic_model, summarizer = load_models()
 
 # Utility Functions
 def get_random_user_agent():
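Note: 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B' is a distilled chat model rather than a sentence-embedding checkpoint, so loading it through SentenceTransformer may be slow or fail and the try/except will silently set semantic_model to None. A hedged fallback sketch (the fallback model choice is an assumption, not part of this commit):

from sentence_transformers import SentenceTransformer

def load_semantic_model(preferred='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
                        fallback='sentence-transformers/all-MiniLM-L6-v2'):
    """Sketch: try the configured model, fall back to a small embedding model."""
    try:
        return SentenceTransformer(preferred)
    except Exception:
        return SentenceTransformer(fallback)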
@@ -157,11 +125,9 @@ def sizeof_fmt(num, suffix='B'):
 def create_zip_file(file_paths, output_dir):
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
     zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
-
     with zipfile.ZipFile(zip_path, 'w') as zipf:
         for file_path in file_paths:
             zipf.write(file_path, os.path.basename(file_path))
-
     return zip_path
 
 # Google Drive Functions
@@ -197,16 +163,23 @@ def exchange_code_for_credentials(auth_code):
     except Exception as e:
         return None, f"Error during token exchange: {e}"
 
-def google_drive_upload(
+def google_drive_upload(file_path, credentials, folder_id=None):
     try:
         drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
-        file_metadata = {'name': os.path.basename(
-
+        file_metadata = {'name': os.path.basename(file_path)}
+        if folder_id:
+            file_metadata['parents'] = [folder_id]
+        media = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
         created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
         return created.get("id", "")
     except Exception as e:
         return f"Error uploading to Drive: {str(e)}"
-
+
+def create_drive_folder(drive_service, name):
+    folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
+    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
+    return folder.get('id')
+
 # DownloadManager Class
 class DownloadManager:
     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
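Note: with the new folder_id parameter the two Drive helpers compose. The upload path also relies on googleapiclient.http.MediaFileUpload, so googleapiclient.http must be importable (whether it is imported elsewhere in app.py is not visible in this diff). A hedged usage sketch, assuming valid credentials in creds:

# Sketch: create a folder, then upload a ZIP into it (credential acquisition elided).
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=creds)
folder_id = create_drive_folder(drive_service, "Downloads_example.com")
file_id = google_drive_upload("./downloads/downloads_20240101_120000.zip", creds, folder_id)
print("Uploaded file id:", file_id)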
@@ -234,7 +207,6 @@ class DownloadManager:
             }
             if self.use_proxy and self.proxy:
                 opts["proxy"] = {"server": self.proxy}
-
             self.browser = await self.playwright.chromium.launch(**opts)
             self.context = await self.browser.new_context(user_agent=get_random_user_agent())
             self.page = await self.context.new_page()
@@ -257,14 +229,11 @@ class DownloadManager:
             search_url = f"https://www.bing.com/search?q={self.query}"
             await self.page.goto(search_url, timeout=30000)
             await self.page.wait_for_load_state('networkidle')
-
-            # Extract search result links
             links = await self.page.query_selector_all("li.b_algo h2 a")
             for link in links[:self.num_results]:
                 href = await link.get_attribute('href')
                 if href:
                     urls.append(href)
-
             return urls
         except Exception as e:
             logger.error(f"Error searching Bing: {e}")
@@ -335,7 +304,8 @@ class DownloadManager:
             soup = BeautifulSoup(content, 'html.parser')
 
             default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
-
+                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
+                            '.pptx', '.odt', '.txt']
             all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
 
             parsed_base = urlparse(final_url)
@@ -344,11 +314,8 @@ class DownloadManager:
             for a in soup.find_all('a', href=True):
                 href = a['href'].strip()
 
-                # Handle PHP scripts and redirects
                 if '.php' in href.lower() or 'download' in href.lower():
-                    full_url = href if href.startswith('http') else (
-                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
-                    )
+                    full_url = href if href.startswith('http') else f"{base_url}{href}"
                     real_url = await self.extract_real_download_url(full_url)
                     if real_url and real_url != full_url:
                         found_files.append({
@@ -359,17 +326,12 @@ class DownloadManager:
                         })
                         continue
 
-                # Handle direct file links
                 if any(href.lower().endswith(ext) for ext in all_exts):
-                    file_url = href if href.startswith('http') else (
-                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
-                    )
-
+                    file_url = href if href.startswith('http') else f"{base_url}{href}"
                     size_str = await self.get_file_size(file_url)
                     meta = {}
                     if file_url.lower().endswith('.pdf'):
                         meta = await self.get_pdf_metadata(file_url)
-
                     found_files.append({
                         'url': file_url,
                         'filename': os.path.basename(file_url.split('?')[0]),
@@ -385,7 +347,6 @@ class DownloadManager:
                     if match:
                         file_id = match.group(1)
                         break
-
                 if file_id:
                     direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
                     filename = file_id
@@ -396,7 +357,6 @@ class DownloadManager:
                             mt = re.search(r'filename\*?="?([^";]+)', cd)
                             if mt:
                                 filename = mt.group(1).strip('"').strip()
-
                     found_files.append({
                         'url': direct_url,
                         'filename': filename,
@@ -406,14 +366,12 @@ class DownloadManager:
             except Exception as e:
                 logger.error(f"Error processing Google Drive link: {e}")
 
-            # Make results unique based on URLs
             seen_urls = set()
             unique_files = []
             for f in found_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
-
             return unique_files
         except Exception as e:
             logger.error(f"Error extracting files from {url}: {e}")
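Note: this URL-based de-duplication is repeated verbatim in deep_search below; a small order-preserving helper would let both call sites share one implementation. A sketch (helper name is hypothetical, not in this commit):

def dedupe_by_url(files):
    """Keep the first occurrence of each 'url', preserving order."""
    seen = set()
    unique = []
    for f in files:
        if f['url'] not in seen:
            seen.add(f['url'])
            unique.append(f)
    return unique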
@@ -428,106 +386,29 @@ class DownloadManager:
         while os.path.exists(path):
             path = os.path.join(save_dir, f"{base}_{counter}{ext}")
             counter += 1
-
         os.makedirs(save_dir, exist_ok=True)
-
         try:
             if "drive.google.com" in file_url:
                 import gdown
-
-
-
-
-                if not ext or ext == "":
-                    # Try to determine file type from content-type header
-                    async with self.context.new_page() as page:
-                        response = await page.request.head(file_url, timeout=15000)
-                        content_type = response.headers.get('Content-Type', '')
-
-                        # Map content types to extensions
-                        extension_map = {
-                            'application/pdf': '.pdf',
-                            'image/jpeg': '.jpg',
-                            'image/png': '.png',
-                            'application/msword': '.doc',
-                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
-                            'application/zip': '.zip',
-                            'text/plain': '.txt',
-                            'application/vnd.ms-excel': '.xls',
-                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
-                            'video/mp4': '.mp4',
-                            'audio/mpeg': '.mp3',
-                            'video/x-msvideo': '.avi',
-                            'video/x-matroska': '.mkv'
-                        }
-
-                        # Get extension from content type or use .bin as fallback
-                        ext = extension_map.get(content_type.split(';')[0], '.bin')
-                        path = os.path.join(save_dir, f"{base}{ext}")
-
-                    # Handle name collisions
-                    counter = 1
-                    while os.path.exists(path):
-                        path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-                        counter += 1
-
-                    output = gdown.download(file_url, path, quiet=False)
-                    if output:
-                        return path
-                    return None
-                except Exception as e:
-                    logger.error(f"Google Drive download error: {e}")
-                    return None
-
+                output = gdown.download(file_url, path, quiet=False)
+                if output:
+                    return path
+                return None
             async with self.context.new_page() as page:
-                st.write(f"Downloading: {fname}")
-
                 headers = {
                     'Accept': '*/*',
                     'Accept-Encoding': 'gzip, deflate, br',
                     'Referer': referer
                 }
-
                 response = await page.request.get(file_url, headers=headers, timeout=30000)
-
                 if response.status == 200:
                     content = await response.body()
-
-                    # Check if we need to add an extension based on content type
-                    if not ext or ext == "":
-                        content_type = response.headers.get('Content-Type', '')
-                        extension_map = {
-                            'application/pdf': '.pdf',
-                            'image/jpeg': '.jpg',
-                            'image/png': '.png',
-                            'application/msword': '.doc',
-                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
-                            'application/zip': '.zip',
-                            'text/plain': '.txt',
-                            'application/vnd.ms-excel': '.xls',
-                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
-                            'video/mp4': '.mp4',
-                            'audio/mpeg': '.mp3',
-                            'video/x-msvideo': '.avi',
-                            'video/x-matroska': '.mkv'
-                        }
-
-                        ext = extension_map.get(content_type.split(';')[0], '.bin')
-                        path = os.path.join(save_dir, f"{base}{ext}")
-
-                        # Handle name collisions again
-                        counter = 1
-                        while os.path.exists(path):
-                            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-                            counter += 1
-
                     with open(path, 'wb') as f:
                         f.write(content)
                     return path
                 else:
                     logger.error(f"Download failed with status {response.status}: {file_url}")
                     return None
-
         except Exception as e:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
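Note: the rewritten Google Drive branch passes the page URL straight to gdown.download. gdown is usually happier with an explicit file id, or with fuzzy matching for share links (available in recent gdown releases). A hedged sketch of a more tolerant call, not part of this commit:

import re
import gdown

def download_drive_file(file_url, path):
    """Sketch: prefer an explicit Drive file id, fall back to fuzzy URL matching."""
    m = re.search(r'/file/d/([^/]+)|[?&]id=([^&]+)', file_url)
    file_id = next((g for g in (m.groups() if m else ()) if g), None)
    if file_id:
        return gdown.download(id=file_id, output=path, quiet=False)
    return gdown.download(url=file_url, output=path, quiet=False, fuzzy=True)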
@@ -535,65 +416,45 @@ class DownloadManager:
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
         if not custom_ext_list:
             custom_ext_list = []
-
         progress_text = st.empty()
         progress_bar = st.progress(0)
         file_count_text = st.empty()
-
         try:
-            # Search main page
             progress_text.text("Analyzing main page...")
             main_files = await self.extract_downloadable_files(url, custom_ext_list)
             initial_count = len(main_files)
             file_count_text.text(f"Found {initial_count} files on main page")
-
-            # Get and search sublinks
             progress_text.text("Getting sublinks...")
             sublinks = await self.get_sublinks(url, sublink_limit)
             total_links = len(sublinks)
-
             progress_text.text(f"Found {total_links} sublinks to process")
-
             if not sublinks:
                 progress_bar.progress(1.0)
                 return main_files
-
-            # Process sublinks
             all_files = main_files
-
             for i, sublink in enumerate(sublinks, 1):
-                progress = i/total_links
+                progress = i / total_links
                 progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                 progress_bar.progress(progress)
-
                 sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                 all_files.extend(sub_files)
-
-                # Update count in real-time
                 file_count_text.text(f"Found {len(all_files)} total files")
-
-            # Make results unique
             seen_urls = set()
             unique_files = []
-
             for f in all_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
-
             final_count = len(unique_files)
             progress_text.text(f"Deep search complete!")
             file_count_text.text(f"Found {final_count} unique files")
             progress_bar.progress(1.0)
-
             return unique_files
-
         except Exception as e:
             logger.error(f"Deep search error: {e}")
             progress_text.text(f"Error during deep search: {str(e)}")
             return []
         finally:
-            # Clean up progress indicators after a delay
             await asyncio.sleep(2)
             if not st.session_state.get('keep_progress', False):
                 progress_text.empty()
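Note: deep_search still accepts a timeout argument but never applies it to the per-sublink work. A sketch of enforcing it with asyncio.wait_for inside the loop (skipping the sublink on timeout is an assumption about the desired behaviour):

# Inside the sublink loop, a per-link timeout could look like this:
try:
    sub_files = await asyncio.wait_for(
        self.extract_downloadable_files(sublink, custom_ext_list),
        timeout=timeout
    )
    all_files.extend(sub_files)
except asyncio.TimeoutError:
    logger.warning(f"Timed out after {timeout}s on sublink: {sublink}")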
@@ -604,10 +465,8 @@ class DownloadManager:
             await self.page.goto(url, timeout=30000)
             content = await self.page.content()
             soup = BeautifulSoup(content, 'html.parser')
-
             parsed_base = urlparse(url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-
             links = set()
             for a in soup.find_all('a', href=True):
                 href = a['href'].strip()
@@ -615,56 +474,48 @@ class DownloadManager:
                     links.add(href)
                 elif href.startswith('/'):
                     links.add(f"{base_url}{href}")
-
             return list(links)[:limit]
-
         except Exception as e:
             logger.error(f"Error getting sublinks: {e}")
             return []
 
-
-
-
-
+# Utility Functions for New Features
+def extract_keywords(text, n=5):
+    doc = nlp_model(text)
+    keywords = [token.text for token in doc if token.is_alpha and not token.is_stop][:n]
+    return keywords
+
+def analyze_sentiment(text):
+    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+    result = sentiment_analyzer(text[:512])[0]
+    return result['label'], result['score']
+
+def get_file_hash(file_path):
+    hasher = hashlib.md5()
+    with open(file_path, 'rb') as f:
+        hasher.update(f.read())
+    return hasher.hexdigest()
+
+# Main Function
 def main():
-    # Initialize session state on first run
     if 'initialized' not in st.session_state:
         st.session_state.initialized = True
         st.session_state.discovered_files = []
         st.session_state.current_url = None
         st.session_state.google_creds = None
         st.session_state.selected_files = []
-        st.session_state.do_deep_search = False
-        st.session_state.deep_search_url = None
-        st.session_state.search_results = []
+        st.session_state.do_deep_search = False
+        st.session_state.deep_search_url = None
+        st.session_state.search_results = []
 
     st.title("Advanced File Downloader")
-
-    # Sidebar configuration
+
     with st.sidebar:
         mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
         with st.expander("Advanced Options", expanded=True):
-            custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input")
-            max_sublinks = st.number_input(
-
-                min_value=1,
-                max_value=100000,
-                value=10000,
-                step=50,
-                help="Maximum number of sublinks to process from the main page",
-                key="max_sublinks_input"
-            )
-            sublink_timeout = st.number_input(
-                "Search Timeout (seconds per sublink)",
-                min_value=1,
-                max_value=3000,
-                value=30,
-                step=5,
-                help="Maximum time to spend searching each sublink",
-                key="timeout_input"
-            )
+            custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
+            max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
+            sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
             use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
             proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
         with st.expander("Google Drive Integration", expanded=False):
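Note: get_file_hash is added here but not yet wired into the download flow. One plausible use (an assumption, not something this commit does) is skipping files whose MD5 already exists in the download directory:

def is_duplicate_download(new_path, download_dir):
    """Sketch: compare a new file's MD5 against files already in download_dir."""
    new_hash = get_file_hash(new_path)
    for name in os.listdir(download_dir):
        existing = os.path.join(download_dir, name)
        if existing != new_path and os.path.isfile(existing):
            if get_file_hash(existing) == new_hash:
                return True
    return False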
@@ -676,158 +527,54 @@ def main():
                 creds, msg = exchange_code_for_credentials(auth_code)
                 st.session_state.google_creds = creds
                 st.write(msg)
-
-    # Manual URL mode
+
     if mode == "Manual URL":
         st.header("Manual URL Mode")
         url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
-
         col1, col2 = st.columns([3, 1])
         with col1:
             if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
                 if url:
+                    custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+                    valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
+                    if custom_ext_list != valid_ext_list:
+                        st.warning("Invalid extensions ignored. Use format like '.csv'.")
                     async def run_deep_search():
-
-
-
-                                url=url,
-                                custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
-                                sublink_limit=int(max_sublinks),
-                                timeout=int(sublink_timeout)
-                            )
-                            return files
-                        except Exception as e:
-                            st.error(f"Error during deep search: {str(e)}")
-                            return None
-
+                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
+                            files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
+                            return files
                     files = asyncio.run(run_deep_search())
                     if files:
-                        # Save all discovered files—even duplicates
                         st.session_state.discovered_files = files
                         st.session_state.current_url = url
                         st.success(f"Found {len(files)} files!")
-
-                        # File selection block (Select/Clear)
-                        col1, col2 = st.columns([1, 4])
-                        with col1:
-                            if st.button("Select All", key="select_all_btn"):
-                                st.session_state.selected_files = list(range(len(files)))
-                                safe_rerun()
-                            if st.button("Clear Selection", key="clear_selection_btn"):
-                                st.session_state.selected_files = []
-                                safe_rerun()
-
-                        selected_files = st.multiselect(
-                            "Select files to download",
-                            options=list(range(len(files))),
-                            default=st.session_state.selected_files,
-                            format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
-                            key="file_multiselect"
-                        )
-                        st.session_state.selected_files = selected_files
-
-                        if selected_files:
-                            col1, col2, col3, col4 = st.columns(4)
-                            with col1:
-                                download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
-                            with col2:
-                                create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
-                            with col3:
-                                delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
-                            with col4:
-                                upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
-
-                            if st.button("Download Selected", key="download_btn"):
-                                if not os.path.exists(download_dir):
-                                    os.makedirs(download_dir)
-
-                                async def download_files():
-                                    downloaded_paths = []
-                                    progress_bar = st.progress(0)
-                                    status_text = st.empty()
-                                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                                        for i, idx in enumerate(selected_files):
-                                            progress = (i + 1) / len(selected_files)
-                                            file_info = files[idx]
-                                            status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
-                                            progress_bar.progress(progress)
-                                            # Download the file (ensure DownloadManager.download_file downloads duplicates)
-                                            path = await dm.download_file(file_info, download_dir, url)
-                                            if path:
-                                                downloaded_paths.append(path)
-                                    status_text.empty()
-                                    progress_bar.empty()
-                                    return downloaded_paths
-
-                                downloaded = asyncio.run(download_files())
-
-                                if downloaded:
-                                    st.success(f"Successfully downloaded {len(downloaded)} files")
-                                    # If the user chose to create a ZIP, generate it and offer a download button
-                                    if create_zip:
-                                        zip_path = create_zip_file(downloaded, download_dir)
-                                        st.success(f"Created ZIP file: {zip_path}")
-                                        with open(zip_path, "rb") as f:
-                                            zip_data = f.read()
-                                        st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
-
-                                        if upload_to_drive and st.session_state.get('google_creds'):
-                                            with st.spinner("Uploading to Google Drive..."):
-                                                drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
-                                            if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                                st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                                            else:
-                                                st.error(drive_id)
-                                        if delete_after:
-                                            for path in downloaded:
-                                                try:
-                                                    os.remove(path)
-                                                except Exception as e:
-                                                    st.warning(f"Could not delete {path}: {e}")
-                                            st.info("Deleted original files after ZIP creation")
-                                    else:
-                                        # Otherwise, generate an individual download button for each file
-                                        for path in downloaded:
-                                            with open(path, "rb") as f:
-                                                file_data = f.read()
-                                            st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
                     else:
                         st.warning("No files found.")
-
-        # If files were discovered in a previous search, show them here as well.
+
         if st.session_state.discovered_files:
             files = st.session_state.discovered_files
             st.success(f"Found {len(files)} files!")
             col1, col2 = st.columns([1, 4])
             with col1:
-                if st.button("Select All", key="
+                if st.button("Select All", key="select_all_btn"):
                     st.session_state.selected_files = list(range(len(files)))
-
-                if st.button("Clear Selection", key="clear_selection_btn2"):
+                if st.button("Clear Selection", key="clear_selection_btn"):
                     st.session_state.selected_files = []
-
-            selected_files = st.multiselect(
-                "Select files to download",
-                options=list(range(len(files))),
-                default=st.session_state.selected_files,
-                format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
-                key="file_multiselect2"
-            )
+            selected_files = st.multiselect("Select files to download", options=list(range(len(files))), default=st.session_state.selected_files, format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})", key="file_multiselect")
             st.session_state.selected_files = selected_files
             if selected_files:
                 col1, col2, col3, col4 = st.columns(4)
                 with col1:
-                    download_dir = st.text_input("Download Directory", value="./downloads", key="
+                    download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
                 with col2:
-                    create_zip = st.checkbox("Create ZIP file", value=True, key="
+                    create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
                 with col3:
-                    delete_after = st.checkbox("Delete after creating ZIP", key="
+                    delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
                 with col4:
-                    upload_to_drive = st.checkbox("Upload to Google Drive", key="
+                    upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
-                if st.button("Download Selected", key="
+                if st.button("Download Selected", key="download_btn"):
                     if not os.path.exists(download_dir):
                         os.makedirs(download_dir)
-
                     async def download_files():
                         downloaded_paths = []
                         progress_bar = st.progress(0)
@@ -838,7 +585,7 @@ def main():
                                 file_info = files[idx]
                                 status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
                                 progress_bar.progress(progress)
-                                path = await dm.download_file(file_info, download_dir,
+                                path = await dm.download_file(file_info, download_dir, url)
                                 if path:
                                     downloaded_paths.append(path)
                         status_text.empty()
@@ -853,13 +600,14 @@ def main():
                             with open(zip_path, "rb") as f:
                                 zip_data = f.read()
                             st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
-                            if upload_to_drive and st.session_state.
-
-
-
-
-
-
+                            if upload_to_drive and st.session_state.google_creds:
+                                drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds)
+                                folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}")
+                                drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id)
+                                if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
+                                    st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
+                                else:
+                                    st.error(drive_id)
                             if delete_after:
                                 for path in downloaded:
                                     try:
@@ -872,163 +620,29 @@ def main():
                             with open(path, "rb") as f:
                                 file_data = f.read()
                             st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
-
+
     elif mode == "Bing Search":
         st.header("Bing Search Mode")
         query = st.text_input("Enter search query", key="search_query_input")
        num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
-
-        # Check if deep search was requested
-        if st.session_state.get('do_deep_search', False):
-            url_to_search = st.session_state.get('deep_search_url')
-            st.write(f"Running deep search on: {url_to_search}")
-
-            async def perform_deep_search():
-                async with DownloadManager(
-                    use_proxy=use_proxy,
-                    proxy=proxy
-                ) as dm:
-                    files = await dm.deep_search(
-                        url=url_to_search,
-                        custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
-                        sublink_limit=max_sublinks,
-                        timeout=sublink_timeout
-                    )
-                    if files:
-                        st.session_state.discovered_files = files
-                        st.session_state.current_url = url_to_search
-                        st.session_state.selected_files = []
-                    else:
-                        st.warning("No files found on this page.")
-
-            # Clear the deep search flag after execution
-            st.session_state.do_deep_search = False
-
-            asyncio.run(perform_deep_search())
-
         if st.button("Search", key="search_btn"):
             if query:
                 async def run_search():
-                    async with DownloadManager(
-                        use_proxy=use_proxy,
-                        proxy=proxy,
-                        query=query,
-                        num_results=num_results
-                    ) as dm:
+                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy, query=query, num_results=num_results) as dm:
                         with st.spinner("Searching..."):
                             urls = await dm.search_bing()
                             if urls:
-                                st.session_state.search_results = urls
+                                st.session_state.search_results = urls
                                 st.success(f"Found {len(urls)} results!")
                                 for i, url in enumerate(urls, 1):
                                     with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
                                         if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
-                                            st.session_state.deep_search_url = url
-                                            st.session_state.do_deep_search = True
-                                            safe_rerun() # Rerun to apply state change
+                                            st.session_state.deep_search_url = url
+                                            st.session_state.do_deep_search = True
                             else:
                                 st.warning("No search results found.")
                 asyncio.run(run_search())
-
-        # Display search results if they exist
-        if hasattr(st.session_state, 'search_results') and st.session_state.search_results:
-            urls = st.session_state.search_results
-            st.success(f"Found {len(urls)} results!")
-            for i, url in enumerate(urls, 1):
-                with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
-                    if st.button(f"Deep Search Result {i}", key=f"deep_search_result_saved_{i}"):
-                        st.session_state.deep_search_url = url
-                        st.session_state.do_deep_search = True
-                        safe_rerun()
-
-        # If files were discovered in a previous search, show them
-        if st.session_state.discovered_files:
-            files = st.session_state.discovered_files
-            st.success(f"Found {len(files)} files on {st.session_state.current_url}!")
-
-            # File selection and download UI
-            col1, col2 = st.columns([1, 4])
-            with col1:
-                if st.button("Select All", key="bing_select_all_btn"):
-                    st.session_state.selected_files = list(range(len(files)))
-                    safe_rerun()
-                if st.button("Clear Selection", key="bing_clear_selection_btn"):
-                    st.session_state.selected_files = []
-                    safe_rerun()
-
-            selected_files = st.multiselect(
-                "Select files to download",
-                options=list(range(len(files))),
-                default=st.session_state.selected_files,
-                format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})",
-                key="bing_file_multiselect"
-            )
-            st.session_state.selected_files = selected_files
-
-            if selected_files:
-                col1, col2, col3, col4 = st.columns(4)
-                with col1:
-                    download_dir = st.text_input("Download Directory", value="./downloads", key="bing_download_dir_input")
-                with col2:
-                    create_zip = st.checkbox("Create ZIP file", value=True, key="bing_create_zip_checkbox")
-                with col3:
-                    delete_after = st.checkbox("Delete after creating ZIP", key="bing_delete_after_checkbox")
-                with col4:
-                    upload_to_drive = st.checkbox("Upload to Google Drive", key="bing_upload_drive_checkbox")
-
-                if st.button("Download Selected", key="bing_download_btn"):
-                    if not os.path.exists(download_dir):
-                        os.makedirs(download_dir)
-
-                    async def download_files():
-                        downloaded_paths = []
-                        progress_bar = st.progress(0)
-                        status_text = st.empty()
-                        async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
-                            for i, idx in enumerate(selected_files):
-                                progress = (i + 1) / len(selected_files)
-                                file_info = files[idx]
-                                status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
-                                progress_bar.progress(progress)
-                                path = await dm.download_file(file_info, download_dir, st.session_state.current_url)
-                                if path:
-                                    downloaded_paths.append(path)
-                        status_text.empty()
-                        progress_bar.empty()
-                        return downloaded_paths
-
-                    downloaded = asyncio.run(download_files())
-
-                    if downloaded:
-                        st.success(f"Successfully downloaded {len(downloaded)} files")
-                        if create_zip:
-                            zip_path = create_zip_file(downloaded, download_dir)
-                            st.success(f"Created ZIP file: {zip_path}")
-                            with open(zip_path, "rb") as f:
-                                zip_data = f.read()
-                            st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
-
-                            if upload_to_drive and st.session_state.get('google_creds'):
-                                with st.spinner("Uploading to Google Drive..."):
-                                    drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
-                                if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
-                                    st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
-                                else:
-                                    st.error(drive_id)
-
-                            if delete_after:
-                                for path in downloaded:
-                                    try:
-                                        os.remove(path)
-                                    except Exception as e:
-                                        st.warning(f"Could not delete {path}: {e}")
-                                st.info("Deleted original files after ZIP creation")
-                        else:
-                            for path in downloaded:
-                                with open(path, "rb") as f:
-                                    file_data = f.read()
-                                st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
-
+
     else: # PDF Summarizer mode
         if summarizer is None:
             st.error("PDF summarization is not available due to model loading errors.")
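Note: the removed Bing-mode block relied on a safe_rerun() helper plus the do_deep_search/deep_search_url session flags to trigger a deep search on the next script run; the new code keeps the flags but the rerun helper is not shown anywhere in this diff. A typical shim (a sketch; the actual helper in app.py may differ) simply bridges Streamlit's rerun API rename:

def safe_rerun():
    """Rerun the Streamlit script regardless of Streamlit version."""
    if hasattr(st, "rerun"):
        st.rerun()
    elif hasattr(st, "experimental_rerun"):
        st.experimental_rerun()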
@@ -1046,15 +660,10 @@ def main():
                 reader = PdfReader(temp_pdf.name)
                 text = " ".join([page.extract_text() or "" for page in reader.pages])
                 os.remove(temp_pdf.name)
-
-
-                st.write("Summary:")
-                st.write(summary[0]['summary_text'])
+                summary = summarizer(text[:3000], max_length=200, min_length=50, do_sample=False)
+                st.write("Summary:", summary[0]['summary_text'])
             except Exception as e:
                 st.error(f"Error summarizing PDF: {e}")
 
 if __name__ == "__main__":
-
-    main()
-    except Exception as e:
-        st.error(f"An error occurred: {str(e)}")
+    main()
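Note: the new summarizer call truncates the extracted PDF text to 3,000 characters, so long documents lose everything after the first few pages. A hedged sketch of chunked summarization with the same facebook/bart-large-cnn pipeline (chunk size and the simple join are assumptions, not part of this commit):

def summarize_long_text(summarizer, text, chunk_chars=3000):
    """Sketch: summarize each chunk separately, then join the partial summaries."""
    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partials = [
        summarizer(chunk, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        for chunk in chunks if chunk.strip()
    ]
    return " ".join(partials)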