|
import streamlit as st |
|
|
|
st.set_page_config(page_title="Advanced File Downloader", layout="wide", page_icon="📁") |
|
|
|
|
|
import os |
|
import subprocess |
|
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError |
|
import asyncio |
|
import logging |
|
from urllib.parse import urlparse, urljoin, unquote, parse_qs, quote |
|
import re |
|
from pathlib import Path |
|
from io import BytesIO |
|
import random |
|
from bs4 import BeautifulSoup |
|
from PyPDF2 import PdfReader |
|
import zipfile |
|
import tempfile |
|
import mimetypes |
|
import requests |
|
import datetime |
|
import traceback |
|
import base64 |
|
import shutil |
|
import json |
|
import time |
|
from PIL import Image |
|
from reportlab.lib.pagesizes import letter |
|
from reportlab.pdfgen import canvas |
|
import google_auth_oauthlib.flow |
|
import googleapiclient.discovery |
|
import google.auth.transport.requests |
|
import googleapiclient.http |
|
|
|
|
|
import nltk |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import numpy as np |
|
import docx2txt |
|
|
|
|
|
try: |
|
from sentence_transformers import SentenceTransformer |
|
HAVE_TRANSFORMERS = True |
|
except ImportError: |
|
HAVE_TRANSFORMERS = False |
|
|
|
|
|
try: |
|
nltk.data.find('tokenizers/punkt') |
|
except LookupError: |
|
try: |
|
nltk.download('punkt', quiet=True) |
|
except: |
|
pass |
|
|
|
try:
    nltk.data.find('corpora/stopwords')
    from nltk.corpus import stopwords
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    try:
        nltk.download('stopwords', quiet=True)
        from nltk.corpus import stopwords
        STOPWORDS = set(stopwords.words('english'))
    except Exception:
        # Minimal fallback so STOPWORDS is always defined even without the NLTK corpus
        STOPWORDS = set(['the', 'and', 'a', 'in', 'to', 'of', 'is', 'it', 'that', 'for', 'with', 'as', 'on', 'by'])
|
|
|
|
|
logging.basicConfig( |
|
level=logging.INFO, |
|
format='%(asctime)s - %(levelname)s - %(message)s' |
|
) |
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
GOOGLE_OAUTH_CONFIG = { |
|
"web": { |
|
"client_id": os.environ.get("GOOGLE_CLIENT_ID"), |
|
"project_id": os.environ.get("GOOGLE_PROJECT_ID"), |
|
"auth_uri": os.environ.get("GOOGLE_AUTH_URI", "https://accounts.google.com/o/oauth2/auth"), |
|
"token_uri": os.environ.get("GOOGLE_TOKEN_URI", "https://oauth2.googleapis.com/token"), |
|
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", |
|
"client_secret": os.environ.get("GOOGLE_CLIENT_SECRET"), |
|
"redirect_uris": [os.environ.get("GOOGLE_REDIRECT_URI")] |
|
} |
|
} |
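
# The OAuth client above is populated entirely from environment variables. A
# minimal setup sketch (all values below are placeholders, not real credentials):
#
#   export GOOGLE_CLIENT_ID="1234567890-abc123.apps.googleusercontent.com"
#   export GOOGLE_CLIENT_SECRET="your-client-secret"
#   export GOOGLE_PROJECT_ID="your-project-id"
#   export GOOGLE_REDIRECT_URI="https://your-app.example.com/"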
|
|
|
|
|
|
|
USER_AGENTS = [ |
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', |
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15', |
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', |
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0', |
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54', |
|
'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', |
|
'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', |
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', |
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0' |
|
] |
|
|
|
|
|
STEALTH_SETTINGS = { |
|
|
|
"hardware_concurrency": 4, |
|
"device_memory": 8, |
|
|
|
"webgl_vendor": "Google Inc. (Intel)", |
|
"webgl_renderer": "Intel Iris OpenGL Engine", |
|
"languages": ["en-US", "en"], |
|
"disable_webrtc": True, |
|
|
|
"navigator_platform": "Win32", |
|
"touch_support": False |
|
} |
|
|
|
|
|
PROXY_ROTATION_CONFIG = { |
|
"enabled": False, |
|
"rotation_interval": 10, |
|
"proxies": [] |
|
} |
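
# Proxy rotation is disabled by default. A sketch of how it could be enabled at
# runtime (the proxy addresses below are placeholders from the documentation range):
#
#   PROXY_ROTATION_CONFIG["enabled"] = True
#   PROXY_ROTATION_CONFIG["rotation_interval"] = 5          # rotate every 5 requests
#   PROXY_ROTATION_CONFIG["proxies"] = [
#       "http://203.0.113.10:8080",
#       "http://203.0.113.11:8080",
#   ]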
|
|
|
|
|
class EnhancedRAGSearch: |
|
def __init__(self): |
|
self.file_texts = [] |
|
self.chunks = [] |
|
self.chunk_metadata = [] |
|
self.file_metadata = [] |
|
self.languages = [] |
|
self.model = None |
|
|
|
|
|
if HAVE_TRANSFORMERS: |
|
try: |
|
|
|
self.model = SentenceTransformer('all-MiniLM-L6-v2') |
|
self.use_transformer = True |
|
logger.info("Using sentence-transformers for RAG") |
|
except Exception as e: |
|
logger.warning(f"Error loading sentence-transformer: {e}") |
|
self.use_transformer = False |
|
else: |
|
self.use_transformer = False |
|
|
|
|
|
if not self.use_transformer: |
|
self.vectorizer = TfidfVectorizer( |
|
stop_words='english', |
|
ngram_range=(1, 2), |
|
max_features=15000, |
|
min_df=1 |
|
) |
|
|
|
self.vectors = None |
|
self.chunk_vectors = None |
|
|
|
def add_file(self, file_data, file_info): |
|
"""Add a file to the search index with improved processing""" |
|
file_ext = os.path.splitext(file_info['filename'])[1].lower() |
|
text = self.extract_text(file_data, file_ext) |
|
|
|
if text: |
|
|
|
self.file_texts.append(text) |
|
self.file_metadata.append(file_info) |
|
|
|
|
|
try: |
|
|
|
words = re.findall(r'\b\w+\b', text.lower()) |
|
english_stopwords_ratio = len([w for w in words[:100] if w in STOPWORDS]) / max(1, len(words[:100])) |
|
lang = 'en' if english_stopwords_ratio > 0.2 else 'unknown' |
|
self.languages.append(lang) |
|
except: |
|
self.languages.append('en') |
|
|
|
|
|
chunks = self.create_chunks(text) |
|
for chunk in chunks: |
|
self.chunks.append(chunk) |
|
self.chunk_metadata.append({ |
|
'file_info': file_info, |
|
'chunk_size': len(chunk), |
|
'file_index': len(self.file_texts) - 1 |
|
}) |
|
|
|
return True |
|
return False |
|
|
|
def create_chunks(self, text, chunk_size=1000, overlap=200): |
|
"""Split text into overlapping chunks for better search precision""" |
|
|
|
try: |
|
sentences = nltk.sent_tokenize(text) |
|
chunks = [] |
|
current_chunk = "" |
|
|
|
for sentence in sentences: |
|
if len(current_chunk) + len(sentence) <= chunk_size: |
|
current_chunk += sentence + " " |
|
else: |
|
|
|
if current_chunk: |
|
chunks.append(current_chunk.strip()) |
|
|
|
|
|
if len(current_chunk) > overlap: |
|
|
|
overlap_text = current_chunk[-overlap:] |
|
last_space = overlap_text.rfind(' ') |
|
if last_space != -1: |
|
current_chunk = current_chunk[-(overlap-last_space):] + sentence + " " |
|
else: |
|
current_chunk = sentence + " " |
|
else: |
|
current_chunk = sentence + " " |
|
|
|
|
|
if current_chunk: |
|
chunks.append(current_chunk.strip()) |
|
|
|
return chunks |
|
except: |
|
|
|
chunks = [] |
|
for i in range(0, len(text), chunk_size - overlap): |
|
chunk = text[i:i + chunk_size] |
|
if chunk: |
|
chunks.append(chunk) |
|
return chunks |
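
    # Example of the chunking behaviour (the input text is hypothetical): with the
    # defaults chunk_size=1000 and overlap=200, a ~2,500-character document is split
    # into roughly three chunks, and each chunk repeats the tail of the previous one
    # so sentences near a boundary remain searchable in both:
    #
    #   rag = EnhancedRAGSearch()
    #   chunks = rag.create_chunks(long_text)
    #   print(len(chunks), [len(c) for c in chunks])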
|
|
|
def extract_text(self, file_data, file_ext): |
|
"""Extract text from different file types with enhanced support""" |
|
try: |
|
if file_ext.lower() == '.pdf': |
|
                reader = PdfReader(BytesIO(file_data))
                text = ""
                for page in reader.pages:
                    extracted = page.extract_text()
                    if extracted:
                        text += extracted + "\n"
                return text
|
elif file_ext.lower() in ['.docx', '.doc']: |
|
return docx2txt.process(BytesIO(file_data)) |
|
elif file_ext.lower() in ['.txt', '.csv', '.json', '.html', '.htm']: |
|
|
|
try: |
|
return file_data.decode('utf-8', errors='ignore') |
|
except: |
|
encodings = ['latin-1', 'iso-8859-1', 'windows-1252'] |
|
for enc in encodings: |
|
try: |
|
return file_data.decode(enc, errors='ignore') |
|
except: |
|
pass |
|
|
|
return file_data.decode('utf-8', errors='ignore') |
|
elif file_ext.lower() in ['.pptx', '.ppt', '.xlsx', '.xls']: |
|
|
|
|
|
return f"[Content of {file_ext} file - install additional libraries for full text extraction]" |
|
else: |
|
return "" |
|
except Exception as e: |
|
logger.error(f"Error extracting text: {e}") |
|
return "" |
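
    # Note: docx2txt only understands the zip-based .docx format, so legacy .doc files
    # may come back empty. For .pptx, a possible extension (assuming the optional
    # python-pptx package is installed) could look like:
    #
    #   from pptx import Presentation
    #   prs = Presentation(BytesIO(file_data))
    #   text = "\n".join(shape.text for slide in prs.slides
    #                    for shape in slide.shapes if hasattr(shape, "text"))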
|
|
|
def build_index(self): |
|
"""Build both document and chunk search indices""" |
|
if not self.file_texts: |
|
return False |
|
|
|
try: |
|
if self.use_transformer: |
|
|
|
logger.info("Building document and chunk embeddings with transformer model...") |
|
self.vectors = self.model.encode(self.file_texts, show_progress_bar=False) |
|
|
|
|
|
if self.chunks: |
|
|
|
batch_size = 32 |
|
chunk_vectors = [] |
|
for i in range(0, len(self.chunks), batch_size): |
|
batch = self.chunks[i:i+batch_size] |
|
batch_vectors = self.model.encode(batch, show_progress_bar=False) |
|
chunk_vectors.append(batch_vectors) |
|
self.chunk_vectors = np.vstack(chunk_vectors) |
|
else: |
|
|
|
self.vectors = self.vectorizer.fit_transform(self.file_texts) |
|
|
|
|
|
if self.chunks: |
|
self.chunk_vectors = self.vectorizer.transform(self.chunks) |
|
|
|
return True |
|
except Exception as e: |
|
logger.error(f"Error building search index: {e}") |
|
return False |
|
|
|
def expand_query(self, query): |
|
"""Add related terms to query for better recall - mini LLM function""" |
|
|
|
expansions = { |
|
"exam": ["test", "assessment", "quiz", "paper", "exam paper", "past paper", "past exam"], |
|
"test": ["exam", "quiz", "assessment", "paper"], |
|
"document": ["file", "paper", "report", "doc", "documentation"], |
|
"manual": ["guide", "instruction", "documentation", "handbook"], |
|
"tutorial": ["guide", "instructions", "how-to", "lesson"], |
|
"article": ["paper", "publication", "journal", "research"], |
|
"research": ["study", "investigation", "paper", "analysis"], |
|
"book": ["textbook", "publication", "volume", "edition"], |
|
"thesis": ["dissertation", "paper", "research", "study"], |
|
"report": ["document", "paper", "analysis", "summary"], |
|
"assignment": ["homework", "task", "project", "work"], |
|
"lecture": ["class", "presentation", "talk", "lesson"], |
|
"notes": ["annotations", "summary", "outline", "study material"], |
|
"syllabus": ["curriculum", "course outline", "program", "plan"], |
|
"paper": ["document", "article", "publication", "exam", "test"], |
|
"question": ["problem", "query", "exercise", "inquiry"], |
|
"solution": ["answer", "resolution", "explanation", "result"], |
|
"reference": ["source", "citation", "bibliography", "resource"], |
|
"analysis": ["examination", "study", "evaluation", "assessment"], |
|
"guide": ["manual", "instruction", "handbook", "tutorial"], |
|
"worksheet": ["exercise", "activity", "handout", "practice"], |
|
"review": ["evaluation", "assessment", "critique", "feedback"], |
|
"material": ["resource", "content", "document", "information"], |
|
"data": ["information", "statistics", "figures", "numbers"] |
|
} |
|
|
|
|
|
query_words = re.findall(r'\b\w+\b', query.lower()) |
|
expanded_terms = set() |
|
|
|
|
|
for word in query_words: |
|
if word in expansions: |
|
expanded_terms.update(expansions[word]) |
|
|
|
|
|
if any(term in query.lower() for term in ["file", "document", "download", "paper"]): |
|
if not any(ext in query.lower() for ext in ["pdf", "docx", "ppt", "excel"]): |
|
expanded_terms.update(["pdf", "docx", "pptx", "xlsx"]) |
|
|
|
|
|
if any(term in query.lower() for term in ["course", "university", "college", "school", "class"]): |
|
expanded_terms.update(["syllabus", "lecture", "notes", "textbook"]) |
|
|
|
|
|
if expanded_terms: |
|
expanded_query = f"{query} {' '.join(expanded_terms)}" |
|
logger.info(f"Expanded query: '{query}' -> '{expanded_query}'") |
|
return expanded_query |
|
return query |
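
    # Example: a query such as "past exam papers" matches the "exam" entry above, so
    # terms like "test", "assessment", "quiz" and "past paper" are appended, along with
    # the generic file-type terms "pdf docx pptx xlsx". The appended terms come from a
    # set, so their order varies between runs.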
|
|
|
def search(self, query, top_k=5, search_chunks=True): |
|
"""Enhanced search with both document and chunk-level search""" |
|
if self.vectors is None: |
|
return [] |
|
|
|
|
|
expanded_query = self.expand_query(query) |
|
|
|
try: |
|
results = [] |
|
|
|
if self.use_transformer: |
|
|
|
query_vector = self.model.encode([expanded_query])[0] |
|
|
|
|
|
if self.vectors is not None: |
|
|
|
doc_similarities = cosine_similarity( |
|
query_vector.reshape(1, -1), |
|
self.vectors |
|
).flatten() |
|
|
|
top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] |
|
|
|
for i, idx in enumerate(top_doc_indices): |
|
if doc_similarities[idx] > 0.2: |
|
results.append({ |
|
'file_info': self.file_metadata[idx], |
|
'score': float(doc_similarities[idx]), |
|
'rank': i+1, |
|
'match_type': 'document', |
|
'language': self.languages[idx] if idx < len(self.languages) else 'unknown' |
|
}) |
|
|
|
|
|
if search_chunks and self.chunk_vectors is not None: |
|
|
|
chunk_similarities = cosine_similarity( |
|
query_vector.reshape(1, -1), |
|
self.chunk_vectors |
|
).flatten() |
|
|
|
top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] |
|
|
|
|
|
seen_files = set(r['file_info']['url'] for r in results) |
|
|
|
for i, idx in enumerate(top_chunk_indices): |
|
if chunk_similarities[idx] > 0.25: |
|
file_index = self.chunk_metadata[idx]['file_index'] |
|
file_info = self.file_metadata[file_index] |
|
|
|
|
|
if file_info['url'] not in seen_files: |
|
seen_files.add(file_info['url']) |
|
results.append({ |
|
'file_info': file_info, |
|
'score': float(chunk_similarities[idx]), |
|
'rank': len(results) + 1, |
|
'match_type': 'chunk', |
|
'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', |
|
'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx] |
|
}) |
|
|
|
|
|
if len(results) >= top_k*1.5: |
|
break |
|
else: |
|
|
|
query_vector = self.vectorizer.transform([expanded_query]) |
|
|
|
|
|
if self.vectors is not None: |
|
doc_similarities = cosine_similarity(query_vector, self.vectors).flatten() |
|
top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] |
|
|
|
for i, idx in enumerate(top_doc_indices): |
|
if doc_similarities[idx] > 0.1: |
|
results.append({ |
|
'file_info': self.file_metadata[idx], |
|
'score': float(doc_similarities[idx]), |
|
'rank': i+1, |
|
'match_type': 'document', |
|
'language': self.languages[idx] if idx < len(self.languages) else 'unknown' |
|
}) |
|
|
|
|
|
if search_chunks and self.chunk_vectors is not None: |
|
chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten() |
|
top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] |
|
|
|
|
|
seen_files = set(r['file_info']['url'] for r in results) |
|
|
|
for i, idx in enumerate(top_chunk_indices): |
|
if chunk_similarities[idx] > 0.15: |
|
file_index = self.chunk_metadata[idx]['file_index'] |
|
file_info = self.file_metadata[file_index] |
|
|
|
if file_info['url'] not in seen_files: |
|
seen_files.add(file_info['url']) |
|
results.append({ |
|
'file_info': file_info, |
|
'score': float(chunk_similarities[idx]), |
|
'rank': len(results) + 1, |
|
'match_type': 'chunk', |
|
'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', |
|
'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx] |
|
}) |
|
|
|
if len(results) >= top_k*1.5: |
|
break |
|
|
|
|
|
results.sort(key=lambda x: x['score'], reverse=True) |
|
|
|
|
|
for i, result in enumerate(results[:top_k]): |
|
result['rank'] = i+1 |
|
|
|
return results[:top_k] |
|
except Exception as e: |
|
logger.error(f"Error during search: {e}") |
|
return [] |
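
    # Illustrative end-to-end usage of EnhancedRAGSearch (the file bytes and metadata
    # below are hypothetical):
    #
    #   rag = EnhancedRAGSearch()
    #   rag.add_file(pdf_bytes, {'filename': 'exam_2021.pdf',
    #                            'url': 'https://example.edu/exams/exam_2021.pdf'})
    #   if rag.build_index():
    #       for hit in rag.search("past exam papers", top_k=3):
    #           print(hit['rank'], round(hit['score'], 3), hit['file_info']['filename'])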
|
|
|
|
|
def get_random_user_agent(): |
|
return random.choice(USER_AGENTS) |
|
|
|
def sizeof_fmt(num, suffix='B'): |
|
for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: |
|
if abs(num) < 1024.0: |
|
return f"{num:3.1f}{unit}{suffix}" |
|
num /= 1024.0 |
|
return f"{num:.1f}Y{suffix}" |
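
# Quick reference for the formatting helper above:
#   sizeof_fmt(1536)        -> '1.5KB'
#   sizeof_fmt(3 * 1024**3) -> '3.0GB'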
|
|
|
def create_zip_file(file_paths, output_dir): |
|
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") |
|
zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip") |
|
with zipfile.ZipFile(zip_path, 'w') as zipf: |
|
for file_path in file_paths: |
|
zipf.write(file_path, os.path.basename(file_path)) |
|
return zip_path |
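
# Example (paths are hypothetical): bundles the listed files into a timestamped
# archive inside output_dir, e.g. "downloads_20240101_120000.zip":
#
#   zip_path = create_zip_file(["/tmp/dl/exam1.pdf", "/tmp/dl/notes.docx"], "/tmp/dl")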
|
|
|
def get_file_extension(url, default='.pdf'): |
|
"""Extract file extension from URL or filename""" |
|
path = urlparse(url).path |
|
ext = os.path.splitext(path)[1].lower() |
|
if not ext: |
|
return default |
|
return ext |
|
|
|
def humanize_file_size(size_bytes): |
|
"""Format file size in human-readable format""" |
|
if size_bytes < 1024: |
|
return f"{size_bytes} bytes" |
|
for unit in ['KB', 'MB', 'GB', 'TB']: |
|
size_bytes /= 1024.0 |
|
if size_bytes < 1024.0: |
|
return f"{size_bytes:.1f} {unit}" |
|
return f"{size_bytes:.1f} PB" |
|
|
|
def get_domain(url): |
|
"""Extract domain from URL""" |
|
parsed = urlparse(url) |
|
return parsed.netloc |
|
|
|
def is_valid_file_url(url, extensions): |
|
"""Check if URL is a valid file URL based on extension""" |
|
return any(url.lower().endswith(ext) for ext in extensions) |
|
|
|
def detect_captcha(html_content): |
|
"""Detect common captcha patterns in HTML content""" |
|
captcha_patterns = [ |
|
'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile', |
|
'challenge', 'solve the following', 'verify you are human' |
|
] |
|
html_lower = html_content.lower() |
|
return any(pattern in html_lower for pattern in captcha_patterns) |
|
|
|
def is_download_link(url): |
|
"""Enhanced function to detect if a URL is likely a download link""" |
|
|
|
url_lower = url.lower() |
|
|
|
|
|
download_terms = [ |
|
'download', 'dl', 'get', 'file', 'attachment', 'export', 'view', |
|
'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document' |
|
] |
|
if any(term in url_lower for term in download_terms): |
|
return True |
|
|
|
|
|
script_patterns = [ |
|
'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php', |
|
'download.aspx', 'getfile.aspx', 'file.aspx', |
|
'downloadhandler', 'filehandler', 'filedownload', |
|
'download.jsp', 'download.cgi', 'download.do', |
|
'download-file', 'get-file', |
|
'downloadfile', 'getfile', 'viewfile', |
|
'Action=downloadfile', 'action=download', 'action=view', |
|
'download?', 'file?', 'get?', 'view?' |
|
] |
|
if any(pattern in url_lower for pattern in script_patterns): |
|
return True |
|
|
|
|
|
path = urlparse(url).path |
|
common_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', |
|
'.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg', |
|
'.png', '.gif', '.mp3', '.mp4', '.avi', '.mov'] |
|
|
|
if any(ext in path.lower() for ext in common_extensions): |
|
return True |
|
|
|
|
|
params = parse_qs(urlparse(url).query) |
|
param_keys = params.keys() |
|
file_param_indicators = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid'] |
|
if any(key.lower() in file_param_indicators for key in param_keys): |
|
return True |
|
|
|
|
|
if 'Action=downloadfile' in url or 'fname=' in url: |
|
return True |
|
|
|
return False |
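
# Examples of the heuristic above:
#   is_download_link("https://example.com/download.php?id=42")   -> True  (download script)
#   is_download_link("https://example.com/papers/exam2021.pdf")  -> True  (file extension)
#   is_download_link("https://example.com/contact.html")         -> False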
|
|
|
def normalize_download_url(url): |
|
"""Normalize download URLs to handle various formats and encodings""" |
|
try: |
|
|
|
parsed = urlparse(url) |
|
|
|
|
|
if 'Action=downloadfile' in url and 'file=' in url: |
|
|
|
params = parse_qs(parsed.query) |
|
if 'file' in params: |
|
|
|
|
|
encoded_file = params['file'][0] |
|
|
|
return url |
|
|
|
|
|
if 'fname=' in url: |
|
|
|
return url |
|
|
|
|
|
path = parsed.path |
|
|
|
if '%' not in path and ' ' in path: |
|
path = quote(path) |
|
|
|
|
|
normalized = parsed._replace(path=path).geturl() |
|
return normalized |
|
except Exception as e: |
|
logger.error(f"Error normalizing URL {url}: {e}") |
|
return url |
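
# Example: unencoded spaces in the path are percent-encoded, while complex
# download URLs pass through unchanged:
#
#   normalize_download_url("https://example.com/files/exam paper 1.pdf")
#   -> "https://example.com/files/exam%20paper%201.pdf"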
|
|
|
|
|
def get_google_auth_url(): |
|
client_config = GOOGLE_OAUTH_CONFIG["web"] |
|
flow = google_auth_oauthlib.flow.Flow.from_client_config( |
|
{"web": client_config}, |
|
scopes=["https://www.googleapis.com/auth/drive.file"] |
|
) |
|
flow.redirect_uri = client_config["redirect_uris"][0] |
|
authorization_url, _ = flow.authorization_url( |
|
access_type="offline", |
|
include_granted_scopes="true", |
|
prompt="consent" |
|
) |
|
return authorization_url |
|
|
|
def exchange_code_for_credentials(auth_code): |
|
if not auth_code.strip(): |
|
return None, "No code provided." |
|
try: |
|
client_config = GOOGLE_OAUTH_CONFIG["web"] |
|
flow = google_auth_oauthlib.flow.Flow.from_client_config( |
|
{"web": client_config}, |
|
scopes=["https://www.googleapis.com/auth/drive.file"] |
|
) |
|
flow.redirect_uri = client_config["redirect_uris"][0] |
|
flow.fetch_token(code=auth_code.strip()) |
|
creds = flow.credentials |
|
if not creds or not creds.valid: |
|
return None, "Could not validate credentials. Check code and try again." |
|
return creds, "Google Sign-In successful!" |
|
except Exception as e: |
|
return None, f"Error during token exchange: {e}" |
|
|
|
def google_drive_upload(file_path, credentials, folder_id=None): |
|
try: |
|
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials) |
|
file_metadata = {'name': os.path.basename(file_path)} |
|
if folder_id: |
|
file_metadata['parents'] = [folder_id] |
|
media = googleapiclient.http.MediaFileUpload(file_path, resumable=True) |
|
created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute() |
|
return created.get("id", "") |
|
except Exception as e: |
|
return f"Error uploading to Drive: {str(e)}" |
|
|
|
def create_drive_folder(drive_service, name): |
|
folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'} |
|
folder = drive_service.files().create(body=folder_metadata, fields='id').execute() |
|
return folder.get('id') |
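
# Typical upload flow once the user has completed Google Sign-In (the auth code
# and local file path below are placeholders):
#
#   creds, msg = exchange_code_for_credentials(auth_code)
#   drive = googleapiclient.discovery.build("drive", "v3", credentials=creds)
#   folder_id = create_drive_folder(drive, "Advanced File Downloader")
#   file_id = google_drive_upload("/tmp/dl/exam1.pdf", creds, folder_id)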
|
|
|
|
|
def install_playwright_dependencies(): |
|
try: |
|
|
|
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright") |
|
|
|
|
|
subprocess.run(['apt-get', 'update', '-y'], check=True) |
|
packages = [ |
|
'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0', |
|
'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1', |
|
'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0' |
|
] |
|
subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True) |
|
|
|
|
|
subprocess.run(['pip', 'install', 'playwright'], check=True) |
|
subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True) |
|
|
|
st.success("Playwright dependencies installed successfully!") |
|
except Exception as e: |
|
st.error(f"Error installing Playwright dependencies: {e}") |
|
st.info("You may need to manually install dependencies. Check console for details.") |
|
logger.error(f"Playwright setup error: {e}") |
|
traceback.print_exc() |
|
|
|
|
|
class DownloadManager: |
|
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): |
|
self.use_proxy = use_proxy |
|
self.proxy = proxy |
|
self.query = query |
|
self.num_results = num_results |
|
self.playwright = None |
|
self.browser = None |
|
self.context = None |
|
self.page = None |
|
self.use_stealth = use_stealth |
|
self.proxy_rotation = proxy_rotation |
|
self.request_count = 0 |
|
self.captcha_detected = False |
|
self.download_timeout = 300 |
|
|
|
self.visited_urls = set() |
|
|
|
self.downloaded_files = set() |
|
|
|
async def __aenter__(self): |
|
self.playwright = await async_playwright().start() |
|
|
|
|
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-gpu', |
|
'--no-zygote', |
|
'--single-process', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins', |
|
'--disable-site-isolation-trials' |
|
] |
|
|
|
|
|
if self.use_stealth: |
|
browser_args.extend([ |
|
'--disable-blink-features=AutomationControlled', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
'--disable-webgl', |
|
'--disable-webrtc' |
|
]) |
|
|
|
|
|
opts = { |
|
"headless": True, |
|
"args": browser_args |
|
} |
|
|
|
|
|
if self.use_proxy and self.proxy: |
|
opts["proxy"] = {"server": self.proxy} |
|
|
|
|
|
self.browser = await self.playwright.chromium.launch(**opts) |
|
|
|
|
|
context_opts = { |
|
"user_agent": get_random_user_agent(), |
|
"viewport": {"width": 1920, "height": 1080}, |
|
"device_scale_factor": 1, |
|
"has_touch": False, |
|
"is_mobile": False, |
|
"ignore_https_errors": True, |
|
"accept_downloads": True |
|
} |
|
|
|
|
|
if self.use_stealth: |
|
|
|
context_opts["bypass_csp"] = True |
|
self.context = await self.browser.new_context(**context_opts) |
|
|
|
|
|
await self.context.add_init_script(""" |
|
() => { |
|
Object.defineProperty(navigator, 'webdriver', { |
|
get: () => false, |
|
}); |
|
|
|
// Change navigator properties |
|
const newProto = navigator.__proto__; |
|
delete newProto.webdriver; |
|
|
|
// Overwrite the plugins |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ |
|
lengthComputable: true, |
|
loaded: 100, |
|
total: 100 |
|
})) |
|
}); |
|
|
|
// Handle languages more naturally |
|
Object.defineProperty(navigator, 'languages', { |
|
get: () => ['en-US', 'en', 'es'] |
|
}); |
|
|
|
// Modify hardware concurrency |
|
Object.defineProperty(navigator, 'hardwareConcurrency', { |
|
get: () => 4 |
|
}); |
|
|
|
// Modify deviceMemory |
|
Object.defineProperty(navigator, 'deviceMemory', { |
|
get: () => 8 |
|
}); |
|
|
|
// WebGL modifications |
|
const getParameter = WebGLRenderingContext.prototype.getParameter; |
|
WebGLRenderingContext.prototype.getParameter = function(parameter) { |
|
if (parameter === 37445) { |
|
return 'Intel Inc.'; |
|
} |
|
if (parameter === 37446) { |
|
return 'Intel Iris OpenGL Engine'; |
|
} |
|
return getParameter.apply(this, arguments); |
|
}; |
|
} |
|
""") |
|
else: |
|
|
|
self.context = await self.browser.new_context(**context_opts) |
|
|
|
|
|
self.page = await self.context.new_page() |
|
await self.page.set_extra_http_headers({ |
|
'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', |
|
'Cache-Control': 'max-age=0', |
|
'DNT': '1', |
|
'Referer': 'https://www.google.com/', |
|
'Sec-Fetch-Dest': 'document', |
|
'Sec-Fetch-Mode': 'navigate', |
|
'Sec-Fetch-Site': 'cross-site', |
|
'Sec-Fetch-User': '?1', |
|
'Upgrade-Insecure-Requests': '1' |
|
}) |
|
|
|
|
|
if self.use_stealth: |
|
await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) |
|
await self.page.wait_for_timeout(random.randint(200, 500)) |
|
|
|
return self |
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb): |
|
if self.browser: |
|
await self.browser.close() |
|
if self.playwright: |
|
await self.playwright.stop() |
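
    # The manager is intended to be used as an async context manager, e.g.
    # (the query and URL are illustrative):
    #
    #   async def run():
    #       async with DownloadManager(query="past exam papers", num_results=5) as dm:
    #           return await dm.extract_downloadable_files("https://example.edu/exams", [])
    #
    #   files = asyncio.run(run())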
|
|
|
async def rotate_proxy_if_needed(self): |
|
"""Rotate proxy if proxy rotation is enabled and threshold is reached""" |
|
if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: |
|
self.request_count += 1 |
|
if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: |
|
|
|
next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) |
|
PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) |
|
|
|
|
|
if self.context: |
|
await self.context.close() |
|
|
|
|
|
context_opts = { |
|
"user_agent": get_random_user_agent(), |
|
"proxy": {"server": next_proxy}, |
|
"accept_downloads": True |
|
} |
|
self.context = await self.browser.new_context(**context_opts) |
|
self.page = await self.context.new_page() |
|
|
|
|
|
self.request_count = 0 |
|
logger.info(f"Rotated to new proxy: {next_proxy}") |
|
|
|
async def handle_captcha(self, page): |
|
"""Detect and handle captchas if possible""" |
|
|
|
content = await page.content() |
|
if detect_captcha(content): |
|
self.captcha_detected = True |
|
logger.warning("Captcha detected on page") |
|
|
|
|
|
|
|
captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') |
|
if captcha_img: |
|
logger.info("Found captcha image, attempting to capture") |
|
|
|
|
|
captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") |
|
await captcha_img.screenshot(path=captcha_path) |
|
|
|
|
|
|
|
logger.info(f"Captcha image saved to {captcha_path}") |
|
|
|
|
|
return False |
|
|
|
|
|
recaptcha = await page.query_selector('iframe[src*="recaptcha"]') |
|
if recaptcha: |
|
logger.warning("reCAPTCHA detected, would require external solving service") |
|
return False |
|
|
|
|
|
await self.perform_human_actions(page) |
|
|
|
|
|
content = await page.content() |
|
if detect_captcha(content): |
|
logger.warning("Captcha still present after human-like actions") |
|
return False |
|
else: |
|
logger.info("Captcha appears to be resolved") |
|
return True |
|
|
|
return True |
|
|
|
async def perform_human_actions(self, page): |
|
"""Perform human-like actions on the page to possibly bypass simple bot checks""" |
|
try: |
|
|
|
for i in range(3): |
|
await page.evaluate(f"window.scrollTo(0, {i * 300})") |
|
await page.wait_for_timeout(random.randint(300, 700)) |
|
|
|
|
|
for _ in range(3): |
|
x = random.randint(100, 800) |
|
y = random.randint(100, 600) |
|
await page.mouse.move(x=x, y=y) |
|
await page.wait_for_timeout(random.randint(200, 500)) |
|
|
|
|
|
try: |
|
await page.click("body", position={"x": 50, "y": 50}) |
|
except: |
|
pass |
|
|
|
|
|
await page.wait_for_timeout(1000) |
|
|
|
except Exception as e: |
|
logger.warning(f"Error during human-like actions: {e}") |
|
|
|
async def search_bing(self): |
|
urls = [] |
|
try: |
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
search_url = f"https://www.bing.com/search?q={self.query}" |
|
await self.page.goto(search_url, timeout=30000) |
|
await self.page.wait_for_load_state('networkidle') |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected during search, results may be limited") |
|
|
|
|
|
for i in range(3): |
|
await self.page.evaluate(f"window.scrollTo(0, {i * 400})") |
|
await self.page.wait_for_timeout(random.randint(300, 800)) |
|
|
|
|
|
links = await self.page.query_selector_all("li.b_algo h2 a") |
|
for link in links[:self.num_results]: |
|
href = await link.get_attribute('href') |
|
if href: |
|
urls.append(href) |
|
|
|
|
|
if len(urls) < self.num_results: |
|
alt_links = await self.page.query_selector_all(".b_caption a") |
|
for link in alt_links: |
|
href = await link.get_attribute('href') |
|
if href and href not in urls: |
|
urls.append(href) |
|
if len(urls) >= self.num_results: |
|
break |
|
|
|
return urls |
|
except Exception as e: |
|
logger.error(f"Error searching Bing: {e}") |
|
return [] |
|
|
|
async def get_file_size(self, url): |
|
try: |
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
if '?' in url or 'Action=downloadfile' in url or 'fname=' in url: |
|
|
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Range': 'bytes=0-0' |
|
} |
|
|
|
try: |
|
with requests.get(url, headers=headers, stream=True, timeout=10) as r: |
|
if 'Content-Range' in r.headers: |
|
content_range = r.headers['Content-Range'] |
|
match = re.search(r'bytes 0-0/(\d+)', content_range) |
|
if match: |
|
size = int(match.group(1)) |
|
return sizeof_fmt(size) |
|
|
|
if 'Content-Length' in r.headers: |
|
size = int(r.headers['Content-Length']) |
|
|
|
if size > 1: |
|
return sizeof_fmt(size) |
|
except Exception as e: |
|
logger.warning(f"Error getting file size with Range request: {e}") |
|
|
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
                        response = await page.request.head(url, timeout=15000)
                        # Playwright lower-cases response header names
                        length = response.headers.get('content-length')
                        if length:
                            return sizeof_fmt(int(length))
|
except Exception as e: |
|
logger.warning(f"Error getting file size with browser: {e}") |
|
|
|
return "Unknown Size" |
|
else: |
|
|
|
async with self.context.new_page() as page: |
|
                    response = await page.request.head(url, timeout=15000)
                    # Playwright lower-cases response header names
                    length = response.headers.get('content-length')
                    if length:
                        return sizeof_fmt(int(length))
                    else:
                        return "Unknown Size"
|
except Exception as e: |
|
logger.warning(f"Error getting file size: {e}") |
|
return "Unknown Size" |
|
|
|
async def get_pdf_metadata(self, url): |
|
try: |
|
await self.rotate_proxy_if_needed() |
|
|
|
async with self.context.new_page() as page: |
|
resp = await page.request.get(url, timeout=15000) |
|
if resp.ok: |
|
content = await resp.body() |
|
pdf = BytesIO(content) |
|
reader = PdfReader(pdf) |
|
return { |
|
'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', |
|
'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', |
|
'Pages': len(reader.pages), |
|
} |
|
else: |
|
return {} |
|
except Exception as e: |
|
logger.warning(f"Error reading PDF metadata: {e}") |
|
return {} |
|
|
|
async def extract_real_download_url(self, url): |
|
"""Enhanced method to extract real download URL, handling complex URLs""" |
|
try: |
|
|
|
if 'Action=downloadfile' in url or 'fname=' in url: |
|
logger.info(f"Complex download URL detected: {url}") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
async with self.context.new_page() as page: |
|
|
|
await page.route('**', lambda route: route.continue_()) |
|
|
|
|
|
responses = [] |
|
page.on('response', lambda response: responses.append(response)) |
|
|
|
try: |
|
|
|
await page.goto(url, wait_until='networkidle', timeout=30000) |
|
|
|
|
|
for response in responses: |
|
|
|
                            # Playwright lower-cases response header names
                            content_disposition = response.headers.get('content-disposition', '')
                            if 'attachment' in content_disposition or 'filename=' in content_disposition:
                                return response.url

                            content_type = response.headers.get('content-type', '')
                            if content_type and content_type != 'text/html' and not content_type.startswith('text/'):
                                return response.url
|
|
|
|
|
return page.url |
|
except Exception as e: |
|
logger.warning(f"Error extracting real download URL: {e}") |
|
return url |
|
else: |
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
async with self.context.new_page() as page: |
|
response = await page.goto(url, wait_until='networkidle', timeout=30000) |
|
if response and response.headers.get('location'): |
|
return response.headers['location'] |
|
return page.url |
|
except Exception as e: |
|
logger.error(f"Error extracting real download URL: {e}") |
|
return url |
|
|
|
|
|
async def get_edu_exam_links(self, url): |
|
"""Specialized method for educational exam websites that follows a common pattern.""" |
|
try: |
|
logger.info(f"Fetching exam links from {url}") |
|
links = set() |
|
|
|
|
|
headers = { |
|
"User-Agent": get_random_user_agent(), |
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", |
|
"Accept-Language": "en-US,en;q=0.9", |
|
"Referer": "https://www.google.com/", |
|
"DNT": "1" |
|
} |
|
|
|
try: |
|
response = requests.get(url, headers=headers, timeout=30) |
|
|
|
if response.status_code == 200: |
|
|
|
soup = BeautifulSoup(response.text, "html.parser") |
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
for a in soup.find_all("a", href=True): |
|
href = a["href"] |
|
full_url = urljoin(url, href) |
|
|
|
|
|
link_text = a.get_text().lower() |
|
|
|
|
|
url_patterns = [ |
|
"/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", |
|
"/test/", "/download/", "/files/", "/assignments/", |
|
"paper_", "question_", "exam_", "test_", "past_", |
|
"assignment_", "sample_", "study_material", "notes_", |
|
"/resource/", "/subject/", "/course/", "/material/" |
|
] |
|
|
|
text_patterns = [ |
|
"exam", "paper", "test", "question", "past", "download", |
|
"assignment", "sample", "study", "material", "notes", |
|
"subject", "course", "resource", "pdf", "document", |
|
"view", "open", "get", "solution", "answer" |
|
] |
|
|
|
|
|
if any(pattern in full_url.lower() for pattern in url_patterns): |
|
links.add(full_url) |
|
continue |
|
|
|
|
|
if any(pattern in link_text for pattern in text_patterns): |
|
links.add(full_url) |
|
continue |
|
|
|
|
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(full_url) |
|
|
|
|
|
if "Action=downloadfile" in url or "fname=" in url: |
|
links.add(url) |
|
except Exception as e: |
|
logger.warning(f"Request-based extraction failed: {e}") |
|
|
|
|
|
try: |
|
|
|
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url: |
|
logger.info("Using browser for enhanced link extraction") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
await self.page.goto(url, timeout=45000, wait_until='networkidle') |
|
await self.page.wait_for_timeout(random.randint(1000, 2000)) |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected, extraction may be limited") |
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
page_height = await self.page.evaluate("document.body.scrollHeight") |
|
viewport_height = await self.page.evaluate("window.innerHeight") |
|
|
|
for scroll_pos in range(0, page_height, viewport_height // 2): |
|
await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") |
|
await self.page.wait_for_timeout(random.randint(300, 800)) |
|
|
|
|
|
await self.page.evaluate("window.scrollTo(0, 0)") |
|
await self.page.wait_for_timeout(500) |
|
|
|
|
|
all_links = await self.page.evaluate(""" |
|
() => { |
|
const results = []; |
|
|
|
// Get all anchor tags |
|
const anchors = document.querySelectorAll('a[href]'); |
|
for (const a of anchors) { |
|
if (a.href) { |
|
results.push({ |
|
href: a.href, |
|
text: a.innerText || a.textContent || '', |
|
isButton: a.classList.contains('btn') || a.role === 'button' |
|
}); |
|
} |
|
} |
|
|
|
// Get buttons that might contain links |
|
const buttons = document.querySelectorAll('button'); |
|
for (const btn of buttons) { |
|
const onclick = btn.getAttribute('onclick') || ''; |
|
if (onclick.includes('window.location') || onclick.includes('download')) { |
|
results.push({ |
|
href: '#button', |
|
text: btn.innerText || btn.textContent || '', |
|
isButton: true, |
|
onclick: onclick |
|
}); |
|
} |
|
} |
|
|
|
return results; |
|
} |
|
""") |
|
|
|
|
|
                    # Pattern lists are defined once here because they are reused
                    # further below when collecting links after pagination clicks.
                    url_patterns = [
                        "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                        "/test/", "/download/", "/files/", "/assignments/",
                        "paper_", "question_", "exam_", "test_", "past_",
                        "assignment_", "sample_", "study_material", "notes_"
                    ]
                    text_patterns = [
                        "exam", "paper", "test", "question", "past", "download",
                        "assignment", "sample", "study", "material", "notes",
                        "pdf", "document", "view", "open", "solution"
                    ]

                    for link_info in all_links:
                        href = link_info.get('href', '')
                        text = link_info.get('text', '').lower()

                        if href and href != '#button':
|
|
|
if any(pattern in href.lower() for pattern in url_patterns) or \ |
|
any(pattern in text for pattern in text_patterns) or \ |
|
any(href.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(href) |
|
|
|
|
|
download_links = await self.page.evaluate(""" |
|
() => { |
|
// Find all links that might be download links |
|
const links = Array.from(document.querySelectorAll('a[href]')); |
|
return links |
|
.filter(a => { |
|
const href = a.href.toLowerCase(); |
|
return href.includes('download') || |
|
href.includes('getfile') || |
|
href.includes('view.php') || |
|
href.includes('action=downloadfile') || |
|
href.includes('fname='); |
|
}) |
|
.map(a => a.href); |
|
} |
|
""") |
|
|
|
for dl_link in download_links: |
|
links.add(dl_link) |
|
|
|
|
|
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') |
|
for grid in grid_elements: |
|
grid_links = await grid.query_selector_all('a[href]') |
|
for a in grid_links: |
|
href = await a.get_attribute('href') |
|
text = await a.text_content() |
|
|
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
links.add(full_url) |
|
|
|
|
|
pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') |
|
for i, button in enumerate(pagination_buttons[:5]): |
|
try: |
|
|
|
button_text = await button.text_content() |
|
if button_text and button_text.strip().isdigit(): |
|
logger.info(f"Clicking pagination button: {button_text}") |
|
await button.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_page_links = await self.page.evaluate(""" |
|
() => { |
|
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); |
|
} |
|
""") |
|
|
|
for href in new_page_links: |
|
if href and not href.startswith('javascript:'): |
|
if any(pattern in href.lower() for pattern in url_patterns) or \ |
|
any(href.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(href) |
|
except Exception as e: |
|
logger.warning(f"Error clicking pagination button: {e}") |
|
|
|
|
|
show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') |
|
for button in show_buttons: |
|
button_text = (await button.text_content() or "").lower() |
|
button_value = (await button.get_attribute("value") or "").lower() |
|
button_id = (await button.get_attribute("id") or "").lower() |
|
|
|
|
|
promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", |
|
"download", "resource", "material", "browse", "file"] |
|
|
|
if any(term in button_text or term in button_value or term in button_id |
|
for term in promising_terms): |
|
try: |
|
logger.info(f"Clicking button: {button_text or button_value}") |
|
await button.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_links = await self.page.query_selector_all('a[href]') |
|
for a in new_links: |
|
href = await a.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
|
|
|
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ |
|
any(pattern in full_url.lower() for pattern in url_patterns): |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error clicking button: {e}") |
|
|
|
|
|
try: |
|
|
|
postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') |
|
for i, element in enumerate(postback_elements[:10]): |
|
try: |
|
onclick = await element.get_attribute('onclick') |
|
if onclick and '__doPostBack' in onclick: |
|
element_text = await element.text_content() |
|
|
|
|
|
promising_terms = ["show", "view", "list", "exam", "paper", "test", |
|
"download", "resource", "material"] |
|
|
|
if any(term in element_text.lower() for term in promising_terms): |
|
logger.info(f"Clicking ASP.NET postback element: {element_text}") |
|
|
|
|
|
await element.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=10000) |
|
|
|
|
|
new_links = await self.page.query_selector_all('a[href]') |
|
for a in new_links: |
|
href = await a.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
if any(full_url.lower().endswith(ext) for ext in |
|
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error interacting with postback element: {e}") |
|
except Exception as e: |
|
logger.warning(f"Error during postback handling: {e}") |
|
|
|
except Exception as e: |
|
logger.error(f"Browser-based extraction failed: {e}") |
|
|
|
|
|
filtered_links = [] |
|
for link in links: |
|
|
|
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): |
|
filtered_links.append(link) |
|
continue |
|
|
|
|
|
if any(pattern in link.lower() for pattern in [ |
|
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", |
|
"/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", |
|
"/resource/", "/material/", "/notes/", "/subjectmaterial/" |
|
]): |
|
filtered_links.append(link) |
|
continue |
|
|
|
|
|
if is_download_link(link): |
|
filtered_links.append(link) |
|
|
|
logger.info(f"Found {len(filtered_links)} potential exam document links") |
|
return filtered_links |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting exam links: {e}") |
|
return [] |
|
|
|
async def discover_hidden_links(self, page): |
|
"""Discover hidden links that might be in JavaScript, iframes, or dynamic content""" |
|
hidden_links = set() |
|
|
|
|
|
js_links = await page.evaluate(""" |
|
() => { |
|
const links = new Set(); |
|
|
|
// Extract URLs from script tags |
|
const scripts = document.querySelectorAll('script'); |
|
for (const script of scripts) { |
|
const content = script.textContent || ''; |
|
const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || []; |
|
for (let match of urlMatches) { |
|
links.add(match.replace(/["']/g, '')); |
|
} |
|
} |
|
|
|
// Look for download-related variables in scripts |
|
for (const script of scripts) { |
|
const content = script.textContent || ''; |
|
// Look for common patterns for file URLs in JavaScript |
|
if (content.includes('downloadURL') || content.includes('fileURL') || |
|
content.includes('pdfURL') || content.includes('documentURL')) { |
|
|
|
// Extract potential URLs |
|
const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || []; |
|
for (let match of potentialUrls) { |
|
const url = match.replace(/["']/g, ''); |
|
// Try to resolve relative URLs |
|
if (url.startsWith('/') || !url.includes('://')) { |
|
if (url.startsWith('/')) { |
|
links.add(window.location.origin + url); |
|
} else { |
|
// Handle relative paths more carefully |
|
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); |
|
links.add(base + url); |
|
} |
|
} else if (url.startsWith('http')) { |
|
links.add(url); |
|
} |
|
} |
|
} |
|
} |
|
|
|
// Check for links in data attributes |
|
const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]'); |
|
for (const el of elements) { |
|
for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) { |
|
const val = el.getAttribute(attr); |
|
if (val) { |
|
// Try to resolve relative URLs |
|
if (val.startsWith('/')) { |
|
links.add(window.location.origin + val); |
|
} else if (val.startsWith('http')) { |
|
links.add(val); |
|
} else if (!val.startsWith('javascript:') && !val.startsWith('#')) { |
|
// Handle relative paths |
|
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); |
|
links.add(base + val); |
|
} |
|
} |
|
} |
|
} |
|
|
|
// Look for URLs in inline event handlers |
|
const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]'); |
|
for (const el of clickableElements) { |
|
for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) { |
|
const val = el.getAttribute(attr); |
|
if (val) { |
|
// Check for JavaScript URLs with window.location |
|
if (val.includes('window.location') || val.includes('document.location')) { |
|
const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/); |
|
if (urlMatch && urlMatch[1]) { |
|
const url = urlMatch[1]; |
|
if (url.startsWith('/')) { |
|
links.add(window.location.origin + url); |
|
} else if (url.startsWith('http')) { |
|
links.add(url); |
|
} else if (!url.startsWith('javascript:') && !url.startsWith('#')) { |
|
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); |
|
links.add(base + url); |
|
} |
|
} |
|
} |
|
|
|
// Check for direct URLs in attributes |
|
const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || []; |
|
for (let match of urlMatches) { |
|
links.add(match.replace(/["']/g, '')); |
|
} |
|
|
|
// Check for download.php and similar patterns |
|
if (val.includes('download.php') || val.includes('getfile.php') || |
|
val.includes('Action=downloadfile') || val.includes('viewfile.php')) { |
|
|
|
// Handle both onclick handlers and direct hrefs |
|
let url = ''; |
|
if (attr === 'href') { |
|
url = val; |
|
} else { |
|
// Extract URL from JavaScript |
|
const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i); |
|
if (jsUrlMatch) { |
|
url = jsUrlMatch[1]; |
|
} |
|
} |
|
|
|
// Resolve URL if needed |
|
if (url) { |
|
if (url.startsWith('/')) { |
|
links.add(window.location.origin + url); |
|
} else if (url.startsWith('http')) { |
|
links.add(url); |
|
} else if (!url.startsWith('javascript:') && !url.startsWith('#')) { |
|
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); |
|
links.add(base + url); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
// Find PHP/ASP file download links |
|
const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]'); |
|
for (const link of fileLinks) { |
|
links.add(link.href); |
|
} |
|
|
|
return Array.from(links); |
|
} |
|
""") |
|
|
|
for link in js_links: |
|
hidden_links.add(link) |
|
|
|
|
|
iframes = await page.query_selector_all('iframe') |
|
for iframe in iframes: |
|
try: |
|
frame = await iframe.content_frame() |
|
if frame: |
|
iframe_links = await frame.evaluate(""" |
|
() => { |
|
return Array.from(document.querySelectorAll('a[href]')) |
|
.map(a => a.href) |
|
.filter(href => href.startsWith('http')); |
|
} |
|
""") |
|
for link in iframe_links: |
|
hidden_links.add(link) |
|
except Exception as e: |
|
logger.warning(f"Could not extract links from iframe: {e}") |
|
|
|
|
|
shadow_links = await page.evaluate(""" |
|
() => { |
|
const links = new Set(); |
|
|
|
// Helper function to recursively process shadow roots |
|
function processShadowRoot(root) { |
|
if (!root) return; |
|
|
|
// Get links in this shadow root |
|
const shadowLinks = root.querySelectorAll('a[href]'); |
|
for (const link of shadowLinks) { |
|
if (link.href && link.href.startsWith('http')) { |
|
links.add(link.href); |
|
} |
|
} |
|
|
|
// Process nested shadow roots |
|
const elements = root.querySelectorAll('*'); |
|
for (const el of elements) { |
|
if (el.shadowRoot) { |
|
processShadowRoot(el.shadowRoot); |
|
} |
|
} |
|
} |
|
|
|
// Find all shadow roots in the document |
|
const elements = document.querySelectorAll('*'); |
|
for (const el of elements) { |
|
if (el.shadowRoot) { |
|
processShadowRoot(el.shadowRoot); |
|
} |
|
} |
|
|
|
return Array.from(links); |
|
} |
|
""") |
|
|
|
for link in shadow_links: |
|
hidden_links.add(link) |
|
|
|
|
|
form_links = await page.evaluate(""" |
|
() => { |
|
const links = new Set(); |
|
|
|
// Check for form actions that might be download endpoints |
|
const forms = document.querySelectorAll('form'); |
|
for (const form of forms) { |
|
const action = form.action || ''; |
|
if (action && ( |
|
action.includes('download') || |
|
action.includes('getfile') || |
|
action.includes('viewfile') || |
|
action.includes('Action=downloadfile') |
|
)) { |
|
// Collect input values that might be needed for the download |
|
const inputs = {}; |
|
const formInputs = form.querySelectorAll('input[name]'); |
|
for (const input of formInputs) { |
|
inputs[input.name] = input.value; |
|
} |
|
|
|
// Store both the form action and any important inputs |
|
links.add(action); |
|
} |
|
} |
|
|
|
return Array.from(links); |
|
} |
|
""") |
|
|
|
for link in form_links: |
|
hidden_links.add(link) |
|
|
|
return hidden_links |
|
|
|
async def extract_downloadable_files(self, url, custom_ext_list): |
|
found_files = [] |
|
try: |
|
|
|
normalized_url = normalize_download_url(url) |
|
|
|
|
|
if normalized_url in self.visited_urls: |
|
logger.info(f"Skipping already visited URL: {normalized_url}") |
|
return [] |
|
|
|
|
|
self.visited_urls.add(normalized_url) |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
if is_download_link(normalized_url): |
|
logger.info(f"Processing potential direct download link: {normalized_url}") |
|
|
|
|
|
real_url = await self.extract_real_download_url(normalized_url) |
|
|
|
|
|
filename = os.path.basename(urlparse(real_url).path) |
|
|
|
|
|
if '%' in filename: |
|
try: |
|
filename = unquote(filename) |
|
except Exception: |
|
pass |
|
|
|
|
|
if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): |
|
|
|
params = parse_qs(urlparse(normalized_url).query) |
|
|
|
|
|
for param in ['file', 'filename', 'name', 'fname', 'f']: |
|
if param in params and params[param]: |
|
potential_filename = params[param][0] |
|
if potential_filename and '/' not in potential_filename and '\\' not in potential_filename: |
|
filename = os.path.basename(potential_filename) |
|
break |
|
|
|
|
|
if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): |
|
domain = get_domain(real_url) |
|
|
|
ext = '.pdf' |
|
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: |
|
if common_ext in normalized_url.lower(): |
|
ext = common_ext |
|
break |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
|
|
size_str = await self.get_file_size(real_url) |
|
|
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': {}, |
|
'download_url': normalized_url |
|
}) |
|
|
|
|
|
if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)): |
|
return found_files |
|
|
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized handler for educational exam site") |
|
|
|
|
|
exam_links = await self.get_edu_exam_links(url) |
|
|
|
for link in exam_links: |
|
|
|
real_url = await self.extract_real_download_url(link) |
|
filename = os.path.basename(urlparse(real_url).path) |
|
|
|
|
|
if '%' in filename: |
|
try: |
|
filename = unquote(filename) |
|
except Exception: |
|
pass |
|
|
|
|
|
if not filename or filename == '/': |
|
domain = get_domain(real_url) |
|
ext = '.pdf' |
|
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: |
|
if common_ext in link.lower(): |
|
ext = common_ext |
|
break |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
|
|
size_str = await self.get_file_size(real_url) |
|
|
|
|
|
meta = {} |
|
if real_url.lower().endswith('.pdf'): |
|
try: |
|
meta = await self.get_pdf_metadata(real_url) |
|
except Exception: |
|
pass |
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': meta, |
|
'download_url': link |
|
}) |
|
|
|
|
|
if found_files: |
|
return found_files |
|
|
|
|
|
response = await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
if not response: |
|
return [] |
|
|
|
|
|
if not await self.handle_captcha(self.page): |
|
logger.warning("Captcha detected, file extraction may be limited") |
|
|
|
|
|
await self.page.evaluate(""" |
|
(async () => { |
|
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); |
|
const height = document.body.scrollHeight; |
|
const scrollStep = Math.floor(window.innerHeight / 2); |
|
|
|
for (let i = 0; i < height; i += scrollStep) { |
|
window.scrollTo(0, i); |
|
await delay(100); |
|
} |
|
|
|
window.scrollTo(0, 0); |
|
})() |
|
""") |
|
await self.page.wait_for_timeout(1000) |
|
|
|
final_url = self.page.url |
|
if '.php' in final_url or 'download' in final_url: |
|
real_url = await self.extract_real_download_url(final_url) |
|
if real_url != final_url: |
|
|
|
response = await self.page.request.head(real_url, timeout=15000) |
|
filename = None |
|
|
|
|
|
                    # Playwright lower-cases header names; filenames are not always quoted
                    content_disposition = response.headers.get('content-disposition', '')
                    if 'filename=' in content_disposition:
                        filename_match = re.search(r'filename=["\']?([^"\';]+)["\']?', content_disposition)
                        if filename_match:
                            filename = filename_match.group(1).strip()
|
|
|
|
|
if not filename: |
|
filename = os.path.basename(urlparse(real_url).path) |
|
if not filename or filename == '/': |
|
|
|
domain = get_domain(real_url) |
|
ext = '.pdf' |
|
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: |
|
if common_ext in real_url.lower(): |
|
ext = common_ext |
|
break |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {}, |
|
'download_url': final_url |
|
}) |
|
return found_files |
|
|
|
await self.page.wait_for_load_state('networkidle', timeout=30000) |
|
content = await self.page.content() |
|
soup = BeautifulSoup(content, 'html.parser') |
|
|
|
default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', |
|
'.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', |
|
'.pptx', '.odt', '.txt'] |
|
all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) |
|
|
|
parsed_base = urlparse(final_url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
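# Walk every <a href>: PHP/download handler URLs, direct file links matched by extension, and Google Drive links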
|
for a in soup.find_all('a', href=True): |
|
href = a['href'].strip() |
|
|
|
if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower(): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
real_url = await self.extract_real_download_url(full_url) |
|
if real_url and real_url != full_url: |
|
found_files.append({ |
|
'url': real_url, |
|
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {}, |
|
'download_url': full_url |
|
}) |
|
continue |
|
|
|
if any(href.lower().endswith(ext) for ext in all_exts): |
|
file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta, |
|
'download_url': file_url |
|
}) |
|
|
|
|
|
elif ("drive.google.com" in href) or ("docs.google.com" in href): |
|
file_id = None |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, href) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
if file_id: |
|
|
|
file_type, is_view_only = await self.get_google_drive_file_info(file_id) |
|
|
|
|
|
filename = f"gdrive_{file_id}" |
|
if file_type: |
|
filename = f"{filename}.{file_type}" |
|
|
|
size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") |
|
|
|
found_files.append({ |
|
'url': href, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': { |
|
'view_only': is_view_only, |
|
'file_type': file_type, |
|
'file_id': file_id |
|
}, |
|
'download_url': href |
|
}) |
|
|
|
|
|
other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) |
|
for elem in other_elements: |
|
src = elem.get('src') or elem.get('data') |
|
if src and any(src.lower().endswith(ext) for ext in all_exts): |
|
file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta, |
|
'download_url': file_url |
|
}) |
|
|
|
|
|
onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') |
|
for elem in onclick_elements: |
|
onclick = await elem.get_attribute('onclick')
if not onclick:
    continue
urls = re.findall(r'(https?://[^\'"]+)', onclick)
|
for url_match in urls: |
|
if any(url_match.lower().endswith(ext) for ext in all_exts): |
|
size_str = await self.get_file_size(url_match) |
|
meta = {} |
|
if url_match.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(url_match) |
|
found_files.append({ |
|
'url': url_match, |
|
'filename': os.path.basename(url_match.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta, |
|
'download_url': url_match |
|
}) |
|
|
|
|
|
data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') |
|
for elem in data_elements: |
|
for attr in ['data-src', 'data-url', 'data-href', 'data-download']: |
|
try: |
|
value = await elem.get_attribute(attr) |
|
if value and any(value.lower().endswith(ext) for ext in all_exts): |
|
file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': await self.get_file_size(file_url), |
|
'metadata': {}, |
|
'download_url': file_url |
|
}) |
|
except: |
|
pass |
|
|
|
|
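# Inspect embedded JSON <script> blocks for URL-like values that point at downloadable files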
|
script_elements = soup.find_all('script', type='application/json') |
|
for script in script_elements: |
|
try: |
|
json_data = json.loads(script.string) |
|
|
|
def extract_urls_from_json(obj, urls_found=None): |
|
if urls_found is None: |
|
urls_found = [] |
|
if isinstance(obj, dict): |
|
for k, v in obj.items(): |
|
|
|
url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] |
|
if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): |
|
urls_found.append(v) |
|
else: |
|
extract_urls_from_json(v, urls_found) |
|
elif isinstance(obj, list): |
|
for item in obj: |
|
extract_urls_from_json(item, urls_found) |
|
return urls_found |
|
|
|
json_urls = extract_urls_from_json(json_data) |
|
for json_url in json_urls: |
|
if any(json_url.lower().endswith(ext) for ext in all_exts): |
|
found_files.append({ |
|
'url': json_url, |
|
'filename': os.path.basename(json_url.split('?')[0]), |
|
'size': await self.get_file_size(json_url), |
|
'metadata': {}, |
|
'download_url': json_url |
|
}) |
|
except: |
|
pass |
|
|
|
|
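# Probe the DOM for hidden download forms and links styled display:none or visibility:hidden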
|
hidden_elements = await self.page.evaluate(""" |
|
() => { |
|
const results = []; |
|
|
|
// Check for hidden forms with download actions |
|
const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); |
|
for (const form of forms) { |
|
const action = form.getAttribute('action') || ''; |
|
results.push({ |
|
type: 'form', |
|
action: action, |
|
inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { |
|
return {name: input.name, value: input.value}; |
|
}) |
|
}); |
|
} |
|
|
|
// Check for hidden download links/buttons |
|
const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { |
|
const style = window.getComputedStyle(a); |
|
return (style.display === 'none' || style.visibility === 'hidden') && |
|
(a.href.includes('download') || a.href.includes('file')); |
|
}); |
|
|
|
for (const link of hiddenLinks) { |
|
results.push({ |
|
type: 'link', |
|
href: link.href, |
|
text: link.innerText || link.textContent |
|
}); |
|
} |
|
|
|
return results; |
|
} |
|
""") |
|
|
|
|
|
for elem in hidden_elements: |
|
if elem['type'] == 'link' and 'href' in elem: |
|
href = elem['href'] |
|
if any(href.lower().endswith(ext) for ext in all_exts): |
|
found_files.append({ |
|
'url': href, |
|
'filename': os.path.basename(href.split('?')[0]), |
|
'size': await self.get_file_size(href), |
|
'metadata': {}, |
|
'download_url': href |
|
}) |
|
|
|
|
|
hidden_links = await self.discover_hidden_links(self.page) |
|
for link in hidden_links: |
|
if any(link.lower().endswith(ext) for ext in all_exts): |
|
found_files.append({ |
|
'url': link, |
|
'filename': os.path.basename(link.split('?')[0]), |
|
'size': await self.get_file_size(link), |
|
'metadata': {}, |
|
'download_url': link |
|
}) |
|
|
|
|
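# De-duplicate by URL while keeping the original discovery order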
|
seen_urls = set() |
|
unique_files = [] |
|
for f in found_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
return unique_files |
|
except Exception as e: |
|
logger.error(f"Error extracting files from {url}: {e}") |
|
traceback.print_exc() |
|
return [] |
|
|
|
async def download_file(self, file_info, save_dir, referer): |
|
file_url = file_info.get('download_url', file_info['url']) |
|
fname = file_info['filename'] |
|
path = os.path.join(save_dir, fname) |
|
base, ext = os.path.splitext(fname) |
|
counter = 1 |
|
while os.path.exists(path): |
|
path = os.path.join(save_dir, f"{base}_{counter}{ext}") |
|
counter += 1 |
|
os.makedirs(save_dir, exist_ok=True) |
|
|
|
|
|
if file_url in self.downloaded_files: |
|
logger.info(f"File already downloaded: {file_url}") |
|
return None |
|
|
|
try: |
|
|
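# Google Drive links get their own pipeline: view-only capture first, then the standard Drive download fallbacks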
|
if "drive.google.com" in file_url or "docs.google.com" in file_url: |
|
|
|
is_view_only = file_info.get('metadata', {}).get('view_only', False) |
|
|
|
|
|
if is_view_only: |
|
logger.info(f"Attempting to download view-only file: {file_url}") |
|
result_path = await self.force_download_viewonly(file_info, path) |
|
if result_path: |
|
self.downloaded_files.add(file_url) |
|
return result_path |
|
|
|
|
|
logger.info("Primary method failed, trying fallback methods") |
|
|
|
|
|
success = await self.download_from_google_drive(file_url, path) |
|
if success: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
|
|
|
|
logger.warning("All standard methods failed, attempting force download") |
|
result_path = await self.force_download_viewonly(file_info, path) |
|
if result_path: |
|
self.downloaded_files.add(file_url) |
|
return result_path
|
|
|
|
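# Server-side handlers (Action=downloadfile / fname=) rarely expose a direct file URL, so drive them through a real browser page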
|
if 'Action=downloadfile' in file_url or 'fname=' in file_url: |
|
logger.info(f"Using browser download approach for complex URL: {file_url}") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
async with self.context.new_page() as page: |
|
|
|
# Schedule the listener as a task so the download event is captured even if it fires during navigation
download_promise = asyncio.ensure_future(page.wait_for_event("download"))
|
|
|
|
|
try:
    await page.goto(file_url, timeout=60000)
except Exception:
    # Navigating straight to a file may abort the navigation once the download starts; the event still fires
    pass
|
|
|
|
|
try: |
|
download = await download_promise |
|
await download.save_as(path) |
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
except Exception as e: |
|
logger.error(f"Browser download failed: {e}") |
|
|
|
|
|
download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]')
for button in download_buttons:
    try:
        # Use a fresh download expectation per click; the earlier promise may already be consumed
        async with page.expect_download(timeout=10000) as dl_info:
            await button.click()
        download = await dl_info.value
        await download.save_as(path)
        if os.path.exists(path) and os.path.getsize(path) > 0:
            self.downloaded_files.add(file_url)
            return path
    except Exception:
        continue
|
|
|
|
|
logger.info("Browser approach failed, trying direct request") |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
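# Plain requests attempt with browser-like headers before falling back to a Playwright request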
|
try: |
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Referer': referer, |
|
'DNT': '1' |
|
} |
|
|
|
with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: |
|
if response.status_code == 200: |
|
|
|
content_type = response.headers.get('Content-Type', '') |
|
if 'text/html' in content_type and not file_url.endswith('.html'): |
|
logger.warning(f"Received HTML instead of expected file: {file_url}") |
|
else: |
|
with open(path, 'wb') as f: |
|
for chunk in response.iter_content(chunk_size=8192): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
except Exception as e: |
|
logger.warning(f"Direct download failed: {e}, trying browser approach") |
|
|
|
|
|
async with self.context.new_page() as page: |
|
headers = { |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Referer': referer |
|
} |
|
|
|
|
|
try: |
|
response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) |
|
if response.status == 200: |
|
content = await response.body() |
|
with open(path, 'wb') as f: |
|
f.write(content) |
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
else: |
|
logger.error(f"Download failed with status {response.status}: {file_url}") |
|
|
|
|
|
error_info = await response.text() |
|
logger.debug(f"Error response: {error_info[:200]}...") |
|
|
|
|
|
if detect_captcha(error_info): |
|
logger.warning("Captcha detected during download") |
|
|
|
|
|
except PlaywrightTimeoutError: |
|
logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}") |
|
|
|
|
|
try: |
|
logger.info("Trying browser download manager approach") |
|
download_promise = asyncio.ensure_future(page.wait_for_event("download"))
|
await page.goto(file_url, timeout=60000) |
|
|
|
|
|
download = await download_promise |
|
await download.save_as(path) |
|
|
|
if os.path.exists(path) and os.path.getsize(path) > 0: |
|
self.downloaded_files.add(file_url) |
|
return path |
|
except Exception as e: |
|
logger.error(f"Browser download manager approach failed: {e}") |
|
|
|
return None |
|
except Exception as e: |
|
logger.error(f"Error downloading {file_url}: {e}") |
|
return None |
|
|
|
async def force_download_viewonly(self, file_info, save_path): |
|
"""Completely rewritten method to handle view-only files reliably, especially multi-page PDFs""" |
|
try: |
|
|
|
file_id = file_info.get('metadata', {}).get('file_id') |
|
if not file_id: |
|
url = file_info['url'] |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if not file_id: |
|
logger.error("Could not extract file ID") |
|
return None |
|
|
|
file_type = file_info.get('metadata', {}).get('file_type', 'pdf') |
|
base, ext = os.path.splitext(save_path) |
|
if not ext: |
|
save_path = f"{base}.{file_type}" |
|
|
|
logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") |
|
|
|
|
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
'--disable-site-isolation-trials', |
|
'--disable-blink-features=AutomationControlled' |
|
] |
|
|
|
browser = await self.playwright.chromium.launch( |
|
headless=True, |
|
args=browser_args |
|
) |
|
|
|
|
|
context = await browser.new_context( |
|
viewport={'width': 1600, 'height': 1200}, |
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", |
|
device_scale_factor=2.0, |
|
accept_downloads=True |
|
) |
|
|
|
|
|
await context.add_init_script(""" |
|
() => { |
|
Object.defineProperty(navigator, 'webdriver', { |
|
get: () => false, |
|
}); |
|
|
|
// Change plugins |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ |
|
lengthComputable: true, |
|
loaded: 100, |
|
total: 100 |
|
})) |
|
}); |
|
|
|
// Handle languages |
|
Object.defineProperty(navigator, 'languages', { |
|
get: () => ['en-US', 'en', 'es'] |
|
}); |
|
|
|
// Modify hardware concurrency |
|
Object.defineProperty(navigator, 'hardwareConcurrency', { |
|
get: () => 4 |
|
}); |
|
} |
|
""") |
|
|
|
page = await context.new_page() |
|
|
|
try: |
|
|
|
logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) |
|
await page.wait_for_load_state('networkidle') |
|
|
|
|
|
content = await page.content() |
|
if "the owner has not granted you permission to" in content: |
|
logger.warning("Permission denied error detected") |
|
|
|
|
|
await page.wait_for_timeout(random.randint(3000, 7000)) |
|
|
|
|
|
temp_dir = tempfile.mkdtemp() |
|
|
|
|
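# View-only PDFs: scroll until every page is rendered as a blob image, then stitch the images into a PDF with jsPDF in-page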
|
if file_type.lower() == 'pdf': |
|
|
|
|
|
|
|
await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400)) |
|
await page.wait_for_timeout(random.randint(500, 1000)) |
|
|
|
|
|
estimated_pages = await page.evaluate(""" |
|
() => { |
|
// Method 1: Check page counter text |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
const text = pageCounters[0].textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) return parseInt(match[2]); |
|
} |
|
|
|
// Method 2: Check actual page elements |
|
const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pageElements.length > 0) return pageElements.length; |
|
|
|
// Method 3: Look for page thumbnails |
|
const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); |
|
if (thumbnails.length > 0) return thumbnails.length; |
|
|
|
// Fallback: conservative guess |
|
return 50; |
|
} |
|
""") |
|
|
|
logger.info(f"Estimated {estimated_pages} pages in PDF") |
|
|
|
|
|
logger.info("Initial scroll to bottom to trigger lazy loading...") |
|
await page.keyboard.press("End") |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
logger.info("Scrolling page by page...") |
|
max_attempts = min(estimated_pages * 3, 300) |
|
attempt = 0 |
|
prev_blob_count = 0 |
|
|
|
while attempt < max_attempts: |
|
blob_count = await page.evaluate(""" |
|
Array.from(document.getElementsByTagName('img')) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.length |
|
""") |
|
|
|
logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") |
|
|
|
if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): |
|
logger.info("All pages appear to be loaded.") |
|
break |
|
|
|
|
|
if attempt % 3 == 0: |
|
await page.keyboard.press("End") |
|
else: |
|
await page.keyboard.press("PageDown") |
|
|
|
|
|
await page.wait_for_timeout(random.randint(1500, 3000)) |
|
|
|
|
|
if attempt % 4 == 0: |
|
await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) |
|
|
|
prev_blob_count = blob_count |
|
attempt += 1 |
|
|
|
|
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
# Schedule the listener before the in-page jsPDF save triggers the browser download
download_promise = asyncio.ensure_future(page.wait_for_event("download"))
|
|
|
|
|
logger.info("Generating PDF from loaded pages...") |
|
result = await page.evaluate(r''' |
|
(function() { |
|
return new Promise((resolve, reject) => { |
|
let script = document.createElement("script"); |
|
script.onload = function () { |
|
try { |
|
let pdf = new jsPDF(); |
|
let imgs = Array.from(document.getElementsByTagName("img")) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.sort((a, b) => { |
|
const rectA = a.getBoundingClientRect(); |
|
const rectB = b.getBoundingClientRect(); |
|
return rectA.top - rectB.top; |
|
}); |
|
|
|
console.log(`Found ${imgs.length} valid page images to add to PDF`); |
|
|
|
let added = 0; |
|
for (let i = 0; i < imgs.length; i++) { |
|
let img = imgs[i]; |
|
let canvas = document.createElement("canvas"); |
|
let ctx = canvas.getContext("2d"); |
|
canvas.width = img.width; |
|
canvas.height = img.height; |
|
ctx.drawImage(img, 0, 0, img.width, img.height); |
|
let imgData = canvas.toDataURL("image/jpeg", 1.0); |
|
|
|
if (added > 0) { |
|
pdf.addPage(); |
|
} |
|
|
|
pdf.addImage(imgData, 'JPEG', 0, 0); |
|
added++; |
|
} |
|
|
|
pdf.save("download.pdf"); |
|
resolve({success: true, pageCount: added}); |
|
} catch (error) { |
|
reject({success: false, error: error.toString()}); |
|
} |
|
}; |
|
|
|
script.onerror = function() { |
|
reject({success: false, error: "Failed to load jsPDF library"}); |
|
}; |
|
|
|
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; |
|
document.body.appendChild(script); |
|
}); |
|
})(); |
|
''') |
|
|
|
if not result.get('success', False): |
|
logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") |
|
|
|
|
|
logger.info("Trying fallback screenshot method...") |
|
|
|
|
|
await page.evaluate(""" |
|
() => { |
|
// Find and click the "first page" button if available |
|
const buttons = Array.from(document.querySelectorAll('button')); |
|
const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); |
|
if (firstPageBtn) firstPageBtn.click(); |
|
} |
|
""") |
|
await page.wait_for_timeout(1000)
|
|
|
|
|
screenshots = [] |
|
current_page = 1 |
|
max_pages = estimated_pages |
|
|
|
|
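# Fallback: step through the viewer with the "Next page" button and screenshot each page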
|
while current_page <= max_pages: |
|
screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") |
|
|
|
|
|
page_elem = await page.query_selector('.drive-viewer-paginated-page') |
|
if page_elem: |
|
await page_elem.screenshot(path=screenshot_path) |
|
else: |
|
|
|
await page.screenshot(path=screenshot_path) |
|
|
|
screenshots.append(screenshot_path) |
|
|
|
|
|
next_btn = await page.query_selector('button[aria-label="Next page"]') |
|
if next_btn: |
|
is_disabled = await next_btn.get_attribute('disabled') |
|
if is_disabled: |
|
logger.info(f"Reached end of document at page {current_page}") |
|
break |
|
|
|
await next_btn.click() |
|
await page.wait_for_timeout(1000) |
|
current_page += 1 |
|
else: |
|
break |
|
|
|
|
|
if screenshots:
    # Build the PDF page by page, sizing each page to its own screenshot
    c = canvas.Canvas(save_path)
    for screenshot in screenshots:
        img = Image.open(screenshot)
        width, height = img.size
        c.setPageSize((width, height))
        c.drawImage(screenshot, 0, 0, width, height)
        c.showPage()
    c.save()

    for screenshot in screenshots:
        os.remove(screenshot)

    await browser.close()
    return save_path

await browser.close()
return None
|
|
|
logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") |
|
|
|
|
|
download = await download_promise |
|
await download.save_as(save_path) |
|
|
|
|
|
try: |
|
os.rmdir(temp_dir) |
|
except: |
|
pass |
|
|
|
else: |
|
|
|
screenshot_path = os.path.join(temp_dir, "file.png") |
|
await page.screenshot(path=screenshot_path) |
|
|
|
if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: |
|
|
|
await self.export_google_doc(file_id, file_type, save_path) |
|
else: |
|
|
|
shutil.copy(screenshot_path, save_path) |
|
|
|
os.remove(screenshot_path) |
|
|
|
|
|
await browser.close() |
|
|
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: |
|
logger.info(f"Successfully downloaded file to {save_path}") |
|
return save_path |
|
else: |
|
logger.error(f"Generated file is too small or missing: {save_path}") |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Error during force download: {e}") |
|
if browser: |
|
await browser.close() |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Force download preparation failed: {e}") |
|
return None |
|
|
|
async def download_from_google_drive(self, url, save_path): |
|
"""Enhanced method to download from Google Drive with multiple fallback approaches""" |
|
|
|
file_id = None |
|
url_patterns = [ |
|
r'drive\.google\.com/file/d/([^/]+)', |
|
r'drive\.google\.com/open\?id=([^&]+)', |
|
r'docs\.google\.com/\w+/d/([^/]+)', |
|
r'id=([^&]+)', |
|
r'drive\.google\.com/uc\?id=([^&]+)', |
|
] |
|
|
|
for pattern in url_patterns: |
|
match = re.search(pattern, url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if not file_id: |
|
logger.error(f"Could not extract file ID from URL: {url}") |
|
return False |
|
|
|
|
|
file_type, is_view_only = await self.get_google_drive_file_info(file_id) |
|
logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") |
|
|
|
base, ext = os.path.splitext(save_path) |
|
if not ext and file_type: |
|
|
|
save_path = f"{base}.{file_type}" |
|
|
|
|
|
if is_view_only: |
|
|
|
if file_type == 'pdf': |
|
success = await self.download_viewonly_pdf_with_js(file_id, save_path) |
|
if success: |
|
return True |
|
|
|
|
|
if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: |
|
success = await self.export_google_doc(file_id, file_type, save_path) |
|
if success: |
|
return True |
|
|
|
|
|
success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type) |
|
if success: |
|
return True |
|
|
|
|
|
try: |
|
|
|
direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" |
|
|
|
|
|
headers = { |
|
'User-Agent': get_random_user_agent(), |
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', |
|
'Accept-Language': 'en-US,en;q=0.9', |
|
'Referer': 'https://drive.google.com/', |
|
'DNT': '1' |
|
} |
|
|
|
|
|
with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: |
|
if r.status_code == 200: |
|
|
|
content_type = r.headers.get('Content-Type', '') |
|
# A Drive file ID never carries an extension, so just check whether HTML came back instead of the file
if 'text/html' in content_type:
|
logger.warning("Received HTML instead of file, trying with session cookies") |
|
else: |
|
|
|
with open(save_path, 'wb') as f: |
|
for chunk in r.iter_content(chunk_size=8192): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 0: |
|
logger.info("Direct download successful") |
|
return True |
|
|
|
|
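# Fall back to a requests session that replays Google's download-warning confirmation cookie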
|
session = requests.Session() |
|
session.headers.update({'User-Agent': get_random_user_agent()}) |
|
|
|
|
|
session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30) |
|
|
|
|
|
url = f"https://drive.google.com/uc?id={file_id}&export=download" |
|
response = session.get(url, stream=True, timeout=30) |
|
|
|
|
|
confirmation_token = None |
|
for k, v in response.cookies.items(): |
|
if k.startswith('download_warning'): |
|
confirmation_token = v |
|
break |
|
|
|
|
|
if confirmation_token: |
|
url = f"{url}&confirm={confirmation_token}" |
|
response = session.get(url, stream=True, timeout=60) |
|
|
|
|
|
content_type = response.headers.get('Content-Type', '') |
|
if 'text/html' in content_type: |
|
logger.warning("Received HTML instead of file - likely download restriction") |
|
else: |
|
with open(save_path, 'wb') as f: |
|
for chunk in response.iter_content(chunk_size=1024*1024): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 0: |
|
with open(save_path, 'rb') as f: |
|
content = f.read(100) |
|
if b'<!DOCTYPE html>' not in content: |
|
logger.info("Successfully downloaded with requests session") |
|
return True |
|
except Exception as e: |
|
logger.warning(f"Requests session download failed: {e}") |
|
|
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
download_promise = asyncio.ensure_future(page.wait_for_event("download"))
|
|
|
|
|
download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') |
|
if download_button: |
|
await download_button.click() |
|
|
|
|
|
try: |
|
download = await download_promise |
|
await download.save_as(save_path) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
except Exception as e: |
|
logger.error(f"Error during browser download: {e}") |
|
return False |
|
else: |
|
|
|
await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) |
|
|
|
|
|
download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') |
|
for elem in download_elements: |
|
try: |
|
await elem.click() |
|
|
|
try: |
|
download = await download_promise |
|
await download.save_as(save_path) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
except: |
|
pass |
|
except: |
|
continue |
|
except Exception as e: |
|
logger.error(f"Browser-based download attempt failed: {e}") |
|
|
|
logger.warning("All standard download methods failed") |
|
return False |
|
|
|
async def download_viewonly_pdf_with_js(self, file_id, save_path): |
|
"""Download view-only PDF using the enhanced blob image caching technique""" |
|
try: |
|
|
|
browser_args = [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
'--disable-blink-features=AutomationControlled' |
|
] |
|
|
|
browser = await self.playwright.chromium.launch( |
|
headless=True, |
|
args=browser_args |
|
) |
|
|
|
|
|
context = await browser.new_context( |
|
viewport={'width': 1600, 'height': 1200}, |
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", |
|
accept_downloads=True, |
|
ignore_https_errors=True |
|
) |
|
|
|
|
|
await context.add_init_script(""" |
|
() => { |
|
Object.defineProperty(navigator, 'webdriver', { |
|
get: () => false, |
|
}); |
|
|
|
// Change plugins and languages to appear more human |
|
Object.defineProperty(navigator, 'plugins', { |
|
get: () => [1, 2, 3, 4, 5].map(() => ({ |
|
lengthComputable: true, |
|
loaded: 100, |
|
total: 100 |
|
})) |
|
}); |
|
|
|
Object.defineProperty(navigator, 'languages', { |
|
get: () => ['en-US', 'en', 'es'] |
|
}); |
|
} |
|
""") |
|
|
|
page = await context.new_page() |
|
|
|
try: |
|
|
|
logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) |
|
await page.wait_for_load_state('networkidle') |
|
|
|
|
|
await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) |
|
await page.wait_for_timeout(random.randint(2000, 5000)) |
|
|
|
|
|
estimated_pages = await page.evaluate(""" |
|
() => { |
|
// Look for page counter in the interface |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
const text = pageCounters[0].textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) return parseInt(match[2]); |
|
} |
|
|
|
// If we can't find a counter, check actual pages |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pages.length > 0) return pages.length; |
|
|
|
// Default to a reasonable number if we can't determine |
|
return 50; |
|
} |
|
""") |
|
|
|
logger.info(f"Estimated number of pages: {estimated_pages}") |
|
|
|
|
|
logger.info("Initial scroll to bottom to trigger lazy loading...") |
|
await page.keyboard.press("End") |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
logger.info("Scrolling through document to load all pages...") |
|
max_attempts = min(estimated_pages * 3, 300) |
|
attempt = 0 |
|
prev_blob_count = 0 |
|
consecutive_same_count = 0 |
|
|
|
while attempt < max_attempts: |
|
|
|
blob_count = await page.evaluate(""" |
|
Array.from(document.getElementsByTagName('img')) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.length |
|
""") |
|
|
|
logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") |
|
|
|
|
|
if blob_count >= estimated_pages: |
|
logger.info(f"All {estimated_pages} pages appear to be loaded.") |
|
break |
|
|
|
if blob_count == prev_blob_count: |
|
consecutive_same_count += 1 |
|
if consecutive_same_count >= 5 and blob_count > 0: |
|
logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.") |
|
break |
|
else: |
|
consecutive_same_count = 0 |
|
|
|
|
|
scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) |
|
|
|
if scroll_action == "PageDown": |
|
await page.keyboard.press("PageDown") |
|
elif scroll_action == "End": |
|
await page.keyboard.press("End") |
|
elif scroll_action == "ArrowDown": |
|
|
|
for _ in range(random.randint(5, 15)): |
|
await page.keyboard.press("ArrowDown") |
|
await page.wait_for_timeout(random.randint(50, 150)) |
|
else: |
|
|
|
current_y = random.randint(300, 700) |
|
await page.mouse.move(x=random.randint(300, 800), y=current_y) |
|
await page.mouse.wheel(0, random.randint(300, 800)) |
|
|
|
|
|
await page.wait_for_timeout(random.randint(1000, 3000)) |
|
|
|
prev_blob_count = blob_count |
|
attempt += 1 |
|
|
|
|
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
download_promise = asyncio.ensure_future(page.wait_for_event("download"))
|
|
|
|
|
logger.info("Generating PDF from loaded pages...") |
|
result = await page.evaluate(r''' |
|
(function() { |
|
return new Promise((resolve, reject) => { |
|
let script = document.createElement("script"); |
|
script.onload = function () { |
|
try { |
|
let pdf = new jsPDF(); |
|
let imgs = document.getElementsByTagName("img"); |
|
let validImages = []; |
|
|
|
// First collect all valid blob images |
|
for (let i = 0; i < imgs.length; i++) { |
|
let img = imgs[i]; |
|
if (!/^blob:/.test(img.src)) continue; |
|
if (img.width < 100 || img.height < 100) continue; |
|
validImages.push(img); |
|
} |
|
|
|
// Sort by position in the document |
|
validImages.sort((a, b) => { |
|
const rectA = a.getBoundingClientRect(); |
|
const rectB = b.getBoundingClientRect(); |
|
return rectA.top - rectB.top; |
|
}); |
|
|
|
console.log(`Found ${validImages.length} valid page images to add to PDF`); |
|
|
|
let added = 0; |
|
// Process each image as a page |
|
for (let i = 0; i < validImages.length; i++) { |
|
let img = validImages[i]; |
|
let canvas = document.createElement("canvas"); |
|
let ctx = canvas.getContext("2d"); |
|
canvas.width = img.width; |
|
canvas.height = img.height; |
|
ctx.drawImage(img, 0, 0, img.width, img.height); |
|
let imgData = canvas.toDataURL("image/jpeg", 1.0); |
|
|
|
if (added > 0) { |
|
pdf.addPage(); |
|
} |
|
|
|
pdf.addImage(imgData, 'JPEG', 0, 0); |
|
added++; |
|
} |
|
|
|
pdf.save("download.pdf"); |
|
resolve({success: true, pageCount: added}); |
|
} catch (error) { |
|
reject({success: false, error: error.toString()}); |
|
} |
|
}; |
|
|
|
script.onerror = function() { |
|
reject({success: false, error: "Failed to load jsPDF library"}); |
|
}; |
|
|
|
// Use a reliable CDN |
|
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; |
|
document.body.appendChild(script); |
|
}); |
|
})(); |
|
''') |
|
|
|
if not result.get('success'): |
|
logger.error(f"Error in PDF generation: {result.get('error')}") |
|
return False |
|
|
|
logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") |
|
|
|
|
|
download = await download_promise |
|
|
|
|
|
await download.save_as(save_path) |
|
logger.info(f"Successfully saved PDF to {save_path}") |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 |
|
|
|
finally: |
|
await browser.close() |
|
|
|
except Exception as e: |
|
logger.error(f"Error in viewonly PDF download process: {e}") |
|
return False |
|
|
|
async def download_viewonly_with_screenshots(self, file_id, save_path, file_type): |
|
"""Download any view-only file by taking screenshots""" |
|
try: |
|
async with self.context.new_page() as page: |
|
|
|
await page.set_viewport_size({"width": 1600, "height": 1200}) |
|
|
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000) |
|
|
|
|
|
await page.wait_for_load_state('networkidle') |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
base_dir = os.path.dirname(save_path) |
|
base_name = os.path.splitext(os.path.basename(save_path))[0] |
|
screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots") |
|
os.makedirs(screenshots_dir, exist_ok=True) |
|
|
|
|
|
is_multi_page = await page.evaluate(""" |
|
() => { |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
return pages.length > 1; |
|
} |
|
""") |
|
|
|
if is_multi_page and file_type == 'pdf': |
|
|
|
page_count = await page.evaluate(""" |
|
async () => { |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
const container = document.querySelector('.drive-viewer-paginated-scrollable'); |
|
|
|
if (!container || pages.length === 0) return 0; |
|
|
|
// Scroll through to make sure all pages are loaded |
|
const scrollHeight = container.scrollHeight; |
|
const viewportHeight = container.clientHeight; |
|
const scrollStep = viewportHeight; |
|
|
|
for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) { |
|
container.scrollTo(0, scrollPos); |
|
await delay(300); |
|
} |
|
|
|
// Scroll back to top |
|
container.scrollTo(0, 0); |
|
await delay(300); |
|
|
|
return pages.length; |
|
} |
|
""") |
|
|
|
logger.info(f"Found {page_count} pages in document") |
|
|
|
|
|
screenshots = [] |
|
for i in range(page_count): |
|
|
|
await page.evaluate(f""" |
|
async () => {{ |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pages.length <= {i}) return false; |
|
|
|
pages[{i}].scrollIntoView(); |
|
await delay(500); |
|
return true; |
|
}} |
|
""") |
|
|
|
|
|
screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") |
|
await page.screenshot(path=screenshot_path, clip={ |
|
'x': 0, |
|
'y': 0, |
|
'width': 1600, |
|
'height': 1200 |
|
}) |
|
screenshots.append(screenshot_path) |
|
|
|
|
|
c = canvas.Canvas(save_path) |
|
for screenshot in screenshots: |
|
img = Image.open(screenshot) |
|
width, height = img.size |
|
|
|
|
|
c.setPageSize((width, height)) |
|
c.drawImage(screenshot, 0, 0, width, height) |
|
c.showPage() |
|
|
|
c.save() |
|
|
|
|
|
for screenshot in screenshots: |
|
os.remove(screenshot) |
|
os.rmdir(screenshots_dir) |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
else: |
|
|
|
screenshot_path = os.path.join(screenshots_dir, "screenshot.png") |
|
await page.screenshot(path=screenshot_path, full_page=True)
|
|
|
|
|
if file_type == 'pdf': |
|
|
|
img = Image.open(screenshot_path) |
|
width, height = img.size |
|
|
|
c = canvas.Canvas(save_path, pagesize=(width, height)) |
|
c.drawImage(screenshot_path, 0, 0, width, height) |
|
c.save() |
|
else: |
|
|
|
shutil.copy(screenshot_path, save_path) |
|
|
|
|
|
os.remove(screenshot_path) |
|
os.rmdir(screenshots_dir) |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
|
|
except Exception as e: |
|
logger.error(f"Error taking screenshots: {e}") |
|
return False |
|
|
|
async def export_google_doc(self, file_id, file_type, save_path): |
|
"""Export Google Docs/Sheets/Slides to downloadable formats""" |
|
try: |
|
|
|
export_formats = { |
|
'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |
|
'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |
|
'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
|
'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
|
'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', |
|
'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', |
|
'pdf': 'application/pdf', |
|
} |
|
|
|
export_format = export_formats.get(file_type, 'application/pdf') |
|
export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}" |
|
|
|
if 'sheet' in file_type or 'xlsx' in file_type: |
|
export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" |
|
elif 'ppt' in file_type or 'presentation' in file_type: |
|
export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx" |
|
elif file_type == 'pdf': |
|
export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf" |
|
|
|
async with self.context.new_page() as page: |
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') |
|
|
|
|
|
response = await page.goto(export_url, wait_until='networkidle') |
|
|
|
if response.status == 200: |
|
content = await response.body() |
|
with open(save_path, 'wb') as f: |
|
f.write(content) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
else: |
|
logger.warning(f"Export failed with status {response.status}") |
|
return False |
|
|
|
except Exception as e: |
|
logger.error(f"Error exporting Google Doc: {e}") |
|
return False |
|
|
|
async def get_google_drive_file_info(self, file_id): |
|
"""Get file type and view-only status from Google Drive""" |
|
file_type = None |
|
is_view_only = False |
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) |
|
|
|
|
|
view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') |
|
is_view_only = view_only_text is not None |
|
|
|
|
|
gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') |
|
gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') |
|
gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') |
|
|
|
if gdocs_viewer: |
|
file_type = 'docx' |
|
elif gsheets_viewer: |
|
file_type = 'xlsx' |
|
elif gslides_viewer: |
|
file_type = 'pptx' |
|
else: |
|
|
|
pdf_viewer = await page.query_selector('embed[type="application/pdf"]') |
|
if pdf_viewer: |
|
file_type = 'pdf' |
|
else: |
|
|
|
img_viewer = await page.query_selector('img[src*="googleusercontent.com"]') |
|
if img_viewer: |
|
|
|
img_src = await img_viewer.get_attribute('src') |
|
if 'jpg' in img_src or 'jpeg' in img_src: |
|
file_type = 'jpg' |
|
elif 'png' in img_src: |
|
file_type = 'png' |
|
else: |
|
file_type = 'jpg' |
|
else: |
|
|
|
file_type = 'pdf' |
|
|
|
|
|
if not file_type: |
|
title_element = await page.query_selector('div[role="heading"]') |
|
if title_element: |
|
title = await title_element.text_content() |
|
if title: |
|
ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) |
|
if ext_match: |
|
file_type = ext_match.group(1).lower() |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting Google Drive file info: {e}") |
|
file_type = 'pdf' |
|
|
|
return file_type, is_view_only |
|
|
|
|
|
async def get_sublinks(self, url, limit=10000): |
|
"""Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" |
|
links = set() |
|
try: |
|
logger.info(f"Fetching sublinks from: {url}") |
|
|
|
|
|
if is_download_link(url): |
|
logger.info(f"URL appears to be a direct download link: {url}") |
|
links.add(url) |
|
return list(links)[:limit] |
|
|
|
|
|
normalized_url = normalize_download_url(url) |
|
if normalized_url in self.visited_urls: |
|
logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}") |
|
return list(links)[:limit] |
|
|
|
|
|
self.visited_urls.add(normalized_url) |
|
|
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized exam site sublink extraction") |
|
edu_links = await self.get_edu_exam_links(url) |
|
for link in edu_links: |
|
links.add(link) |
|
|
|
|
|
if len(links) > 5: |
|
logger.info(f"Found {len(links)} sublinks with specialized method") |
|
return list(links)[:limit] |
|
|
|
|
|
await self.rotate_proxy_if_needed() |
|
|
|
|
|
try: |
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
except Exception as e: |
|
logger.warning(f"Error navigating to URL for sublink extraction: {e}") |
|
|
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
|
await self.page.evaluate(""" |
|
async () => { |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const height = document.body.scrollHeight; |
|
const step = Math.floor(window.innerHeight / 2); |
|
|
|
for (let i = 0; i < height; i += step) { |
|
window.scrollTo(0, i); |
|
await delay(150); |
|
} |
|
|
|
window.scrollTo(0, 0); |
|
} |
|
""") |
|
await self.page.wait_for_timeout(1000) |
|
|
|
|
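# ASP.NET pages hide content behind postbacks, so detect them and interact with dropdowns and safe buttons to reveal more links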
|
is_aspnet = await self.page.evaluate(''' |
|
() => { |
|
return document.querySelector('form#aspnetForm') !== null || |
|
document.querySelector('input[name="__VIEWSTATE"]') !== null; |
|
} |
|
''') |
|
|
|
if is_aspnet: |
|
logger.info("Detected ASP.NET page, using enhanced extraction method") |
|
|
|
|
|
|
|
dropdowns = await self.page.query_selector_all('select') |
|
buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') |
|
|
|
|
|
for dropdown in dropdowns: |
|
try: |
|
|
|
options = await self.page.evaluate(''' |
|
(dropdown) => { |
|
return Array.from(dropdown.options).map(o => o.value); |
|
} |
|
''', dropdown) |
|
|
|
|
|
for option in options: |
|
if option: |
|
await dropdown.select_option(value=option) |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error interacting with dropdown: {e}") |
|
|
|
|
|
safe_buttons = [] |
|
for button in buttons: |
|
button_text = await button.text_content() or "" |
|
button_value = await button.get_attribute("value") or "" |
|
button_id = await button.get_attribute("id") or "" |
|
combined_text = (button_text + button_value + button_id).lower() |
|
|
|
|
|
if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): |
|
continue |
|
|
|
|
|
if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): |
|
safe_buttons.append(button) |
|
|
|
|
|
for button in safe_buttons[:5]: |
|
try: |
|
await button.click() |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error clicking button: {e}") |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
|
|
|
|
grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') |
|
for cell in grid_cells: |
|
try: |
|
href = await cell.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error extracting grid link: {e}") |
|
|
|
|
|
postback_links = await self.page.evaluate(''' |
|
() => { |
|
const results = []; |
|
// Find elements with onclick containing __doPostBack |
|
const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); |
|
for (const el of elements) { |
|
// Extract the postback target |
|
const onclick = el.getAttribute('onclick') || ''; |
|
const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); |
|
if (match && match[1]) { |
|
// Get the visible text to use as description |
|
const text = el.innerText || el.textContent || 'Link'; |
|
results.push({ |
|
id: match[1], |
|
text: text.trim() |
|
}); |
|
} |
|
} |
|
return results; |
|
} |
|
''') |
|
|
|
|
|
for postback in postback_links[:10]: |
|
try: |
|
logger.info(f"Trying postback link: {postback['text']} ({postback['id']})") |
|
await self.page.evaluate(f''' |
|
() => {{ |
|
if (typeof __doPostBack === 'function') {{ |
|
__doPostBack('{postback["id"]}', ''); |
|
}} |
|
}} |
|
''') |
|
await self.page.wait_for_timeout(1500) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error with postback: {e}") |
|
|
|
|
|
pagination_elements = await self.page.query_selector_all( |
|
'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]' |
|
) |
|
|
|
|
|
for i in range(min(5, len(pagination_elements))): |
|
try: |
|
|
|
el = pagination_elements[i] |
|
el_text = await el.text_content() or "" |
|
|
|
|
|
if "next" in el_text.lower() or ">" == el_text.strip() or "→" == el_text.strip(): |
|
logger.info(f"Clicking pagination control: {el_text}") |
|
await el.click() |
|
await self.page.wait_for_timeout(2000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error clicking pagination: {e}") |
|
|
|
|
|
hidden_links = await self.page.evaluate(""" |
|
() => { |
|
// Try to execute common JavaScript patterns that reveal hidden content |
|
try { |
|
// Common patterns used in websites to initially hide content |
|
const hiddenContainers = document.querySelectorAll( |
|
'.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' |
|
); |
|
|
|
// Attempt to make them visible |
|
hiddenContainers.forEach(el => { |
|
el.style.display = 'block'; |
|
el.style.visibility = 'visible'; |
|
el.classList.remove('hidden', 'hide'); |
|
}); |
|
|
|
// Return any newly visible links |
|
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); |
|
} catch (e) { |
|
return []; |
|
} |
|
} |
|
""") |
|
|
|
|
|
for href in hidden_links: |
|
if href and not href.startswith('javascript:'): |
|
links.add(href) |
|
|
|
|
|
download_links = await self.page.evaluate(""" |
|
() => { |
|
return Array.from(document.querySelectorAll('a[href]')) |
|
.filter(a => { |
|
const href = a.href.toLowerCase(); |
|
return href.includes('download') || |
|
href.includes('file') || |
|
href.includes('get') || |
|
href.includes('view.php') || |
|
href.includes('action=') || |
|
href.includes('fname='); |
|
}) |
|
.map(a => a.href); |
|
} |
|
""") |
|
|
|
for download_link in download_links: |
|
links.add(download_link) |
|
|
|
|
|
js_links = await self.discover_hidden_links(self.page) |
|
for link in js_links: |
|
links.add(link) |
|
|
|
logger.info(f"Found {len(links)} sublinks") |
|
|
|
|
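# Process likely direct-download URLs before generic pages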
|
prioritized_links = [] |
|
normal_links = [] |
|
|
|
for link in links: |
|
if is_download_link(link): |
|
prioritized_links.append(link) |
|
else: |
|
normal_links.append(link) |
|
|
|
|
|
result = prioritized_links + normal_links |
|
return result[:limit] |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting sublinks from {url}: {e}") |
|
return list(links)[:limit] |
|
|
|
async def extract_all_link_types(self, links_set, base_url, path_base): |
|
"""Extract all types of links from the current page""" |
|
|
|
a_links = await self.page.query_selector_all('a[href]') |
|
for a in a_links: |
|
try: |
|
href = await a.get_attribute('href') |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
iframes = await self.page.query_selector_all('iframe[src]') |
|
for iframe in iframes: |
|
try: |
|
src = await iframe.get_attribute('src') |
|
if src and not src.startswith('javascript:') and not src.startswith('about:'): |
|
full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') |
|
for el in onclick_elements: |
|
try: |
|
onclick = await el.get_attribute('onclick') |
|
urls = re.findall(r'(https?://[^\'"]+)', onclick) |
|
for url in urls: |
|
links_set.add(url) |
|
except Exception: |
|
pass |
|
|
|
|
|
data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') |
|
for el in data_elements: |
|
for attr in ['data-url', 'data-href', 'data-src']: |
|
try: |
|
value = await el.get_attribute(attr) |
|
if value and not value.startswith('javascript:'): |
|
full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') |
|
for anchor in special_anchors: |
|
try: |
|
href = await anchor.get_attribute('href') |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') |
|
for script in script_elements: |
|
try: |
|
script_content = await script.text_content() |
|
if script_content: |
|
|
|
urls = re.findall(r'(https?://[^\'"]+)', script_content) |
|
for url in urls: |
|
links_set.add(url) |
|
except Exception: |
|
pass |
|
|
|
def resolve_relative_url(self, relative_url, base_url, path_base): |
|
"""Properly resolve relative URLs considering multiple formats""" |
|
if relative_url.startswith('/'): |
|
|
|
return f"{base_url}{relative_url}" |
|
elif relative_url.startswith('./'): |
|
|
|
return f"{base_url}{path_base}/{relative_url[2:]}" |
|
elif relative_url.startswith('../'): |
|
|
|
parent_path = '/'.join(path_base.split('/')[:-1]) |
|
return f"{base_url}{parent_path}/{relative_url[3:]}" |
|
else: |
|
|
|
return f"{base_url}{path_base}/{relative_url}" |
|
|
|
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): |
|
if not custom_ext_list: |
|
custom_ext_list = [] |
|
progress_text = st.empty() |
|
progress_bar = st.progress(0) |
|
file_count_text = st.empty() |
|
|
|
try: |
|
|
|
self.visited_urls = set() |
|
|
|
progress_text.text("Analyzing main page...") |
|
|
|
is_aspnet = False |
|
try: |
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
is_aspnet = await self.page.evaluate(''' |
|
() => { |
|
return document.querySelector('form#aspnetForm') !== null || |
|
document.querySelector('input[name="__VIEWSTATE"]') !== null; |
|
} |
|
''') |
|
except Exception: |
|
pass |
|
|
|
|
|
if is_download_link(url): |
|
progress_text.text("URL appears to be a direct download. Analyzing...") |
|
|
|
|
|
normalized_url = normalize_download_url(url) |
|
file_info = { |
|
'url': normalized_url, |
|
'download_url': normalized_url, |
|
'filename': os.path.basename(urlparse(normalized_url).path) or 'download', |
|
'size': 'Unknown Size', |
|
'metadata': {} |
|
} |
|
|
|
|
|
self.visited_urls.add(normalized_url) |
|
progress_bar.progress(1.0) |
|
return [file_info] |
|
|
|
|
|
main_files = await self.extract_downloadable_files(url, custom_ext_list) |
|
initial_count = len(main_files) |
|
file_count_text.text(f"Found {initial_count} files on main page") |
|
|
|
|
|
progress_text.text("Getting sublinks...") |
|
sublinks = await self.get_sublinks(url, sublink_limit) |
|
total_links = len(sublinks) |
|
progress_text.text(f"Found {total_links} sublinks to process") |
|
|
|
|
|
all_files = main_files |
|
|
|
if not sublinks: |
|
progress_bar.progress(1.0) |
|
return all_files |
|
|
|
|
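# Visit each sublink: record direct downloads immediately, otherwise extract files from the page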
|
for i, sublink in enumerate(sublinks, 1): |
|
progress = i / total_links |
|
progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") |
|
progress_bar.progress(progress) |
|
|
|
try: |
|
|
|
if is_download_link(sublink): |
|
|
|
normalized_url = normalize_download_url(sublink) |
|
|
|
|
|
if normalized_url in self.visited_urls: |
|
continue |
|
|
|
|
|
self.visited_urls.add(normalized_url) |
|
|
|
|
|
size_str = await self.get_file_size(normalized_url) |
|
|
|
|
|
filename = os.path.basename(urlparse(normalized_url).path) |
|
if not filename or filename == '/' or '?' in filename: |
|
domain = get_domain(normalized_url) |
|
ext = '.pdf' |
|
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']: |
|
if common_ext in normalized_url.lower(): |
|
ext = common_ext |
|
break |
|
filename = f"file_from_{domain}{ext}" |
|
|
|
|
|
all_files.append({ |
|
'url': normalized_url, |
|
'download_url': normalized_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': {} |
|
}) |
|
file_count_text.text(f"Found {len(all_files)} total files") |
|
continue |
|
|
|
|
|
sub_timeout = timeout * 2 if is_aspnet else timeout |
|
|
|
|
|
if sublink in self.visited_urls: |
|
continue |
|
|
|
|
|
sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) |
|
all_files.extend(sub_files) |
|
file_count_text.text(f"Found {len(all_files)} total files") |
|
except Exception as e: |
|
logger.warning(f"Error processing sublink {sublink}: {e}") |
|
|
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in all_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
final_count = len(unique_files) |
|
progress_text.text(f"Deep search complete!") |
|
file_count_text.text(f"Found {final_count} unique files") |
|
progress_bar.progress(1.0) |
|
return unique_files |
|
|
|
except Exception as e: |
|
logger.error(f"Deep search error: {e}") |
|
progress_text.text(f"Error during deep search: {str(e)}") |
|
return [] |
|
|
|
finally: |
|
await asyncio.sleep(2) |
|
if not st.session_state.get('keep_progress', False): |
|
progress_text.empty() |
|
progress_bar.empty() |
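# Minimal usage sketch for deep_search (assumes the DownloadManager async
# context manager defined earlier in this file; call signature as used below):
#
#   async def _demo():
#       async with DownloadManager(use_proxy=False) as manager:
#           files = await manager.deep_search("https://example.com/resources",
#                                             [".pdf", ".docx"], 100, 30)
#           print(f"found {len(files)} candidate files")
#
#   # asyncio.run(_demo())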
|
|
|
|
|
def main(): |
|
|
|
|
|
st.markdown(""" |
|
<style> |
|
.stTabs [data-baseweb="tab-list"] { |
|
gap: 10px; |
|
} |
|
.stTabs [data-baseweb="tab"] { |
|
height: 50px; |
|
white-space: pre-wrap; |
|
border-radius: 4px 4px 0px 0px; |
|
padding: 10px 16px; |
|
background-color: #f0f2f6; |
|
} |
|
.stTabs [aria-selected="true"] { |
|
background-color: #ffffff !important; |
|
border-bottom: 2px solid #4c78a8; |
|
} |
|
.stFileUploader > div > div > button { |
|
width: 100%; |
|
} |
|
.main-header { |
|
font-size: 2.5rem; |
|
font-weight: 700; |
|
margin-bottom: 10px; |
|
} |
|
.section-subheader { |
|
font-size: 1.3rem; |
|
font-weight: 600; |
|
margin-top: 20px; |
|
margin-bottom: 10px; |
|
} |
|
.info-text { |
|
color: #6c757d; |
|
font-size: 0.9rem; |
|
} |
|
.stButton>button { |
|
width: 100%; |
|
} |
|
.result-card { |
|
background-color: #f8f9fa; |
|
border-radius: 6px; |
|
padding: 16px; |
|
margin-bottom: 12px; |
|
border-left: 4px solid #4c78a8; |
|
} |
|
.sidebar-header { |
|
font-size: 1.2rem; |
|
font-weight: 600; |
|
margin-bottom: 10px; |
|
} |
|
.sidebar-section { |
|
margin-bottom: 20px; |
|
} |
|
</style> |
|
""", unsafe_allow_html=True) |
|
|
|
|
|
if 'files' not in st.session_state: |
|
st.session_state.files = [] |
|
if 'downloaded_paths' not in st.session_state: |
|
st.session_state.downloaded_paths = [] |
|
if 'download_complete' not in st.session_state: |
|
st.session_state.download_complete = False |
|
if 'selected_tab' not in st.session_state: |
|
st.session_state.selected_tab = 0 |
|
if 'rag_search' not in st.session_state: |
|
st.session_state.rag_search = EnhancedRAGSearch() |
|
if 'keep_progress' not in st.session_state: |
|
st.session_state.keep_progress = False |
|
if 'google_credentials' not in st.session_state: |
|
st.session_state.google_credentials = None |
|
if 'mode' not in st.session_state: |
|
st.session_state.mode = "Standard" |
|
if 'use_proxy' not in st.session_state: |
|
st.session_state.use_proxy = False |
|
if 'proxy_string' not in st.session_state: |
|
st.session_state.proxy_string = None |
|
if 'stealth_mode' not in st.session_state: |
|
st.session_state.stealth_mode = True |
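# Session-state keys initialized above and used throughout main():
#   files / downloaded_paths / download_complete  -> search & download results
#   rag_search                                    -> EnhancedRAGSearch index for local files
#   mode / custom_extensions / prioritize_pdfs    -> per-mode search presets
#   use_proxy / proxy_string / stealth_mode       -> browser & network options
#   google_credentials                            -> Drive OAuth credentials
#   keep_progress                                 -> leave progress widgets visible after a run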
|
|
|
|
|
|
|
|
|
with st.sidebar: |
|
st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50) |
|
st.markdown("<p class='sidebar-header'>Advanced File Downloader</p>", unsafe_allow_html=True) |
|
|
|
|
|
st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) |
|
st.markdown("<p class='sidebar-header'>Mode</p>", unsafe_allow_html=True) |
|
mode = st.radio( |
|
"Select Mode", |
|
["Standard", "Education Mode", "Research Mode", "Media Mode"], |
|
label_visibility="collapsed", |
|
index=["Standard", "Education Mode", "Research Mode", "Media Mode"].index(st.session_state.mode), |
|
horizontal=False |
|
) |
|
|
|
if mode != st.session_state.mode: |
|
st.session_state.mode = mode |
|
|
|
if mode == "Education Mode": |
|
st.session_state.custom_extensions = ".pdf,.doc,.docx,.ppt,.pptx" |
|
st.session_state.prioritize_pdfs = True |
|
elif mode == "Research Mode": |
|
st.session_state.custom_extensions = ".pdf,.txt,.csv,.json,.xlsx" |
|
st.session_state.prioritize_pdfs = True |
|
elif mode == "Media Mode": |
|
st.session_state.custom_extensions = ".jpg,.png,.mp3,.mp4,.avi,.mov" |
|
st.session_state.prioritize_pdfs = False |
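# Switching modes overrides custom_extensions and prioritize_pdfs; the
# Search Options expander in the first tab picks these values up as its defaults.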
|
|
|
st.markdown(f"<div class='info-text'>Current: <b>{st.session_state.mode}</b></div>", unsafe_allow_html=True) |
|
st.markdown("</div>", unsafe_allow_html=True) |
|
|
|
|
|
st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) |
|
st.markdown("<p class='sidebar-header'>Quick Settings</p>", unsafe_allow_html=True) |
|
|
|
stealth_mode = st.checkbox("Stealth Mode", value=st.session_state.stealth_mode) |
|
if stealth_mode != st.session_state.stealth_mode: |
|
st.session_state.stealth_mode = stealth_mode |
|
|
|
use_proxy = st.checkbox("Use Proxy", value=st.session_state.use_proxy) |
|
if use_proxy != st.session_state.use_proxy: |
|
st.session_state.use_proxy = use_proxy |
|
|
|
if use_proxy: |
|
proxy_string = st.text_input("Proxy Address", |
|
placeholder="e.g., http://user:pass@host:port", |
|
value=st.session_state.proxy_string or "") |
|
if proxy_string != st.session_state.proxy_string: |
|
st.session_state.proxy_string = proxy_string |
|
|
|
st.markdown("</div>", unsafe_allow_html=True) |
|
|
|
|
|
st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) |
|
st.markdown("<p class='sidebar-header'>Google Drive</p>", unsafe_allow_html=True) |
|
|
|
if st.session_state.google_credentials: |
|
st.success("✅ Connected") |
|
|
|
drive_folder = st.text_input("Drive Folder",
    value=st.session_state.get('drive_folder', "File Downloader"))
if drive_folder != st.session_state.get('drive_folder'):
    st.session_state.drive_folder = drive_folder
|
|
|
if st.button("Disconnect Drive"): |
|
st.session_state.google_credentials = None |
|
st.rerun() |
|
else: |
|
st.warning("⚠️ Not Connected") |
|
if st.button("Connect Google Drive"): |
|
auth_url = get_google_auth_url() |
|
st.markdown(f"[Click here to authorize]({auth_url})") |
|
auth_code = st.text_input("Enter authorization code:") |
|
|
|
if auth_code: |
|
with st.spinner("Connecting to Google Drive..."): |
|
credentials, status_msg = exchange_code_for_credentials(auth_code) |
|
if credentials: |
|
st.session_state.google_credentials = credentials |
|
st.success(status_msg) |
|
st.rerun() |
|
else: |
|
st.error(status_msg) |
|
|
|
st.markdown("</div>", unsafe_allow_html=True) |
|
|
|
|
|
if st.session_state.mode == "Education Mode": |
|
st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) |
|
st.markdown("<p class='sidebar-header'>Quick Access</p>", unsafe_allow_html=True) |
|
st.markdown("<div class='info-text'>Common Educational Sites</div>", unsafe_allow_html=True) |
|
|
|
if st.button("Past Exam Papers"): |
|
st.session_state.preset_url = "https://pastpapers.example.edu" |
|
st.session_state.search_method = "Exam Site Mode" |
|
st.rerun() |
|
|
|
if st.button("Open Course Materials"): |
|
st.session_state.preset_url = "https://opencourseware.example.edu" |
|
st.session_state.search_method = "Deep Search" |
|
st.rerun() |
|
|
|
if st.button("Research Papers"): |
|
st.session_state.preset_url = "https://papers.example.org" |
|
st.session_state.search_method = "Deep Search" |
|
st.rerun() |
|
|
|
st.markdown("</div>", unsafe_allow_html=True) |
|
|
|
|
|
st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) |
|
st.markdown("<p class='sidebar-header'>System Status</p>", unsafe_allow_html=True) |
|
|
|
col1, col2 = st.columns(2) |
|
with col1: |
|
st.markdown("<div class='info-text'>Search</div>", unsafe_allow_html=True) |
|
st.markdown("<div style='color: green; font-weight: bold;'>Active</div>", unsafe_allow_html=True) |
|
with col2: |
|
st.markdown("<div class='info-text'>Browser</div>", unsafe_allow_html=True) |
|
st.markdown("<div style='color: green; font-weight: bold;'>Ready</div>", unsafe_allow_html=True) |
|
|
|
if st.button("Install Dependencies"): |
|
with st.spinner("Installing Playwright dependencies..."): |
|
install_playwright_dependencies() |
|
|
|
st.markdown("</div>", unsafe_allow_html=True) |
|
|
|
|
|
st.markdown("<div class='sidebar-section' style='position: absolute; bottom: 20px; width: 90%;'>", unsafe_allow_html=True) |
|
st.markdown("<div class='info-text' style='text-align: center;'>Version 2.0 • March 2025</div>", unsafe_allow_html=True) |
|
st.markdown("</div>", unsafe_allow_html=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
col1, col2 = st.columns([5, 1]) |
|
with col1: |
|
st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True) |
|
with col2: |
|
st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70) |
|
|
|
mode_descriptions = { |
|
"Standard": "A versatile tool for discovering and downloading files from any website.", |
|
"Education Mode": "Optimized for educational resources, exams, and academic materials.", |
|
"Research Mode": "Focused on research papers, datasets, and academic publications.", |
|
"Media Mode": "Enhanced for finding and downloading images, videos, and audio files." |
|
} |
|
|
|
st.markdown(f"<p class='info-text'>{mode_descriptions[st.session_state.mode]}</p>", unsafe_allow_html=True) |
|
|
|
|
|
tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"]) |
|
|
|
|
|
with tabs[0]: |
|
st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True) |
|
|
|
col1, col2 = st.columns([3, 1]) |
|
with col1: |
|
url = st.text_input("Enter a URL to search for downloadable files:", |
|
placeholder="e.g., https://example.com/resources", |
|
value=st.session_state.get('preset_url', '')) |
|
with col2: |
|
|
|
initial_search_method = st.session_state.get('search_method', "Deep Search") |
|
search_method = st.selectbox("Search Method", |
|
["Deep Search", "Quick Search", "Exam Site Mode"], |
|
index=["Deep Search", "Quick Search", "Exam Site Mode"].index(initial_search_method)) |
|
|
|
if search_method != st.session_state.get('search_method'): |
|
st.session_state.search_method = search_method |
|
|
|
|
|
with st.expander("Search Options", expanded=False): |
|
col1, col2, col3 = st.columns(3) |
|
with col1: |
|
depth = st.slider("Search Depth", min_value=1, max_value=5, value=2, |
|
help="Higher values will search more links but take longer") |
|
prioritize_pdfs = st.checkbox("Prioritize PDFs", |
|
value=st.session_state.get('prioritize_pdfs', True), |
|
help="Focus on finding PDF files first") |
|
with col2: |
|
timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60) |
|
follow_subdomains = st.checkbox("Follow Subdomains", value=True, |
|
help="Include links from subdomains in the search") |
|
with col3: |
|
|
|
default_extensions = { |
|
"Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip", |
|
"Education Mode": ".pdf,.doc,.docx,.ppt,.pptx", |
|
"Research Mode": ".pdf,.txt,.csv,.json,.xlsx", |
|
"Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov" |
|
} |
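# custom_extensions starts from the per-mode defaults above and is persisted in
# session state, so a manual edit survives reruns until the mode is changed.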
|
|
|
custom_extensions = st.text_area( |
|
"Custom File Extensions", |
|
value=st.session_state.get('custom_extensions', default_extensions[st.session_state.mode]), |
|
help="Comma-separated list of file extensions to look for" |
|
) |
|
|
|
|
|
if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions: |
|
st.session_state.custom_extensions = custom_extensions |
|
|
|
search_col1, search_col2 = st.columns([4, 1]) |
|
with search_col1: |
|
search_button = st.button("🔍 Start Search", use_container_width=True) |
|
with search_col2: |
|
clear_button = st.button("🧹 Clear Results", use_container_width=True) |
|
|
|
|
|
if st.session_state.files: |
|
st.markdown("<h3 class='section-subheader'>Found Files</h3>", unsafe_allow_html=True) |
|
|
|
|
|
filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1]) |
|
with filter_col1: |
|
file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.") |
|
with filter_col2: |
|
sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"]) |
|
with filter_col3: |
|
show_only_pdfs = st.checkbox("PDFs Only", value=False) |
|
|
|
|
|
sorted_files = list(st.session_state.files) |
|
if sort_option == "Name":
    sorted_files.sort(key=lambda x: x['filename'])
elif sort_option in ("Size (Largest)", "Size (Smallest)"):
    # Convert a human-readable size such as "1.2 MB" to bytes; unknown or
    # unparseable sizes sort last in either direction.
    def parse_size(size_str, unknown_value):
        if 'Unknown' in size_str:
            return unknown_value
        try:
            value, unit = size_str.split(' ')[:2]
            multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
            return float(value) * multipliers.get(unit, 0)
        except (ValueError, IndexError):
            return unknown_value

    if sort_option == "Size (Largest)":
        sorted_files.sort(key=lambda x: parse_size(x['size'], 0), reverse=True)
    else:
        sorted_files.sort(key=lambda x: parse_size(x['size'], float('inf')))
|
|
|
|
|
file_container = st.container() |
|
with file_container:
    selected_files = []
    displayed_files = []

    # Apply a pending "Select All" request *before* the checkboxes are drawn:
    # widget-keyed session state can only be written prior to instantiation.
    if st.session_state.pop('select_all_requested', False):
        for i in range(len(sorted_files)):
            st.session_state[f"select_{i}"] = True

    for i, file in enumerate(sorted_files):
|
|
|
if file_filter and file_filter.lower() not in file['filename'].lower(): |
|
continue |
|
if show_only_pdfs and not file['filename'].lower().endswith('.pdf'): |
|
continue |
|
|
|
displayed_files.append(i) |
|
with st.container(): |
|
col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1]) |
|
with col1: |
|
selected = st.checkbox("", key=f"select_{i}", value=True) |
|
if selected: |
|
selected_files.append(i) |
|
with col2: |
|
file_icon = "📄" |
|
if file['filename'].lower().endswith('.pdf'): |
|
file_icon = "📝" |
|
elif file['filename'].lower().endswith(('.doc', '.docx')): |
|
file_icon = "📋" |
|
elif file['filename'].lower().endswith(('.xls', '.xlsx')): |
|
file_icon = "📊" |
|
elif file['filename'].lower().endswith(('.ppt', '.pptx')): |
|
file_icon = "🖼️" |
|
elif file['filename'].lower().endswith(('.jpg', '.png', '.gif')): |
|
file_icon = "🖼️" |
|
elif file['filename'].lower().endswith(('.mp3', '.wav')): |
|
file_icon = "🔊" |
|
elif file['filename'].lower().endswith(('.mp4', '.avi', '.mov')): |
|
file_icon = "🎬" |
|
|
|
st.markdown(f"**{file_icon} {file['filename']}**") |
|
st.markdown(f"<span class='info-text'>{file['url'][:60]}...</span>", unsafe_allow_html=True) |
|
with col3: |
|
st.markdown(f"**Size:** {file['size']}") |
|
with col4: |
|
st.button("Preview", key=f"preview_{i}") |
|
|
|
st.divider() |
|
|
|
if not displayed_files: |
|
st.info("No files match your current filters. Try adjusting your search criteria.") |
|
|
|
|
|
if selected_files: |
|
col1, col2 = st.columns(2) |
|
with col1: |
|
download_dir = st.text_input("Download Directory", value="downloads") |
|
with col2: |
|
download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True) |
|
|
|
download_col1, download_col2, download_col3 = st.columns([3, 1, 1]) |
|
with download_col1: |
|
download_button = st.button("⬇️ Download Selected Files", use_container_width=True) |
|
with download_col2: |
|
google_drive_button = st.button("📤 Upload to Drive", |
|
use_container_width=True, |
|
disabled=not st.session_state.google_credentials) |
|
with download_col3: |
|
select_all = st.button("Select All Files", use_container_width=True) |
|
|
|
|
|
if select_all:
    # Defer to the next run; writing the checkbox keys here would raise,
    # because those widgets are already instantiated above.
    st.session_state['select_all_requested'] = True
    st.rerun()
|
|
|
|
|
if st.session_state.download_complete: |
|
st.success(f"✅ Downloaded {len(st.session_state.downloaded_paths)} files successfully!") |
|
download_links = [] |
|
for path in st.session_state.downloaded_paths: |
|
with open(path, "rb") as f: |
|
file_content = f.read() |
|
file_name = os.path.basename(path) |
|
download_links.append((file_name, file_content)) |
|
|
|
if len(download_links) > 0: |
|
if download_option == "ZIP Archive": |
|
|
|
zip_path = create_zip_file(st.session_state.downloaded_paths, download_dir) |
|
with open(zip_path, "rb") as f: |
|
zip_content = f.read() |
|
st.download_button("📦 Download ZIP Archive", |
|
zip_content, |
|
file_name=os.path.basename(zip_path), |
|
mime="application/zip") |
|
else: |
|
|
|
st.markdown("<h4>Download Files</h4>", unsafe_allow_html=True) |
|
|
|
|
|
cols = st.columns(3) |
|
for idx, (name, content) in enumerate(download_links): |
|
mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream' |
|
with cols[idx % 3]: |
|
st.download_button( |
|
f"📄 {name}", |
|
content, |
|
file_name=name, |
|
mime=mime_type, |
|
key=f"dl_{name}", |
|
use_container_width=True |
|
) |
|
|
|
|
|
with tabs[1]: |
|
st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True) |
|
st.write("Upload files to search through their content with AI-powered semantic search.") |
|
|
|
|
|
uploaded_files = st.file_uploader("Upload documents for search", |
|
accept_multiple_files=True, |
|
type=['pdf', 'docx', 'txt', 'csv', 'json']) |
|
|
|
if uploaded_files: |
|
|
|
col1, col2 = st.columns([4, 1]) |
|
with col1: |
|
use_transformer = st.checkbox("Use AI Transformer Model", value=HAVE_TRANSFORMERS, |
|
help="Uses advanced AI for more accurate semantic search (if available)") |
|
with col2: |
|
if st.button("Build Search Index", use_container_width=True): |
|
with st.spinner("Processing files and building search index..."): |
|
files_added = 0 |
|
for uploaded_file in uploaded_files: |
|
file_info = { |
|
'filename': uploaded_file.name, |
|
'url': f'local://{uploaded_file.name}', |
|
'size': humanize_file_size(uploaded_file.size) |
|
} |
|
success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info) |
|
if success: |
|
files_added += 1 |
|
|
|
if files_added > 0: |
|
index_built = st.session_state.rag_search.build_index() |
|
if index_built: |
|
st.success(f"✅ Successfully indexed {files_added} files!") |
|
else: |
|
st.error("Failed to build search index.") |
|
else: |
|
st.warning("No valid text could be extracted from the files.") |
|
|
|
|
|
st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True) |
|
|
|
col1, col2 = st.columns([4, 1]) |
|
with col1: |
|
query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change") |
|
with col2: |
|
expand_query = st.checkbox("Auto-expand query", value=True, |
|
help="Automatically add related terms to your search") |
|
|
|
col1, col2 = st.columns([4, 1])
with col2:
    num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
with col1:
    if st.button("🔍 Search Documents", use_container_width=True):
        if not query:
            st.warning("Please enter a search query")
        else:
            with st.spinner("Searching..."):
                # Honor the "Max results" control instead of a hard-coded top_k.
                results = st.session_state.rag_search.search(query, top_k=num_results, search_chunks=True)

                if results:
                    st.markdown(f"**Found {len(results)} relevant documents:**")
                    for i, result in enumerate(results):
                        with st.container():
                            st.markdown("<div class='result-card'>", unsafe_allow_html=True)
                            st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")

                            if result.get('chunk_preview'):
                                st.markdown("**Matching content:**")
                                st.text(result['chunk_preview'])

                            st.markdown("</div>", unsafe_allow_html=True)
                else:
                    st.info("No matching documents found. Try a different query.")
|
|
|
|
|
with st.expander("Search Tips", expanded=False): |
|
st.markdown(""" |
|
### Effective Search Tips |
|
|
|
- **Be specific** with your queries for more accurate results |
|
- **Try different phrasings** if you don't get the results you expect |
|
- Use **quotation marks** for exact phrase matching |
|
- For **complex topics**, break down your search into multiple queries |
|
- **Combine related terms** to improve recall |
|
|
|
The search engine uses advanced algorithms to understand the semantic meaning of your query, |
|
not just keyword matching. |
|
""") |
|
|
|
|
|
with tabs[2]: |
|
st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True) |
|
|
|
config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"]) |
|
|
|
|
|
with config_tabs[0]: |
|
col1, col2 = st.columns(2) |
|
with col1: |
|
use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode, |
|
help="Makes browser harder to detect as automated, but may be slower") |
|
|
|
handle_captchas = st.checkbox("Handle Captchas Automatically", value=False, |
|
help="Attempt to solve simple captchas automatically") |
|
|
|
download_timeout = st.slider("Download Timeout (seconds)", |
|
min_value=30, max_value=600, value=300, |
|
help="Maximum time to wait for downloads to complete") |
|
with col2: |
|
user_agent = st.selectbox("User Agent", USER_AGENTS, index=0, |
|
help="Browser identity to use when accessing websites") |
|
|
|
save_screenshots = st.checkbox("Save Browser Screenshots", value=False, |
|
help="Save screenshots when errors occur for debugging") |
|
|
|
browser_lang = st.selectbox("Browser Language", |
|
["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"], |
|
index=0) |
|
|
|
if st.button("Update Browser Settings"): |
|
st.session_state.stealth_mode = use_stealth |
|
st.success("Browser settings updated!") |
|
|
|
|
|
st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True) |
|
if st.button("Install Playwright Dependencies"): |
|
with st.spinner("Installing dependencies..."): |
|
install_playwright_dependencies() |
|
|
|
|
|
with config_tabs[1]: |
|
proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy, |
|
help="Route requests through a proxy server for anonymity or bypassing restrictions") |
|
|
|
if proxy_enabled: |
|
proxy_col1, proxy_col2 = st.columns(2) |
|
with proxy_col1: |
|
proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"]) |
|
proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1") |
|
with proxy_col2: |
|
proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080") |
|
proxy_auth = st.text_input("Proxy Authentication (optional)", |
|
placeholder="username:password", type="password") |
|
|
|
st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True) |
|
use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False, |
|
help="Automatically rotate between multiple proxies for better anonymity") |
|
|
|
if use_proxy_rotation: |
|
proxy_list = st.text_area("Proxy List (one per line)", |
|
placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080") |
|
rotation_interval = st.slider("Rotation Interval (requests)", |
|
min_value=1, max_value=50, value=10, |
|
help="How often to switch proxies") |
|
|
|
if st.button("Save Proxy Configuration"): |
|
|
|
proxy_string = None |
|
if proxy_enabled and proxy_host and proxy_port: |
|
proxy_prefix = f"{proxy_type.lower()}://" |
|
proxy_auth_str = f"{proxy_auth}@" if proxy_auth else "" |
|
proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}" |
|
|
|
|
|
st.session_state.use_proxy = proxy_enabled |
|
st.session_state.proxy_string = proxy_string |
|
|
|
|
|
if use_proxy_rotation and proxy_list: |
|
PROXY_ROTATION_CONFIG["enabled"] = True |
|
PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval |
|
PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()] |
|
|
|
st.success("Proxy configuration updated!") |
|
|
|
|
|
with config_tabs[2]: |
|
col1, col2 = st.columns(2) |
|
with col1: |
|
st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True) |
|
|
|
skip_existing = st.checkbox("Skip Existing Files", value=True, |
|
help="Don't download files that already exist locally") |
|
|
|
auto_rename = st.checkbox("Auto-Rename Duplicates", value=True, |
|
help="Automatically rename files instead of overwriting") |
|
|
|
verify_downloads = st.checkbox("Verify Downloads", value=True, |
|
help="Check file integrity after download") |
|
|
|
max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3, |
|
help="Number of times to retry failed downloads") |
|
|
|
with col2: |
|
st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True) |
|
|
|
auto_organize = st.checkbox("Auto-Organize Files", value=True, |
|
help="Automatically organize files by type") |
|
|
|
default_dir = st.text_input("Default Download Directory", value="downloads", |
|
help="Default location to save downloaded files") |
|
|
|
org_by_domain = st.checkbox("Organize by Domain", value=False, |
|
help="Create subdirectories based on source domains") |
|
|
|
org_by_type = st.checkbox("Organize by File Type", value=False, |
|
help="Create subdirectories based on file types") |
|
|
|
if st.button("Save Download Settings"): |
|
st.session_state.download_settings = { |
|
"skip_existing": skip_existing, |
|
"auto_rename": auto_rename, |
|
"verify_downloads": verify_downloads, |
|
"max_retries": max_retries, |
|
"auto_organize": auto_organize, |
|
"default_dir": default_dir, |
|
"org_by_domain": org_by_domain, |
|
"org_by_type": org_by_type |
|
} |
|
st.success("Download settings saved!") |
|
|
|
|
|
with config_tabs[3]: |
|
col1, col2 = st.columns(2) |
|
with col1: |
|
st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True) |
|
|
|
max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3, |
|
help="Maximum number of simultaneous downloads") |
|
|
|
memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024, |
|
help="Maximum memory to use for file processing") |
|
|
|
processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2, |
|
help="Number of threads to use for file processing") |
|
|
|
with col2: |
|
st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True) |
|
|
|
log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1, |
|
help="Detail level for application logs") |
|
|
|
save_debug_info = st.checkbox("Save Debug Information", value=False, |
|
help="Save detailed information about program execution") |
|
|
|
log_dir = st.text_input("Log Directory", value="logs", |
|
help="Directory to save log files") |
|
|
|
if st.button("Apply System Settings"): |
|
st.session_state.system_settings = { |
|
"max_concurrent": max_concurrent, |
|
"memory_limit": memory_limit, |
|
"processing_threads": processing_threads, |
|
"log_level": log_level, |
|
"save_debug_info": save_debug_info, |
|
"log_dir": log_dir |
|
} |
|
|
|
log_level_num = getattr(logging, log_level) |
|
logging.getLogger().setLevel(log_level_num) |
|
st.success("System settings applied!") |
|
|
|
|
|
st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True) |
|
reset_col1, reset_col2 = st.columns([1, 3]) |
|
with reset_col1: |
|
if st.button("Reset Application", use_container_width=True): |
|
for key in list(st.session_state.keys()): |
|
if key != 'google_credentials': |
|
del st.session_state[key] |
|
st.success("Application has been reset!") |
|
st.rerun() |
|
with reset_col2: |
|
st.info("This will clear all search results, downloaded files, and reset settings to defaults.") |
|
|
|
|
|
st.markdown("<h4 class='section-subheader'>Advanced Options</h4>", unsafe_allow_html=True) |
|
|
|
adv_col1, adv_col2 = st.columns(2) |
|
with adv_col1: |
|
clear_cache = st.button("Clear Cache", use_container_width=True) |
|
if clear_cache: |
|
|
|
temp_dir = tempfile.gettempdir() |
|
try:
    # Best-effort cleanup of Playwright/download temp files; skip anything
    # that is locked or already removed.
    for f in os.listdir(temp_dir):
        if f.startswith("playwright") or f.startswith("download"):
            try:
                os.remove(os.path.join(temp_dir, f))
            except OSError:
                pass
|
st.success("Cache cleared successfully!") |
|
except Exception as e: |
|
st.error(f"Error clearing cache: {e}") |
|
|
|
with adv_col2: |
|
export_settings = st.button("Export Settings", use_container_width=True) |
|
if export_settings: |
|
|
|
settings = { |
|
"mode": st.session_state.mode, |
|
"stealth_mode": st.session_state.stealth_mode, |
|
"use_proxy": st.session_state.use_proxy, |
|
"proxy_string": st.session_state.proxy_string, |
|
"custom_extensions": st.session_state.get("custom_extensions", ""), |
|
"prioritize_pdfs": st.session_state.get("prioritize_pdfs", True), |
|
"system_settings": st.session_state.get("system_settings", {}), |
|
"download_settings": st.session_state.get("download_settings", {}) |
|
} |
|
|
|
settings_json = json.dumps(settings, indent=2) |
|
b64 = base64.b64encode(settings_json.encode()).decode() |
|
href = f'data:application/json;base64,{b64}' |
|
st.markdown(f'<a href="{href}" download="file_downloader_settings.json">Download Settings File</a>', unsafe_allow_html=True) |
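# Design note: the same payload could be offered through Streamlit's native
# widget instead of a data-URI link, e.g. (sketch):
#   st.download_button("Download Settings File", settings_json,
#                      file_name="file_downloader_settings.json",
#                      mime="application/json")
# The data-URI anchor used above simply avoids an extra rerun on click.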
|
|
|
|
|
with tabs[3]: |
|
st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True) |
|
|
|
help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"]) |
|
|
|
with help_tabs[0]: |
|
st.markdown(""" |
|
### Getting Started |
|
|
|
1. **Enter a URL** on the Search & Download tab |
|
2. Select a **Search Method**: |
|
- **Deep Search**: Thorough but slower |
|
- **Quick Search**: Fast but may miss some files |
|
- **Exam Site Mode**: Optimized for educational resource sites |
|
3. Click **Start Search** to find downloadable files |
|
4. Select files you want to download |
|
5. Click **Download Selected Files** |
|
|
|
#### Using Different Modes |
|
|
|
Select a mode from the sidebar to optimize the tool for different use cases: |
|
|
|
- **Standard Mode**: Balanced for general use |
|
- **Education Mode**: Optimized for finding academic materials |
|
- **Research Mode**: Better for research papers and datasets |
|
- **Media Mode**: Enhanced for finding images, videos, and audio |
|
|
|
For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials. |
|
""") |
|
|
|
with help_tabs[1]: |
|
st.markdown(""" |
|
### Advanced Features |
|
|
|
- **Local File Search**: Upload files and search through their content using the enhanced RAG search |
|
- **Custom Extensions**: Specify additional file types to look for beyond the default set |
|
- **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers |
|
- **Proxy Support**: Use proxies to access region-restricted content or improve anonymity |
|
- **Google Drive Integration**: Upload downloaded files directly to your Google Drive |
|
|
|
#### Search Tips |
|
|
|
- For educational sites, include specific terms like "exam", "test", "paper" in the URL |
|
- When using Local File Search, try different variations of your query for better results |
|
- Use filtering and sorting options to find the most relevant files quickly |
|
|
|
#### File Organization |
|
|
|
You can configure automatic file organization in the Advanced Configuration tab: |
|
|
|
- **Organize by Domain**: Creates folders based on the source website |
|
- **Organize by File Type**: Separates files into folders by their extension |
|
- **Auto-Rename**: Prevents overwriting existing files with same names |
|
""") |
|
|
|
with help_tabs[2]: |
|
st.markdown(""" |
|
### Troubleshooting |
|
|
|
#### Common Issues |
|
|
|
- **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions |
|
- **Downloads failing**: Check if the site requires authentication or uses captchas |
|
- **Slow performance**: Reduce search depth or disable stealth mode for faster results |
|
- **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings |
|
|
|
#### Captcha Issues |
|
|
|
Some websites use captchas to prevent automated access. If you encounter captchas: |
|
|
|
1. Try using a different proxy |
|
2. Enable "Handle Captchas Automatically" for simple captchas |
|
3. For complex captchas, you may need to manually access the site first |
|
|
|
#### Proxy Problems |
|
|
|
If you're having issues with proxies: |
|
|
|
1. Verify your proxy is working with an external tool |
|
2. Check that you've entered the correct format (http://host:port) |
|
3. Some websites may block known proxy IPs |
|
|
|
#### Memory Usage |
|
|
|
If the application is using too much memory: |
|
|
|
1. Reduce the "Memory Limit" in System settings |
|
2. Process fewer files at once |
|
3. Use lower search depth values |
|
""") |
|
|
|
with help_tabs[3]: |
|
st.markdown(""" |
|
### About This Tool |
|
|
|
**Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources. |
|
|
|
#### Key Features |
|
|
|
- **Smart Discovery**: Finds downloadable files even when they're not directly linked |
|
- **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques |
|
- **Educational Focus**: Specialized detection for exam papers and academic resources |
|
- **Stealth Capabilities**: Avoids detection by anti-scraping measures |
|
|
|
#### Technical Details |
|
|
|
This tool uses: |
|
|
|
- **Playwright**: For browser automation and stealth capabilities |
|
- **Sentence Transformers**: For AI-powered semantic search |
|
- **Streamlit**: For the user interface |
|
- **Google Drive API**: For cloud integration |
|
|
|
#### Credits |
|
|
|
Created with Python, Streamlit, Playwright, and various AI libraries. |
|
|
|
For issues or suggestions, please contact the developer. |
|
|
|
Version 2.0 - March 2025 |
|
""") |
|
|
|
|
|
if search_button and url: |
|
|
|
st.session_state.files = [] |
|
st.session_state.downloaded_paths = [] |
|
st.session_state.download_complete = False |
|
|
|
|
|
if 'preset_url' in st.session_state: |
|
st.session_state.preset_url = '' |
|
|
|
|
|
custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()] |
|
|
|
|
|
proxy_string = st.session_state.proxy_string if st.session_state.use_proxy else None |
|
|
|
|
|
if 'use_proxy_rotation' in locals() and use_proxy_rotation and proxy_list: |
|
PROXY_ROTATION_CONFIG["enabled"] = True |
|
PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval |
|
PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()] |
|
|
|
|
|
# Deep Search crawls far more sublinks than Quick Search.
sublink_limit = 5000 if search_method == "Deep Search" else 1000
# search_depth is derived from the depth slider, but deep_search() below is
# driven by sublink_limit and timeout only.
search_depth = depth if search_method == "Deep Search" else 1
is_exam_site = search_method == "Exam Site Mode"
|
|
|
|
|
async def run_search(): |
|
async with DownloadManager( |
|
use_proxy=st.session_state.use_proxy, |
|
proxy=proxy_string, |
|
use_stealth=st.session_state.stealth_mode, |
|
proxy_rotation=PROXY_ROTATION_CONFIG["enabled"] |
|
) as manager: |
|
|
|
if is_exam_site: |
|
st.session_state.keep_progress = True |
|
edu_links = await manager.get_edu_exam_links(url) |
|
all_files = [] |
|
|
|
progress_text = st.empty() |
|
progress_bar = st.progress(0) |
|
|
|
|
|
for i, link in enumerate(edu_links): |
|
progress = (i+1) / max(1, len(edu_links)) |
|
progress_text.text(f"Processing exam link {i+1}/{len(edu_links)}: {link}") |
|
progress_bar.progress(progress) |
|
|
|
files = await manager.extract_downloadable_files(link, custom_ext_list) |
|
all_files.extend(files) |
|
|
|
st.session_state.files = all_files |
|
progress_text.empty() |
|
progress_bar.empty() |
|
st.session_state.keep_progress = False |
|
|
|
else: |
|
|
|
files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout) |
|
st.session_state.files = files |
|
|
|
|
|
asyncio.run(run_search()) |
|
st.rerun() |
|
|
|
|
|
if 'download_button' in locals() and download_button and selected_files: |
|
|
|
os.makedirs(download_dir, exist_ok=True) |
|
|
|
|
|
st.session_state.downloaded_paths = [] |
|
st.session_state.download_complete = False |
|
|
|
|
|
files_to_download = [st.session_state.files[i] for i in selected_files] |
|
|
|
|
|
async def run_download(): |
|
async with DownloadManager( |
|
use_proxy=st.session_state.use_proxy, |
|
proxy=st.session_state.proxy_string, |
|
use_stealth=st.session_state.stealth_mode |
|
) as manager: |
|
download_progress = st.progress(0) |
|
status_text = st.empty() |
|
|
|
for i, file_info in enumerate(files_to_download): |
|
progress = i / len(files_to_download)
|
status_text.text(f"Downloading {i+1}/{len(files_to_download)}: {file_info['filename']}") |
|
download_progress.progress(progress) |
|
|
|
downloaded_path = await manager.download_file( |
|
file_info, |
|
download_dir, |
|
get_domain(file_info['url']) |
|
) |
|
|
|
if downloaded_path: |
|
st.session_state.downloaded_paths.append(downloaded_path) |
|
|
|
download_progress.progress(1.0) |
|
status_text.text(f"Downloaded {len(st.session_state.downloaded_paths)}/{len(files_to_download)} files successfully!") |
|
st.session_state.download_complete = True |
|
|
|
|
|
asyncio.run(run_download()) |
|
st.rerun() |
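# Google Drive upload handler: find (or create) the target folder, then push
# each downloaded file. create_drive_folder() and google_drive_upload() are the
# helpers defined earlier in this file; google_drive_upload is expected to
# return an error string prefixed with "Error" on failure, which is how the
# success count below is derived.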
|
|
|
|
|
if 'google_drive_button' in locals() and google_drive_button and st.session_state.google_credentials and st.session_state.downloaded_paths: |
|
with st.spinner("Uploading to Google Drive..."): |
|
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_credentials) |
|
|
|
|
|
folder_id = None |
|
folder_name = st.session_state.drive_folder if 'drive_folder' in st.session_state else "File Downloader" |
|
|
|
|
|
query = f"name='{folder_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false" |
|
results = drive_service.files().list(q=query, spaces='drive', fields='files(id)').execute() |
|
items = results.get('files', []) |
|
|
|
if not items: |
|
|
|
folder_id = create_drive_folder(drive_service, folder_name) |
|
else: |
|
folder_id = items[0]['id'] |
|
|
|
|
|
upload_progress = st.progress(0) |
|
status_text = st.empty() |
|
uploaded_count = 0 |
|
|
|
for i, path in enumerate(st.session_state.downloaded_paths): |
|
progress = i / len(st.session_state.downloaded_paths) |
|
status_text.text(f"Uploading {i+1}/{len(st.session_state.downloaded_paths)}: {os.path.basename(path)}") |
|
upload_progress.progress(progress) |
|
|
|
result = google_drive_upload(path, st.session_state.google_credentials, folder_id) |
|
if isinstance(result, str) and not result.startswith("Error"): |
|
uploaded_count += 1 |
|
|
|
upload_progress.progress(1.0) |
|
status_text.text(f"Uploaded {uploaded_count}/{len(st.session_state.downloaded_paths)} files to Google Drive folder '{folder_name}'") |
|
|
|
st.success(f"✅ Files uploaded to Google Drive successfully!") |
|
|
|
|
|
if clear_button: |
|
st.session_state.files = [] |
|
st.session_state.downloaded_paths = [] |
|
st.session_state.download_complete = False |
|
if 'preset_url' in st.session_state: |
|
st.session_state.preset_url = '' |
|
st.rerun() |
|
|
|
if __name__ == "__main__": |
|
main() |