diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,20 +1,79 @@ +import streamlit as st +# This MUST be the first Streamlit command +st.set_page_config(page_title="Advanced File Downloader", layout="wide", page_icon="๐") + +# Core imports import os -import json +import subprocess +from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError import asyncio -import streamlit as st import logging +from urllib.parse import urlparse, urljoin, unquote, parse_qs, quote +import re +from pathlib import Path +from io import BytesIO +import random +from bs4 import BeautifulSoup +from PyPDF2 import PdfReader +import zipfile +import tempfile +import mimetypes +import requests +import datetime +import traceback +import base64 +import shutil +import json +import time +from PIL import Image +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas +import google_auth_oauthlib.flow +import googleapiclient.discovery +import google.auth.transport.requests +import googleapiclient.http -# Configure logging +# Enhanced RAG search imports +import nltk +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity +import numpy as np +import docx2txt + +# Try to import sentence-transformers for better embeddings +try: + from sentence_transformers import SentenceTransformer + HAVE_TRANSFORMERS = True +except ImportError: + HAVE_TRANSFORMERS = False + +# Try to download NLTK data if not already present +try: + nltk.data.find('tokenizers/punkt') +except LookupError: + try: + nltk.download('punkt', quiet=True) + except: + pass + +try: + nltk.data.find('corpora/stopwords') +except LookupError: + try: + nltk.download('stopwords', quiet=True) + from nltk.corpus import stopwords + STOPWORDS = set(stopwords.words('english')) + except: + STOPWORDS = set(['the', 'and', 'a', 'in', 'to', 'of', 'is', 'it', 'that', 'for', 'with', 'as', 'on', 'by']) + +# -------------------- Logging Setup -------------------- logging.basicConfig( level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('app.log'), - logging.StreamHandler() - ] + format='%(asctime)s - %(levelname)s - %(message)s' ) +logger = logging.getLogger(__name__) -# Load Google OAuth config from environment variables +# -------------------- Google OAuth Config -------------------- GOOGLE_OAUTH_CONFIG = { "web": { "client_id": os.environ.get("GOOGLE_CLIENT_ID"), @@ -27,23 +86,3785 @@ GOOGLE_OAUTH_CONFIG = { } } -# Setup the UI -st.set_page_config(page_title="Advanced File Downloader", layout="wide", page_icon="๐") +# -------------------- Stealth and UA Settings -------------------- +# Extended user agent list for better variety +USER_AGENTS = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', + 'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) 
AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0' +] -# Import the core components (still keeping modular organization) -from utils import USER_AGENTS, STEALTH_SETTINGS, PROXY_ROTATION_CONFIG -from utils import ( - get_random_user_agent, sizeof_fmt, create_zip_file, humanize_file_size, get_domain, - is_download_link, normalize_download_url, detect_captcha, show_user_friendly_error -) -from google_drive import ( - get_google_auth_url, exchange_code_for_credentials, google_drive_upload, create_drive_folder -) -from download_manager import DownloadManager -from rag_search import EnhancedRAGSearch +# Stealth browser settings +STEALTH_SETTINGS = { + # Hardware features to modify/disable + "hardware_concurrency": 4, + "device_memory": 8, + # Browser features to enable/disable + "webgl_vendor": "Google Inc. (Intel)", + "webgl_renderer": "Intel Iris OpenGL Engine", + "languages": ["en-US", "en"], + "disable_webrtc": True, + # Additional timing randomization + "navigator_platform": "Win32", + "touch_support": False +} + +# Proxy rotation configuration (if using multiple proxies) +PROXY_ROTATION_CONFIG = { + "enabled": False, # Set to True to enable rotation + "rotation_interval": 10, # Rotate every 10 requests + "proxies": [] # Will be populated from the UI if needed +} + +# -------------------- Enhanced RAG Search with Small LLM -------------------- +class EnhancedRAGSearch: + def __init__(self): + self.file_texts = [] + self.chunks = [] # Document chunks for more targeted search + self.chunk_metadata = [] # Metadata for each chunk + self.file_metadata = [] + self.languages = [] + self.model = None + + # Try to load the sentence transformer model if available + if HAVE_TRANSFORMERS: + try: + # Use a small, efficient model + self.model = SentenceTransformer('all-MiniLM-L6-v2') + self.use_transformer = True + logger.info("Using sentence-transformers for RAG") + except Exception as e: + logger.warning(f"Error loading sentence-transformer: {e}") + self.use_transformer = False + else: + self.use_transformer = False + + # Fallback to TF-IDF if transformers not available + if not self.use_transformer: + self.vectorizer = TfidfVectorizer( + stop_words='english', + ngram_range=(1, 2), # Use bigrams for better context + max_features=15000, # Use more features for better representation + min_df=1 # Include rare terms + ) + + self.vectors = None + self.chunk_vectors = None + + def add_file(self, file_data, file_info): + """Add a file to the search index with improved processing""" + file_ext = os.path.splitext(file_info['filename'])[1].lower() + text = self.extract_text(file_data, file_ext) + + if text: + # Store the whole document text + self.file_texts.append(text) + self.file_metadata.append(file_info) + + # Try to detect language + try: + # Simple language detection based on stopwords + words = re.findall(r'\b\w+\b', text.lower()) + english_stopwords_ratio = len([w for w in words[:100] if w in STOPWORDS]) / max(1, len(words[:100])) + lang = 'en' if english_stopwords_ratio > 0.2 else 'unknown' + self.languages.append(lang) + except: + self.languages.append('en') # Default to English + + # Create chunks for more granular search + chunks = self.create_chunks(text) + for chunk in chunks: + self.chunks.append(chunk) + 
self.chunk_metadata.append({ + 'file_info': file_info, + 'chunk_size': len(chunk), + 'file_index': len(self.file_texts) - 1 + }) + + return True + return False + + def create_chunks(self, text, chunk_size=1000, overlap=200): + """Split text into overlapping chunks for better search precision""" + # Try to use NLTK for sentence-aware chunking + try: + sentences = nltk.sent_tokenize(text) + chunks = [] + current_chunk = "" + + for sentence in sentences: + if len(current_chunk) + len(sentence) <= chunk_size: + current_chunk += sentence + " " + else: + # Add current chunk if it has content + if current_chunk: + chunks.append(current_chunk.strip()) + + # Start new chunk with overlap from previous chunk + if len(current_chunk) > overlap: + # Find the last space within the overlap region + overlap_text = current_chunk[-overlap:] + last_space = overlap_text.rfind(' ') + if last_space != -1: + current_chunk = current_chunk[-(overlap-last_space):] + sentence + " " + else: + current_chunk = sentence + " " + else: + current_chunk = sentence + " " + + # Add the last chunk if it has content + if current_chunk: + chunks.append(current_chunk.strip()) + + return chunks + except: + # Fallback to simpler chunking approach + chunks = [] + for i in range(0, len(text), chunk_size - overlap): + chunk = text[i:i + chunk_size] + if chunk: + chunks.append(chunk) + return chunks + + def extract_text(self, file_data, file_ext): + """Extract text from different file types with enhanced support""" + try: + if file_ext.lower() == '.pdf': + reader = PyPDF2.PdfReader(BytesIO(file_data)) + text = "" + for page in reader.pages: + extracted = page.extract_text() + if extracted: + text += extracted + "\n" + # If text extraction fails, try to OCR (would need extra libraries) + return text + elif file_ext.lower() in ['.docx', '.doc']: + return docx2txt.process(BytesIO(file_data)) + elif file_ext.lower() in ['.txt', '.csv', '.json', '.html', '.htm']: + # Handle both UTF-8 and other common encodings + try: + return file_data.decode('utf-8', errors='ignore') + except: + encodings = ['latin-1', 'iso-8859-1', 'windows-1252'] + for enc in encodings: + try: + return file_data.decode(enc, errors='ignore') + except: + pass + # Last resort fallback + return file_data.decode('utf-8', errors='ignore') + elif file_ext.lower() in ['.pptx', '.ppt', '.xlsx', '.xls']: + # For these types, we would need additional libraries + # For now, return a placeholder message + return f"[Content of {file_ext} file - install additional libraries for full text extraction]" + else: + return "" + except Exception as e: + logger.error(f"Error extracting text: {e}") + return "" + + def build_index(self): + """Build both document and chunk search indices""" + if not self.file_texts: + return False + + try: + if self.use_transformer: + # Use sentence transformer models for embeddings + logger.info("Building document and chunk embeddings with transformer model...") + self.vectors = self.model.encode(self.file_texts, show_progress_bar=False) + + # Build chunk-level index if we have chunks + if self.chunks: + # Process in batches to avoid memory issues + batch_size = 32 + chunk_vectors = [] + for i in range(0, len(self.chunks), batch_size): + batch = self.chunks[i:i+batch_size] + batch_vectors = self.model.encode(batch, show_progress_bar=False) + chunk_vectors.append(batch_vectors) + self.chunk_vectors = np.vstack(chunk_vectors) + else: + # Build document-level index + self.vectors = self.vectorizer.fit_transform(self.file_texts) + + # Build chunk-level index if we 
have chunks + if self.chunks: + self.chunk_vectors = self.vectorizer.transform(self.chunks) + + return True + except Exception as e: + logger.error(f"Error building search index: {e}") + return False + + def expand_query(self, query): + """Add related terms to query for better recall - mini LLM function""" + # Dictionary of related terms for common keywords + expansions = { + "exam": ["test", "assessment", "quiz", "paper", "exam paper", "past paper", "past exam"], + "test": ["exam", "quiz", "assessment", "paper"], + "document": ["file", "paper", "report", "doc", "documentation"], + "manual": ["guide", "instruction", "documentation", "handbook"], + "tutorial": ["guide", "instructions", "how-to", "lesson"], + "article": ["paper", "publication", "journal", "research"], + "research": ["study", "investigation", "paper", "analysis"], + "book": ["textbook", "publication", "volume", "edition"], + "thesis": ["dissertation", "paper", "research", "study"], + "report": ["document", "paper", "analysis", "summary"], + "assignment": ["homework", "task", "project", "work"], + "lecture": ["class", "presentation", "talk", "lesson"], + "notes": ["annotations", "summary", "outline", "study material"], + "syllabus": ["curriculum", "course outline", "program", "plan"], + "paper": ["document", "article", "publication", "exam", "test"], + "question": ["problem", "query", "exercise", "inquiry"], + "solution": ["answer", "resolution", "explanation", "result"], + "reference": ["source", "citation", "bibliography", "resource"], + "analysis": ["examination", "study", "evaluation", "assessment"], + "guide": ["manual", "instruction", "handbook", "tutorial"], + "worksheet": ["exercise", "activity", "handout", "practice"], + "review": ["evaluation", "assessment", "critique", "feedback"], + "material": ["resource", "content", "document", "information"], + "data": ["information", "statistics", "figures", "numbers"] + } + + # Enhanced query expansion simulating a mini-LLM + query_words = re.findall(r'\b\w+\b', query.lower()) + expanded_terms = set() + + # Directly add expansions from our dictionary + for word in query_words: + if word in expansions: + expanded_terms.update(expansions[word]) + + # Add common academic file formats if not already included + if any(term in query.lower() for term in ["file", "document", "download", "paper"]): + if not any(ext in query.lower() for ext in ["pdf", "docx", "ppt", "excel"]): + expanded_terms.update(["pdf", "docx", "pptx", "xlsx"]) + + # Add special academic terms when the query seems related to education + if any(term in query.lower() for term in ["course", "university", "college", "school", "class"]): + expanded_terms.update(["syllabus", "lecture", "notes", "textbook"]) + + # Return original query plus expanded terms + if expanded_terms: + expanded_query = f"{query} {' '.join(expanded_terms)}" + logger.info(f"Expanded query: '{query}' -> '{expanded_query}'") + return expanded_query + return query + + def search(self, query, top_k=5, search_chunks=True): + """Enhanced search with both document and chunk-level search""" + if self.vectors is None: + return [] + + # Simulate a small LLM by expanding the query with related terms + expanded_query = self.expand_query(query) + + try: + results = [] + + if self.use_transformer: + # Transform the query to embedding + query_vector = self.model.encode([expanded_query])[0] + + # First search at document level for higher-level matches + if self.vectors is not None: + # Compute similarities between query and documents + doc_similarities = 
cosine_similarity( + query_vector.reshape(1, -1), + self.vectors + ).flatten() + + top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] + + for i, idx in enumerate(top_doc_indices): + if doc_similarities[idx] > 0.2: # Threshold to exclude irrelevant results + results.append({ + 'file_info': self.file_metadata[idx], + 'score': float(doc_similarities[idx]), + 'rank': i+1, + 'match_type': 'document', + 'language': self.languages[idx] if idx < len(self.languages) else 'unknown' + }) + + # Then search at chunk level for more specific matches if enabled + if search_chunks and self.chunk_vectors is not None: + # Compute similarities between query and chunks + chunk_similarities = cosine_similarity( + query_vector.reshape(1, -1), + self.chunk_vectors + ).flatten() + + top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results + + # Use a set to avoid duplicate file results + seen_files = set(r['file_info']['url'] for r in results) + + for i, idx in enumerate(top_chunk_indices): + if chunk_similarities[idx] > 0.25: # Higher threshold for chunks + file_index = self.chunk_metadata[idx]['file_index'] + file_info = self.file_metadata[file_index] + + # Only add if we haven't already included this file + if file_info['url'] not in seen_files: + seen_files.add(file_info['url']) + results.append({ + 'file_info': file_info, + 'score': float(chunk_similarities[idx]), + 'rank': len(results) + 1, + 'match_type': 'chunk', + 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', + 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx] + }) + + # Stop after we've found enough results + if len(results) >= top_k*1.5: + break + else: + # Fallback to TF-IDF if transformers not available + query_vector = self.vectorizer.transform([expanded_query]) + + # First search at document level + if self.vectors is not None: + doc_similarities = cosine_similarity(query_vector, self.vectors).flatten() + top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] + + for i, idx in enumerate(top_doc_indices): + if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results + results.append({ + 'file_info': self.file_metadata[idx], + 'score': float(doc_similarities[idx]), + 'rank': i+1, + 'match_type': 'document', + 'language': self.languages[idx] if idx < len(self.languages) else 'unknown' + }) + + # Then search at chunk level if enabled + if search_chunks and self.chunk_vectors is not None: + chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten() + top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] + + # Avoid duplicates + seen_files = set(r['file_info']['url'] for r in results) + + for i, idx in enumerate(top_chunk_indices): + if chunk_similarities[idx] > 0.15: + file_index = self.chunk_metadata[idx]['file_index'] + file_info = self.file_metadata[file_index] + + if file_info['url'] not in seen_files: + seen_files.add(file_info['url']) + results.append({ + 'file_info': file_info, + 'score': float(chunk_similarities[idx]), + 'rank': len(results) + 1, + 'match_type': 'chunk', + 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', + 'chunk_preview': self.chunks[idx][:200] + "..." 
if len(self.chunks[idx]) > 200 else self.chunks[idx] + }) + + if len(results) >= top_k*1.5: + break + + # Sort combined results by score + results.sort(key=lambda x: x['score'], reverse=True) + + # Re-rank and truncate + for i, result in enumerate(results[:top_k]): + result['rank'] = i+1 + + return results[:top_k] + except Exception as e: + logger.error(f"Error during search: {e}") + return [] + +# -------------------- Utility Functions -------------------- +def get_random_user_agent(): + return random.choice(USER_AGENTS) + +def sizeof_fmt(num, suffix='B'): + for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: + if abs(num) < 1024.0: + return f"{num:3.1f}{unit}{suffix}" + num /= 1024.0 + return f"{num:.1f}Y{suffix}" + +def create_zip_file(file_paths, output_dir): + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip") + with zipfile.ZipFile(zip_path, 'w') as zipf: + for file_path in file_paths: + zipf.write(file_path, os.path.basename(file_path)) + return zip_path + +def get_file_extension(url, default='.pdf'): + """Extract file extension from URL or filename""" + path = urlparse(url).path + ext = os.path.splitext(path)[1].lower() + if not ext: + return default + return ext + +def humanize_file_size(size_bytes): + """Format file size in human-readable format""" + if size_bytes < 1024: + return f"{size_bytes} bytes" + for unit in ['KB', 'MB', 'GB', 'TB']: + size_bytes /= 1024.0 + if size_bytes < 1024.0: + return f"{size_bytes:.1f} {unit}" + return f"{size_bytes:.1f} PB" + +def get_domain(url): + """Extract domain from URL""" + parsed = urlparse(url) + return parsed.netloc + +def is_valid_file_url(url, extensions): + """Check if URL is a valid file URL based on extension""" + return any(url.lower().endswith(ext) for ext in extensions) + +def detect_captcha(html_content): + """Detect common captcha patterns in HTML content""" + captcha_patterns = [ + 'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile', + 'challenge', 'solve the following', 'verify you are human' + ] + html_lower = html_content.lower() + return any(pattern in html_lower for pattern in captcha_patterns) + +def is_download_link(url): + """Enhanced function to detect if a URL is likely a download link""" + # Check for obvious download indicators in URL + url_lower = url.lower() + + # Check for common download-related terms in the URL + download_terms = [ + 'download', 'dl', 'get', 'file', 'attachment', 'export', 'view', + 'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document' + ] + if any(term in url_lower for term in download_terms): + return True + + # Check for common download script patterns + script_patterns = [ + 'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php', + 'download.aspx', 'getfile.aspx', 'file.aspx', + 'downloadhandler', 'filehandler', 'filedownload', + 'download.jsp', 'download.cgi', 'download.do', + 'download-file', 'get-file', + 'downloadfile', 'getfile', 'viewfile', + 'Action=downloadfile', 'action=download', 'action=view', + 'download?', 'file?', 'get?', 'view?' 
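A minimal usage sketch for the EnhancedRAGSearch class above, assuming downloaded file contents are already in memory as bytes; the file names, paths and URLs are placeholders, and 'filename'/'url' are the only file_info keys the class reads.

rag = EnhancedRAGSearch()
sample_files = [
    (open("notes.txt", "rb").read(), {"filename": "notes.txt", "url": "https://example.edu/notes.txt"}),
    (open("exam_2023.pdf", "rb").read(), {"filename": "exam_2023.pdf", "url": "https://example.edu/exam_2023.pdf"}),
]
for data, info in sample_files:
    rag.add_file(data, info)
if rag.build_index():
    for hit in rag.search("past exam papers", top_k=3):
        print(hit["rank"], round(hit["score"], 3), hit["match_type"], hit["file_info"]["filename"])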
+ ] + if any(pattern in url_lower for pattern in script_patterns): + return True + + # Check for common file extensions in the URL path or parameters + path = urlparse(url).path + common_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', + '.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg', + '.png', '.gif', '.mp3', '.mp4', '.avi', '.mov'] + + if any(ext in path.lower() for ext in common_extensions): + return True + + # Check for file ID or file parameters in URL + params = parse_qs(urlparse(url).query) + param_keys = params.keys() + file_param_indicators = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid'] + if any(key.lower() in file_param_indicators for key in param_keys): + return True + + # Check for complex encoding patterns like in the example URL + if 'Action=downloadfile' in url or 'fname=' in url: + return True + + return False + +def normalize_download_url(url): + """Normalize download URLs to handle various formats and encodings""" + try: + # Handle common URL shorteners and redirections + parsed = urlparse(url) + + # Handle phpMyAdmin-style encoded URLs + if 'Action=downloadfile' in url and 'file=' in url: + # Extract the encoded file parameter + params = parse_qs(parsed.query) + if 'file' in params: + # This is just a placeholder - in a real implementation, + # you would need to handle the specific encoding used + encoded_file = params['file'][0] + # Keep the URL as is for now, since we'll handle it during download + return url + + # Handle URLs with fname parameter (like in the example) + if 'fname=' in url: + # Keep as is - we'll handle this specially during download + return url + + # For other URLs, make sure they are properly quoted + path = parsed.path + # Only quote the path portion if needed + if '%' not in path and ' ' in path: + path = quote(path) + + # Reconstruct the URL + normalized = parsed._replace(path=path).geturl() + return normalized + except Exception as e: + logger.error(f"Error normalizing URL {url}: {e}") + return url + +# -------------------- Google Drive Functions -------------------- +def get_google_auth_url(): + client_config = GOOGLE_OAUTH_CONFIG["web"] + flow = google_auth_oauthlib.flow.Flow.from_client_config( + {"web": client_config}, + scopes=["https://www.googleapis.com/auth/drive.file"] + ) + flow.redirect_uri = client_config["redirect_uris"][0] + authorization_url, _ = flow.authorization_url( + access_type="offline", + include_granted_scopes="true", + prompt="consent" + ) + return authorization_url + +def exchange_code_for_credentials(auth_code): + if not auth_code.strip(): + return None, "No code provided." + try: + client_config = GOOGLE_OAUTH_CONFIG["web"] + flow = google_auth_oauthlib.flow.Flow.from_client_config( + {"web": client_config}, + scopes=["https://www.googleapis.com/auth/drive.file"] + ) + flow.redirect_uri = client_config["redirect_uris"][0] + flow.fetch_token(code=auth_code.strip()) + creds = flow.credentials + if not creds or not creds.valid: + return None, "Could not validate credentials. Check code and try again." + return creds, "Google Sign-In successful!" 
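A quick illustration of the download-link heuristics defined earlier in this section, using made-up URLs; only the first two should pass is_download_link.

candidates = [
    "https://example.edu/files/past_exam_2022.pdf",
    "https://example.edu/download.php?Action=downloadfile&file=abc123",
    "https://example.edu/about-us",
]
for link in candidates:
    if is_download_link(link):
        print("download candidate:", normalize_download_url(link))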
+ except Exception as e: + return None, f"Error during token exchange: {e}" + +def google_drive_upload(file_path, credentials, folder_id=None): + try: + drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials) + file_metadata = {'name': os.path.basename(file_path)} + if folder_id: + file_metadata['parents'] = [folder_id] + media = googleapiclient.http.MediaFileUpload(file_path, resumable=True) + created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute() + return created.get("id", "") + except Exception as e: + return f"Error uploading to Drive: {str(e)}" + +def create_drive_folder(drive_service, name): + folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'} + folder = drive_service.files().create(body=folder_metadata, fields='id').execute() + return folder.get('id') + +# -------------------- Playwright Setup -------------------- +def install_playwright_dependencies(): + try: + # Set environment variable for Playwright browsers path + os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright") + + # Install system dependencies + subprocess.run(['apt-get', 'update', '-y'], check=True) + packages = [ + 'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0', + 'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1', + 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0' + ] + subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True) + + # Install Playwright and dependencies + subprocess.run(['pip', 'install', 'playwright'], check=True) + subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True) + + st.success("Playwright dependencies installed successfully!") + except Exception as e: + st.error(f"Error installing Playwright dependencies: {e}") + st.info("You may need to manually install dependencies. 
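One way the Drive helpers above could be wired into the Streamlit UI; the folder name and local file path are placeholders, and drive_service is built with the same "drive"/"v3" discovery call that google_drive_upload uses internally.

auth_url = get_google_auth_url()
st.markdown(f"[Sign in with Google]({auth_url})")
auth_code = st.text_input("Paste the authorization code here")
if auth_code:
    creds, message = exchange_code_for_credentials(auth_code)
    st.write(message)
    if creds:
        drive_service = googleapiclient.discovery.build("drive", "v3", credentials=creds)
        folder_id = create_drive_folder(drive_service, "Advanced File Downloader")
        file_id = google_drive_upload("/tmp/downloads/sample.pdf", creds, folder_id)
        st.write(f"Uploaded to Drive with file id: {file_id}")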
Check console for details.") + logger.error(f"Playwright setup error: {e}") + traceback.print_exc() + +# -------------------- Download Manager Class -------------------- +class DownloadManager: + def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): + self.use_proxy = use_proxy + self.proxy = proxy + self.query = query + self.num_results = num_results + self.playwright = None + self.browser = None + self.context = None + self.page = None + self.use_stealth = use_stealth + self.proxy_rotation = proxy_rotation + self.request_count = 0 + self.captcha_detected = False + self.download_timeout = 300 # 5 minutes timeout for downloads + # Track visited URLs to avoid revisiting the same URL multiple times + self.visited_urls = set() + # Track successfully downloaded files to avoid redownloading + self.downloaded_files = set() + + async def __aenter__(self): + self.playwright = await async_playwright().start() + + # Prepare browser args with stealth settings + browser_args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--no-zygote', + '--single-process', + '--disable-web-security', + '--disable-features=IsolateOrigins', + '--disable-site-isolation-trials' + ] + + # Add stealth-specific args + if self.use_stealth: + browser_args.extend([ + '--disable-blink-features=AutomationControlled', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-webgl', + '--disable-webrtc' + ]) + + # Setup browser options + opts = { + "headless": True, + "args": browser_args + } + + # Configure proxy if specified + if self.use_proxy and self.proxy: + opts["proxy"] = {"server": self.proxy} + + # Launch browser with options + self.browser = await self.playwright.chromium.launch(**opts) + + # Setup browser context with enhanced settings + context_opts = { + "user_agent": get_random_user_agent(), + "viewport": {"width": 1920, "height": 1080}, + "device_scale_factor": 1, + "has_touch": False, + "is_mobile": False, + "ignore_https_errors": True, + "accept_downloads": True + } + + # Apply stealth-specific settings to the context + if self.use_stealth: + # Apply JS-injection for enhanced stealth + context_opts["bypass_csp"] = True + self.context = await self.browser.new_context(**context_opts) + + # Execute stealth JS to avoid detection + await self.context.add_init_script(""" + () => { + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); + + // Change navigator properties + const newProto = navigator.__proto__; + delete newProto.webdriver; + + // Overwrite the plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ + lengthComputable: true, + loaded: 100, + total: 100 + })) + }); + + // Handle languages more naturally + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en', 'es'] + }); + + // Modify hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 4 + }); + + // Modify deviceMemory + Object.defineProperty(navigator, 'deviceMemory', { + get: () => 8 + }); + + // WebGL modifications + const getParameter = WebGLRenderingContext.prototype.getParameter; + WebGLRenderingContext.prototype.getParameter = function(parameter) { + if (parameter === 37445) { + return 'Intel Inc.'; + } + if (parameter === 37446) { + return 'Intel Iris OpenGL Engine'; + } + return getParameter.apply(this, arguments); + }; + } + """) + else: + # Regular context without stealth + self.context = await 
self.browser.new_context(**context_opts) + + # Create page with enhanced headers + self.page = await self.context.new_page() + await self.page.set_extra_http_headers({ + 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Cache-Control': 'max-age=0', + 'DNT': '1', # Do Not Track + 'Referer': 'https://www.google.com/', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'cross-site', + 'Sec-Fetch-User': '?1', + 'Upgrade-Insecure-Requests': '1' + }) + + # Add delay for mouse movements to simulate human behavior + if self.use_stealth: + await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) + await self.page.wait_for_timeout(random.randint(200, 500)) + + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self.browser: + await self.browser.close() + if self.playwright: + await self.playwright.stop() + + async def rotate_proxy_if_needed(self): + """Rotate proxy if proxy rotation is enabled and threshold is reached""" + if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: + self.request_count += 1 + if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: + # Get next proxy from the pool + next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) + PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list + + # Close existing context and create new one with the new proxy + if self.context: + await self.context.close() + + # Create new context with the new proxy + context_opts = { + "user_agent": get_random_user_agent(), + "proxy": {"server": next_proxy}, + "accept_downloads": True + } + self.context = await self.browser.new_context(**context_opts) + self.page = await self.context.new_page() + + # Reset counter + self.request_count = 0 + logger.info(f"Rotated to new proxy: {next_proxy}") -# Initialize session state variables -def initialize_session_state(): + async def handle_captcha(self, page): + """Detect and handle captchas if possible""" + # Check for common captcha patterns + content = await page.content() + if detect_captcha(content): + self.captcha_detected = True + logger.warning("Captcha detected on page") + + # Strategies for handling captchas: + # 1. For simple captchas, try to extract the image and solve it + captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') + if captcha_img: + logger.info("Found captcha image, attempting to capture") + + # Take screenshot of the captcha + captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") + await captcha_img.screenshot(path=captcha_path) + + # In a real implementation, you would send this to a captcha solving service + # For now, just log the detection + logger.info(f"Captcha image saved to {captcha_path}") + + # For demonstration, we'll notify the user but not actually solve it + return False + + # 2. For reCAPTCHA, special handling would be required + recaptcha = await page.query_selector('iframe[src*="recaptcha"]') + if recaptcha: + logger.warning("reCAPTCHA detected, would require external solving service") + return False + + # 3. 
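The rotation logic above is a no-op until the module-level PROXY_ROTATION_CONFIG is filled in; a sketch of how it might be populated before constructing a DownloadManager, with placeholder proxy addresses.

PROXY_ROTATION_CONFIG["enabled"] = True
PROXY_ROTATION_CONFIG["rotation_interval"] = 5  # rotate every 5 requests
PROXY_ROTATION_CONFIG["proxies"] = [
    "http://proxy1.example.com:8080",
    "http://proxy2.example.com:8080",
]
# A DownloadManager created with proxy_rotation=True will then swap proxies
# inside rotate_proxy_if_needed() once the interval is reached.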
Try to perform human-like actions that might bypass simple bot checks + await self.perform_human_actions(page) + + # Check if captcha is still present + content = await page.content() + if detect_captcha(content): + logger.warning("Captcha still present after human-like actions") + return False + else: + logger.info("Captcha appears to be resolved") + return True + + return True # No captcha detected + + async def perform_human_actions(self, page): + """Perform human-like actions on the page to possibly bypass simple bot checks""" + try: + # 1. Slowly scroll down the page + for i in range(3): + await page.evaluate(f"window.scrollTo(0, {i * 300})") + await page.wait_for_timeout(random.randint(300, 700)) + + # 2. Random mouse movements + for _ in range(3): + x = random.randint(100, 800) + y = random.randint(100, 600) + await page.mouse.move(x=x, y=y) + await page.wait_for_timeout(random.randint(200, 500)) + + # 3. Click on a non-essential part of the page + try: + await page.click("body", position={"x": 50, "y": 50}) + except: + pass + + # 4. Wait a bit before continuing + await page.wait_for_timeout(1000) + + except Exception as e: + logger.warning(f"Error during human-like actions: {e}") + + async def search_bing(self): + urls = [] + try: + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + search_url = f"https://www.bing.com/search?q={self.query}" + await self.page.goto(search_url, timeout=30000) + await self.page.wait_for_load_state('networkidle') + + # Check for captchas + if not await self.handle_captcha(self.page): + logger.warning("Captcha detected during search, results may be limited") + + # More natural scrolling behavior + for i in range(3): + await self.page.evaluate(f"window.scrollTo(0, {i * 400})") + await self.page.wait_for_timeout(random.randint(300, 800)) + + # Extract search results + links = await self.page.query_selector_all("li.b_algo h2 a") + for link in links[:self.num_results]: + href = await link.get_attribute('href') + if href: + urls.append(href) + + # If we didn't find enough results, try an alternative selector + if len(urls) < self.num_results: + alt_links = await self.page.query_selector_all(".b_caption a") + for link in alt_links: + href = await link.get_attribute('href') + if href and href not in urls: + urls.append(href) + if len(urls) >= self.num_results: + break + + return urls + except Exception as e: + logger.error(f"Error searching Bing: {e}") + return [] + + async def get_file_size(self, url): + try: + await self.rotate_proxy_if_needed() + + # For complex download URLs, we need to be careful with HEAD requests + if '?' 
in url or 'Action=downloadfile' in url or 'fname=' in url: + # For these URLs, we'll try a more reliable approach using range headers + headers = { + 'User-Agent': get_random_user_agent(), + 'Range': 'bytes=0-0' # Just request the first byte to check headers + } + + try: + with requests.get(url, headers=headers, stream=True, timeout=10) as r: + if 'Content-Range' in r.headers: + content_range = r.headers['Content-Range'] + match = re.search(r'bytes 0-0/(\d+)', content_range) + if match: + size = int(match.group(1)) + return sizeof_fmt(size) + + if 'Content-Length' in r.headers: + size = int(r.headers['Content-Length']) + # If size is 1, it's likely just our single requested byte + if size > 1: + return sizeof_fmt(size) + except Exception as e: + logger.warning(f"Error getting file size with Range request: {e}") + + # Fallback to browser approach + try: + async with self.context.new_page() as page: + response = await page.request.head(url, timeout=15000) + length = response.headers.get('Content-Length', None) + if length: + return sizeof_fmt(int(length)) + except Exception as e: + logger.warning(f"Error getting file size with browser: {e}") + + return "Unknown Size" + else: + # Standard approach for normal URLs + async with self.context.new_page() as page: + response = await page.request.head(url, timeout=15000) + length = response.headers.get('Content-Length', None) + if length: + return sizeof_fmt(int(length)) + else: + return "Unknown Size" + except Exception as e: + logger.warning(f"Error getting file size: {e}") + return "Unknown Size" + + async def get_pdf_metadata(self, url): + try: + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + resp = await page.request.get(url, timeout=15000) + if resp.ok: + content = await resp.body() + pdf = BytesIO(content) + reader = PdfReader(pdf) + return { + 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', + 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', + 'Pages': len(reader.pages), + } + else: + return {} + except Exception as e: + logger.warning(f"Error reading PDF metadata: {e}") + return {} + + async def extract_real_download_url(self, url): + """Enhanced method to extract real download URL, handling complex URLs""" + try: + # Check if this is a complex download URL that needs special handling + if 'Action=downloadfile' in url or 'fname=' in url: + logger.info(f"Complex download URL detected: {url}") + + # For these special cases, we'll use the browser to navigate and intercept redirects + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + # Set up request interception to capture redirects + await page.route('**', lambda route: route.continue_()) + + # Listen for all responses + responses = [] + page.on('response', lambda response: responses.append(response)) + + try: + # Go to the URL + await page.goto(url, wait_until='networkidle', timeout=30000) + + # Check all responses for potential downloads + for response in responses: + # Look for content-disposition headers indicating a download + content_disposition = response.headers.get('Content-Disposition', '') + if 'attachment' in content_disposition or 'filename=' in content_disposition: + return response.url + + # Look for content-type headers indicating a file + content_type = response.headers.get('Content-Type', '') + if content_type and content_type != 'text/html' and not content_type.startswith('text/'): + return response.url + + # If no clear download was detected, 
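The bytes=0-0 probe used in get_file_size also works as a standalone helper; a sketch built on the module's requests and re imports, where the example URL is a placeholder.

def probe_content_length(url):
    headers = {"User-Agent": get_random_user_agent(), "Range": "bytes=0-0"}
    with requests.get(url, headers=headers, stream=True, timeout=10) as r:
        match = re.search(r"bytes 0-0/(\d+)", r.headers.get("Content-Range", ""))
        if match:
            return int(match.group(1))
        length = int(r.headers.get("Content-Length", 0))
        return length if length > 1 else None

# probe_content_length("https://example.edu/files/past_exam_2022.pdf")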
return the final URL + return page.url + except Exception as e: + logger.warning(f"Error extracting real download URL: {e}") + return url + else: + # Standard approach for normal URLs + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + response = await page.goto(url, wait_until='networkidle', timeout=30000) + if response and response.headers.get('location'): + return response.headers['location'] + return page.url + except Exception as e: + logger.error(f"Error extracting real download URL: {e}") + return url + + # IMPROVED: Enhanced exam links extraction method + async def get_edu_exam_links(self, url): + """Specialized method for educational exam websites that follows a common pattern.""" + try: + logger.info(f"Fetching exam links from {url}") + links = set() + + # First try with direct requests for speed (but with proper headers) + headers = { + "User-Agent": get_random_user_agent(), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Referer": "https://www.google.com/", + "DNT": "1" + } + + try: + response = requests.get(url, headers=headers, timeout=30) + + if response.status_code == 200: + # Parse with BeautifulSoup first for efficiency + soup = BeautifulSoup(response.text, "html.parser") + parsed_base = urlparse(url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + + # Look for all links + for a in soup.find_all("a", href=True): + href = a["href"] + full_url = urljoin(url, href) + + # Look for text clues + link_text = a.get_text().lower() + + # Special patterns for exam sites (expanded list) + url_patterns = [ + "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", + "/test/", "/download/", "/files/", "/assignments/", + "paper_", "question_", "exam_", "test_", "past_", + "assignment_", "sample_", "study_material", "notes_", + "/resource/", "/subject/", "/course/", "/material/" + ] + + text_patterns = [ + "exam", "paper", "test", "question", "past", "download", + "assignment", "sample", "study", "material", "notes", + "subject", "course", "resource", "pdf", "document", + "view", "open", "get", "solution", "answer" + ] + + # Check URL for patterns + if any(pattern in full_url.lower() for pattern in url_patterns): + links.add(full_url) + continue + + # Check link text for patterns + if any(pattern in link_text for pattern in text_patterns): + links.add(full_url) + continue + + # Check for common file extensions + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(full_url) + + # Check for download script parameters + if "Action=downloadfile" in url or "fname=" in url: + links.add(url) # Add the URL itself as it's a download link + except Exception as e: + logger.warning(f"Request-based extraction failed: {e}") + + # Browser-based approach for more thorough extraction or if initial approach was inadequate + try: + # Check if we need to proceed with browser-based extraction + if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url: + logger.info("Using browser for enhanced link extraction") + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # Navigate to the page with more natural timing + await self.page.goto(url, timeout=45000, wait_until='networkidle') + await self.page.wait_for_timeout(random.randint(1000, 2000)) + + # Handle captchas if present + if not await self.handle_captcha(self.page): + logger.warning("Captcha detected, 
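Because DownloadManager is an asynchronous context manager, it has to be driven from an event loop; a minimal driver using only methods shown so far (the query string is an example, and error handling and Streamlit session-state plumbing are omitted).

async def quick_scan(query):
    async with DownloadManager(query=query, num_results=3, use_stealth=True) as dm:
        for url in await dm.search_bing():
            real_url = await dm.extract_real_download_url(url)
            size = await dm.get_file_size(real_url)
            print(size, real_url)

# asyncio.run(quick_scan("past exam papers pdf"))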
extraction may be limited") + + # Get base URL for resolving relative links + parsed_base = urlparse(url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + + # Perform natural scrolling to trigger lazy-loaded content + page_height = await self.page.evaluate("document.body.scrollHeight") + viewport_height = await self.page.evaluate("window.innerHeight") + + for scroll_pos in range(0, page_height, viewport_height // 2): + await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") + await self.page.wait_for_timeout(random.randint(300, 800)) + + # Scroll back to top + await self.page.evaluate("window.scrollTo(0, 0)") + await self.page.wait_for_timeout(500) + + # Extract all links with Playwright (better than just anchor tags) + all_links = await self.page.evaluate(""" + () => { + const results = []; + + // Get all anchor tags + const anchors = document.querySelectorAll('a[href]'); + for (const a of anchors) { + if (a.href) { + results.push({ + href: a.href, + text: a.innerText || a.textContent || '', + isButton: a.classList.contains('btn') || a.role === 'button' + }); + } + } + + // Get buttons that might contain links + const buttons = document.querySelectorAll('button'); + for (const btn of buttons) { + const onclick = btn.getAttribute('onclick') || ''; + if (onclick.includes('window.location') || onclick.includes('download')) { + results.push({ + href: '#button', + text: btn.innerText || btn.textContent || '', + isButton: true, + onclick: onclick + }); + } + } + + return results; + } + """) + + # Process the extracted links + for link_info in all_links: + href = link_info.get('href', '') + text = link_info.get('text', '').lower() + + if href and href != '#button': + # Check URL patterns + url_patterns = [ + "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", + "/test/", "/download/", "/files/", "/assignments/", + "paper_", "question_", "exam_", "test_", "past_", + "assignment_", "sample_", "study_material", "notes_" + ] + + # Check text patterns + text_patterns = [ + "exam", "paper", "test", "question", "past", "download", + "assignment", "sample", "study", "material", "notes", + "pdf", "document", "view", "open", "solution" + ] + + if any(pattern in href.lower() for pattern in url_patterns) or \ + any(pattern in text for pattern in text_patterns) or \ + any(href.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(href) + + # Check for download links in the page + download_links = await self.page.evaluate(""" + () => { + // Find all links that might be download links + const links = Array.from(document.querySelectorAll('a[href]')); + return links + .filter(a => { + const href = a.href.toLowerCase(); + return href.includes('download') || + href.includes('getfile') || + href.includes('view.php') || + href.includes('action=downloadfile') || + href.includes('fname='); + }) + .map(a => a.href); + } + """) + + for dl_link in download_links: + links.add(dl_link) + + # Check for ASP.NET specific elements that might contain exam links + grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') + for grid in grid_elements: + grid_links = await grid.query_selector_all('a[href]') + for a in grid_links: + href = await a.get_attribute('href') + text = await a.text_content() + + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + links.add(full_url) + + # Try clicking pagination controls to reveal more content + pagination_buttons = 
await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') + for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons + try: + # Check if this is a numeric pagination button (more likely to be useful) + button_text = await button.text_content() + if button_text and button_text.strip().isdigit(): + logger.info(f"Clicking pagination button: {button_text}") + await button.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Extract links from this page + new_page_links = await self.page.evaluate(""" + () => { + return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); + } + """) + + for href in new_page_links: + if href and not href.startswith('javascript:'): + if any(pattern in href.lower() for pattern in url_patterns) or \ + any(href.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(href) + except Exception as e: + logger.warning(f"Error clicking pagination button: {e}") + + # Try clicking any controls that might reveal more exam links (more focused approach) + show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') + for button in show_buttons: + button_text = (await button.text_content() or "").lower() + button_value = (await button.get_attribute("value") or "").lower() + button_id = (await button.get_attribute("id") or "").lower() + + # Look for buttons that seem likely to reveal file lists + promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", + "download", "resource", "material", "browse", "file"] + + if any(term in button_text or term in button_value or term in button_id + for term in promising_terms): + try: + logger.info(f"Clicking button: {button_text or button_value}") + await button.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Get any new links that appeared + new_links = await self.page.query_selector_all('a[href]') + for a in new_links: + href = await a.get_attribute('href') + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + + # Focus on file extensions and patterns + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ + any(pattern in full_url.lower() for pattern in url_patterns): + links.add(full_url) + except Exception as e: + logger.warning(f"Error clicking button: {e}") + + # Special handling for ASP.NET PostBack links + try: + # Find and interact with ASP.NET __doPostBack elements + postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') + for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks + try: + onclick = await element.get_attribute('onclick') + if onclick and '__doPostBack' in onclick: + element_text = await element.text_content() + + # Only interact with elements that seem likely to contain exam links + promising_terms = ["show", "view", "list", "exam", "paper", "test", + "download", "resource", "material"] + + if any(term in element_text.lower() for term in promising_terms): + logger.info(f"Clicking ASP.NET postback element: {element_text}") + + # Click the element + await element.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=10000) + + # Extract any new links + new_links = await 
self.page.query_selector_all('a[href]') + for a in new_links: + href = await a.get_attribute('href') + if href: + full_url = href if href.startswith('http') else urljoin(url, href) + if any(full_url.lower().endswith(ext) for ext in + ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + links.add(full_url) + except Exception as e: + logger.warning(f"Error interacting with postback element: {e}") + except Exception as e: + logger.warning(f"Error during postback handling: {e}") + + except Exception as e: + logger.error(f"Browser-based extraction failed: {e}") + + # Filter links to likely contain exam documents + filtered_links = [] + for link in links: + # Common file extensions for exam documents + if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): + filtered_links.append(link) + continue + + # Common paths for exam documents + if any(pattern in link.lower() for pattern in [ + "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", + "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", + "/resource/", "/material/", "/notes/", "/subjectmaterial/" + ]): + filtered_links.append(link) + continue + + # Check for download links (these may not have obvious extensions) + if is_download_link(link): + filtered_links.append(link) + + logger.info(f"Found {len(filtered_links)} potential exam document links") + return filtered_links + + except Exception as e: + logger.error(f"Error getting exam links: {e}") + return [] + + async def discover_hidden_links(self, page): + """Discover hidden links that might be in JavaScript, iframes, or dynamic content""" + hidden_links = set() + + # Execute JavaScript to find links in script tags and data attributes + js_links = await page.evaluate(""" + () => { + const links = new Set(); + + // Extract URLs from script tags + const scripts = document.querySelectorAll('script'); + for (const script of scripts) { + const content = script.textContent || ''; + const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || []; + for (let match of urlMatches) { + links.add(match.replace(/["']/g, '')); + } + } + + // Look for download-related variables in scripts + for (const script of scripts) { + const content = script.textContent || ''; + // Look for common patterns for file URLs in JavaScript + if (content.includes('downloadURL') || content.includes('fileURL') || + content.includes('pdfURL') || content.includes('documentURL')) { + + // Extract potential URLs + const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || []; + for (let match of potentialUrls) { + const url = match.replace(/["']/g, ''); + // Try to resolve relative URLs + if (url.startsWith('/') || !url.includes('://')) { + if (url.startsWith('/')) { + links.add(window.location.origin + url); + } else { + // Handle relative paths more carefully + const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); + links.add(base + url); + } + } else if (url.startsWith('http')) { + links.add(url); + } + } + } + } + + // Check for links in data attributes + const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]'); + for (const el of elements) { + for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) { + const val = el.getAttribute(attr); + if (val) { + // Try to resolve relative URLs + if (val.startsWith('/')) { + 
links.add(window.location.origin + val); + } else if (val.startsWith('http')) { + links.add(val); + } else if (!val.startsWith('javascript:') && !val.startsWith('#')) { + // Handle relative paths + const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); + links.add(base + val); + } + } + } + } + + // Look for URLs in inline event handlers + const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]'); + for (const el of clickableElements) { + for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) { + const val = el.getAttribute(attr); + if (val) { + // Check for JavaScript URLs with window.location + if (val.includes('window.location') || val.includes('document.location')) { + const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/); + if (urlMatch && urlMatch[1]) { + const url = urlMatch[1]; + if (url.startsWith('/')) { + links.add(window.location.origin + url); + } else if (url.startsWith('http')) { + links.add(url); + } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { + const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); + links.add(base + url); + } + } + } + + // Check for direct URLs in attributes + const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || []; + for (let match of urlMatches) { + links.add(match.replace(/["']/g, '')); + } + + // Check for download.php and similar patterns + if (val.includes('download.php') || val.includes('getfile.php') || + val.includes('Action=downloadfile') || val.includes('viewfile.php')) { + + // Handle both onclick handlers and direct hrefs + let url = ''; + if (attr === 'href') { + url = val; + } else { + // Extract URL from JavaScript + const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i); + if (jsUrlMatch) { + url = jsUrlMatch[1]; + } + } + + // Resolve URL if needed + if (url) { + if (url.startsWith('/')) { + links.add(window.location.origin + url); + } else if (url.startsWith('http')) { + links.add(url); + } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { + const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); + links.add(base + url); + } + } + } + } + } + } + + // Find PHP/ASP file download links + const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]'); + for (const link of fileLinks) { + links.add(link.href); + } + + return Array.from(links); + } + """) + + for link in js_links: + hidden_links.add(link) + + # Extract links from iframes + iframes = await page.query_selector_all('iframe') + for iframe in iframes: + try: + frame = await iframe.content_frame() + if frame: + iframe_links = await frame.evaluate(""" + () => { + return Array.from(document.querySelectorAll('a[href]')) + .map(a => a.href) + .filter(href => href.startsWith('http')); + } + """) + for link in iframe_links: + hidden_links.add(link) + except Exception as e: + logger.warning(f"Could not extract links from iframe: {e}") + + # Look for links in shadow DOM (used in modern web components) + shadow_links = await page.evaluate(""" + () => { + const links = new Set(); + + // Helper function to recursively process shadow roots + function processShadowRoot(root) { + if (!root) return; + + // Get links in this shadow root + const shadowLinks = root.querySelectorAll('a[href]'); + 
for (const link of shadowLinks) { + if (link.href && link.href.startsWith('http')) { + links.add(link.href); + } + } + + // Process nested shadow roots + const elements = root.querySelectorAll('*'); + for (const el of elements) { + if (el.shadowRoot) { + processShadowRoot(el.shadowRoot); + } + } + } + + // Find all shadow roots in the document + const elements = document.querySelectorAll('*'); + for (const el of elements) { + if (el.shadowRoot) { + processShadowRoot(el.shadowRoot); + } + } + + return Array.from(links); + } + """) + + for link in shadow_links: + hidden_links.add(link) + + # Look for download links in forms + form_links = await page.evaluate(""" + () => { + const links = new Set(); + + // Check for form actions that might be download endpoints + const forms = document.querySelectorAll('form'); + for (const form of forms) { + const action = form.action || ''; + if (action && ( + action.includes('download') || + action.includes('getfile') || + action.includes('viewfile') || + action.includes('Action=downloadfile') + )) { + // Collect input values that might be needed for the download + const inputs = {}; + const formInputs = form.querySelectorAll('input[name]'); + for (const input of formInputs) { + inputs[input.name] = input.value; + } + + // Store both the form action and any important inputs + links.add(action); + } + } + + return Array.from(links); + } + """) + + for link in form_links: + hidden_links.add(link) + + return hidden_links + + async def extract_downloadable_files(self, url, custom_ext_list): + found_files = [] + try: + # Normalize the URL to handle special cases + normalized_url = normalize_download_url(url) + + # Skip if we've already visited this URL + if normalized_url in self.visited_urls: + logger.info(f"Skipping already visited URL: {normalized_url}") + return [] + + # Mark this URL as visited + self.visited_urls.add(normalized_url) + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # First check if this is a direct download link (Action=downloadfile or fname parameter) + if is_download_link(normalized_url): + logger.info(f"Processing potential direct download link: {normalized_url}") + + # Try to extract the real download URL if needed + real_url = await self.extract_real_download_url(normalized_url) + + # Determine filename - for complex URLs this can be tricky + filename = os.path.basename(urlparse(real_url).path) + + # Handle URL-encoded filenames + if '%' in filename: + try: + filename = unquote(filename) + except Exception: + pass + + # For URLs with download parameters, try to extract filename from query + if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): + # Look for file parameter + params = parse_qs(urlparse(normalized_url).query) + + # Check common filename parameters + for param in ['file', 'filename', 'name', 'fname', 'f']: + if param in params and params[param]: + potential_filename = params[param][0] + if potential_filename and '/' not in potential_filename and '\\' not in potential_filename: + filename = os.path.basename(potential_filename) + break + + # If still no valid filename, use domain-based fallback + if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): + domain = get_domain(real_url) + # Try to determine file type from content-type or extension hints in URL + ext = '.pdf' # Default + for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: + if common_ext in normalized_url.lower(): + ext = 
common_ext + break + filename = f"file_from_{domain}{ext}" + + # Get file size + size_str = await self.get_file_size(real_url) + + # Add to found files + found_files.append({ + 'url': real_url, + 'filename': filename, + 'size': size_str, + 'metadata': {}, + 'download_url': normalized_url # Keep original URL for downloading + }) + + # For direct download links, we can return early + if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)): + return found_files + + # Special handling for educational exam sites + if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in + ["exam", "test", "pastpaper", "eduexp"]): + logger.info("Using specialized handler for educational exam site") + + # Get direct links to exam files + exam_links = await self.get_edu_exam_links(url) + + for link in exam_links: + # Try to resolve any redirection + real_url = await self.extract_real_download_url(link) + filename = os.path.basename(urlparse(real_url).path) + + # If filename is URL encoded (common with Chinese/international sites) + if '%' in filename: + try: + filename = unquote(filename) + except Exception: + pass + + # If filename is empty or invalid, create a sensible one + if not filename or filename == '/': + domain = get_domain(real_url) + ext = '.pdf' # Default + for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: + if common_ext in link.lower(): + ext = common_ext + break + filename = f"file_from_{domain}{ext}" + + # Get file size + size_str = await self.get_file_size(real_url) + + # Get metadata for PDFs + meta = {} + if real_url.lower().endswith('.pdf'): + try: + meta = await self.get_pdf_metadata(real_url) + except Exception: + pass + + found_files.append({ + 'url': real_url, + 'filename': filename, + 'size': size_str, + 'metadata': meta, + 'download_url': link # Store original link for downloading + }) + + # If we found exam files with the specialized method, return them + if found_files: + return found_files + + # Standard extraction method if specialized method didn't find files + response = await self.page.goto(url, timeout=30000, wait_until='networkidle') + if not response: + return [] + + # Check for captchas + if not await self.handle_captcha(self.page): + logger.warning("Captcha detected, file extraction may be limited") + + # Scroll through the page naturally to trigger lazy loading + await self.page.evaluate(""" + (async () => { + const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + const height = document.body.scrollHeight; + const scrollStep = Math.floor(window.innerHeight / 2); + + for (let i = 0; i < height; i += scrollStep) { + window.scrollTo(0, i); + await delay(100); + } + + window.scrollTo(0, 0); + })() + """) + await self.page.wait_for_timeout(1000) + + final_url = self.page.url + if '.php' in final_url or 'download' in final_url: + real_url = await self.extract_real_download_url(final_url) + if real_url != final_url: + # Try to detect the filename from headers or URL + response = await self.page.request.head(real_url, timeout=15000) + filename = None + + # Try to get from Content-Disposition header + content_disposition = response.headers.get('Content-Disposition', '') + if 'filename=' in content_disposition: + filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) + if filename_match: + filename = filename_match.group(1) + + # If not found in headers, use URL basename + if not filename: + filename = os.path.basename(urlparse(real_url).path) + if not 
filename or filename == '/': + # Generate a name based on domain + domain = get_domain(real_url) + ext = '.pdf' # Default + for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: + if common_ext in real_url.lower(): + ext = common_ext + break + filename = f"file_from_{domain}{ext}" + + found_files.append({ + 'url': real_url, + 'filename': filename, + 'size': await self.get_file_size(real_url), + 'metadata': {}, + 'download_url': final_url # Keep original URL for downloading + }) + return found_files + + await self.page.wait_for_load_state('networkidle', timeout=30000) + content = await self.page.content() + soup = BeautifulSoup(content, 'html.parser') + + default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', + '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', + '.pptx', '.odt', '.txt'] + all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) + + parsed_base = urlparse(final_url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + path_base = os.path.dirname(parsed_base.path) + + # Process all anchor tags + for a in soup.find_all('a', href=True): + href = a['href'].strip() + + if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower(): + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + real_url = await self.extract_real_download_url(full_url) + if real_url and real_url != full_url: + found_files.append({ + 'url': real_url, + 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', + 'size': await self.get_file_size(real_url), + 'metadata': {}, + 'download_url': full_url # Original URL for download + }) + continue + + if any(href.lower().endswith(ext) for ext in all_exts): + file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + size_str = await self.get_file_size(file_url) + meta = {} + if file_url.lower().endswith('.pdf'): + meta = await self.get_pdf_metadata(file_url) + found_files.append({ + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': size_str, + 'metadata': meta, + 'download_url': file_url # Same as URL for direct links + }) + + # Handle Google Drive links + elif ("drive.google.com" in href) or ("docs.google.com" in href): + file_id = None + for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: + match = re.search(pattern, href) + if match: + file_id = match.group(1) + break + if file_id: + # Get file info to determine type and view-only status + file_type, is_view_only = await self.get_google_drive_file_info(file_id) + + # Create a more informative filename based on info + filename = f"gdrive_{file_id}" + if file_type: + filename = f"{filename}.{file_type}" + + size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") + + found_files.append({ + 'url': href, # Use original URL + 'filename': filename, + 'size': size_str, + 'metadata': { + 'view_only': is_view_only, + 'file_type': file_type, + 'file_id': file_id + }, + 'download_url': href # Same as URL for Google Drive + }) + + # Also check for files in other elements (iframe, embed, object, etc.) 
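+            # (These elements are scanned because many sites embed documents rather
+            # than link to them: PDFs via <embed>/<object data=...>, media via
+            # <source src=...>, and whole viewers via <iframe src=...>. The same
+            # extension filter and relative-URL resolution applied to the <a href>
+            # links above is reused for their src/data attributes below.)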
+ other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) + for elem in other_elements: + src = elem.get('src') or elem.get('data') + if src and any(src.lower().endswith(ext) for ext in all_exts): + file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) + size_str = await self.get_file_size(file_url) + meta = {} + if file_url.lower().endswith('.pdf'): + meta = await self.get_pdf_metadata(file_url) + found_files.append({ + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': size_str, + 'metadata': meta, + 'download_url': file_url + }) + + # Check for file links in onclick attributes + onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') + for elem in onclick_elements: + onclick = await elem.get_attribute('onclick') + urls = re.findall(r'(https?://[^\'"]+)', onclick) + for url_match in urls: + if any(url_match.lower().endswith(ext) for ext in all_exts): + size_str = await self.get_file_size(url_match) + meta = {} + if url_match.lower().endswith('.pdf'): + meta = await self.get_pdf_metadata(url_match) + found_files.append({ + 'url': url_match, + 'filename': os.path.basename(url_match.split('?')[0]), + 'size': size_str, + 'metadata': meta, + 'download_url': url_match + }) + + # Also check for data-src and data-url attributes (common in lazy-loaded sites) + data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') + for elem in data_elements: + for attr in ['data-src', 'data-url', 'data-href', 'data-download']: + try: + value = await elem.get_attribute(attr) + if value and any(value.lower().endswith(ext) for ext in all_exts): + file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) + found_files.append({ + 'url': file_url, + 'filename': os.path.basename(file_url.split('?')[0]), + 'size': await self.get_file_size(file_url), + 'metadata': {}, + 'download_url': file_url + }) + except: + pass + + # Check script tags for JSON data that might contain file URLs + script_elements = soup.find_all('script', type='application/json') + for script in script_elements: + try: + json_data = json.loads(script.string) + # Look for URL patterns in the JSON data + def extract_urls_from_json(obj, urls_found=None): + if urls_found is None: + urls_found = [] + if isinstance(obj, dict): + for k, v in obj.items(): + # Check if any key contains url-like terms + url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] + if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): + urls_found.append(v) + else: + extract_urls_from_json(v, urls_found) + elif isinstance(obj, list): + for item in obj: + extract_urls_from_json(item, urls_found) + return urls_found + + json_urls = extract_urls_from_json(json_data) + for json_url in json_urls: + if any(json_url.lower().endswith(ext) for ext in all_exts): + found_files.append({ + 'url': json_url, + 'filename': os.path.basename(json_url.split('?')[0]), + 'size': await self.get_file_size(json_url), + 'metadata': {}, + 'download_url': json_url + }) + except: + pass + + # Check for hidden download buttons or forms + hidden_elements = await self.page.evaluate(""" + () => { + const results = []; + + // Check for hidden forms with download actions + const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); + for (const form of forms) { + const action = 
form.getAttribute('action') || ''; + results.push({ + type: 'form', + action: action, + inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { + return {name: input.name, value: input.value}; + }) + }); + } + + // Check for hidden download links/buttons + const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { + const style = window.getComputedStyle(a); + return (style.display === 'none' || style.visibility === 'hidden') && + (a.href.includes('download') || a.href.includes('file')); + }); + + for (const link of hiddenLinks) { + results.push({ + type: 'link', + href: link.href, + text: link.innerText || link.textContent + }); + } + + return results; + } + """) + + # Process hidden elements + for elem in hidden_elements: + if elem['type'] == 'link' and 'href' in elem: + href = elem['href'] + if any(href.lower().endswith(ext) for ext in all_exts): + found_files.append({ + 'url': href, + 'filename': os.path.basename(href.split('?')[0]), + 'size': await self.get_file_size(href), + 'metadata': {}, + 'download_url': href + }) + + # Check for hidden links that might be in JavaScript, iframes, or dynamic content + hidden_links = await self.discover_hidden_links(self.page) + for link in hidden_links: + if any(link.lower().endswith(ext) for ext in all_exts): + found_files.append({ + 'url': link, + 'filename': os.path.basename(link.split('?')[0]), + 'size': await self.get_file_size(link), + 'metadata': {}, + 'download_url': link + }) + + # Deduplicate files by URL + seen_urls = set() + unique_files = [] + for f in found_files: + if f['url'] not in seen_urls: + seen_urls.add(f['url']) + unique_files.append(f) + + return unique_files + except Exception as e: + logger.error(f"Error extracting files from {url}: {e}") + traceback.print_exc() + return [] + + async def download_file(self, file_info, save_dir, referer): + file_url = file_info.get('download_url', file_info['url']) # Use download_url if available + fname = file_info['filename'] + path = os.path.join(save_dir, fname) + base, ext = os.path.splitext(fname) + counter = 1 + while os.path.exists(path): + path = os.path.join(save_dir, f"{base}_{counter}{ext}") + counter += 1 + os.makedirs(save_dir, exist_ok=True) + + # Check if we've already downloaded this file + if file_url in self.downloaded_files: + logger.info(f"File already downloaded: {file_url}") + return None + + try: + # Special handling for Google Drive files + if "drive.google.com" in file_url or "docs.google.com" in file_url: + # Check if it's marked as view-only in metadata + is_view_only = file_info.get('metadata', {}).get('view_only', False) + + # For view-only files, try our most robust approach first + if is_view_only: + logger.info(f"Attempting to download view-only file: {file_url}") + result_path = await self.force_download_viewonly(file_info, path) + if result_path: + self.downloaded_files.add(file_url) + return result_path + + # If that failed, try the regular download approach + logger.info("Primary method failed, trying fallback methods") + + # Try regular download methods + success = await self.download_from_google_drive(file_url, path) + if success: + self.downloaded_files.add(file_url) + return path + + # If all methods failed for Google Drive, try one last approach + logger.warning("All standard methods failed, attempting force download") + result_path = await self.force_download_viewonly(file_info, path) + if result_path: + self.downloaded_files.add(file_url) + return result_path if result_path else None + + # Special handling 
for complex download URLs + if 'Action=downloadfile' in file_url or 'fname=' in file_url: + logger.info(f"Using browser download approach for complex URL: {file_url}") + + # For these URLs, we'll need to navigate to the page and handle the download + await self.rotate_proxy_if_needed() + + async with self.context.new_page() as page: + # Set up download event listener + download_promise = page.wait_for_event("download") + + # Navigate to the URL + await page.goto(file_url, timeout=60000) + + # Wait for the download to start + try: + download = await download_promise + await download.save_as(path) + + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + except Exception as e: + logger.error(f"Browser download failed: {e}") + + # If download didn't start automatically, try to find and click download buttons + download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]') + for button in download_buttons: + try: + await button.click() + try: + download = await download_promise + await download.save_as(path) + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + except: + pass + except: + continue + + # If browser approach failed, try direct request as last resort + logger.info("Browser approach failed, trying direct request") + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # Try with direct requests first (faster) + try: + headers = { + 'User-Agent': get_random_user_agent(), + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Referer': referer, + 'DNT': '1' + } + + with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: + if response.status_code == 200: + # Check content type to verify it's not HTML/error page + content_type = response.headers.get('Content-Type', '') + if 'text/html' in content_type and not file_url.endswith('.html'): + logger.warning(f"Received HTML instead of expected file: {file_url}") + else: + with open(path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + # Verify file was downloaded correctly + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + except Exception as e: + logger.warning(f"Direct download failed: {e}, trying browser approach") + + # Original code for non-Google Drive downloads using Playwright + async with self.context.new_page() as page: + headers = { + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Referer': referer + } + + # Try to download with timeout protection + try: + response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) + if response.status == 200: + content = await response.body() + with open(path, 'wb') as f: + f.write(content) + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + else: + logger.error(f"Download failed with status {response.status}: {file_url}") + + # Try to extract error information + error_info = await response.text() + logger.debug(f"Error response: {error_info[:200]}...") + + # Check if this might be a captcha or login issue + if detect_captcha(error_info): + logger.warning("Captcha detected during download") + # For HF Spaces, we can't implement browser-based captcha solving here + # Just log the issue for now + except PlaywrightTimeoutError: + logger.error(f"Download 
timed out after {self.download_timeout} seconds: {file_url}") + + # Try an alternative approach - using the browser's download manager + try: + logger.info("Trying browser download manager approach") + download_promise = page.wait_for_event("download") + await page.goto(file_url, timeout=60000) + + # Wait for download to start (with timeout) + download = await download_promise + await download.save_as(path) + + if os.path.exists(path) and os.path.getsize(path) > 0: + self.downloaded_files.add(file_url) + return path + except Exception as e: + logger.error(f"Browser download manager approach failed: {e}") + + return None + except Exception as e: + logger.error(f"Error downloading {file_url}: {e}") + return None + + async def force_download_viewonly(self, file_info, save_path): + """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs""" + try: + # Extract file ID + file_id = file_info.get('metadata', {}).get('file_id') + if not file_id: + url = file_info['url'] + for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: + match = re.search(pattern, url) + if match: + file_id = match.group(1) + break + + if not file_id: + logger.error("Could not extract file ID") + return None + + file_type = file_info.get('metadata', {}).get('file_type', 'pdf') + base, ext = os.path.splitext(save_path) + if not ext: + save_path = f"{base}.{file_type}" + + logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") + + # Create a dedicated browser instance with better resolution and stealth + browser_args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-site-isolation-trials', + '--disable-blink-features=AutomationControlled' # Anti-detection + ] + + browser = await self.playwright.chromium.launch( + headless=True, + args=browser_args + ) + + # Use higher resolution for better quality + context = await browser.new_context( + viewport={'width': 1600, 'height': 1200}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + device_scale_factor=2.0, + accept_downloads=True # Critical for the download workflow + ) + + # Add anti-detection script + await context.add_init_script(""" + () => { + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); + + // Change plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ + lengthComputable: true, + loaded: 100, + total: 100 + })) + }); + + // Handle languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en', 'es'] + }); + + // Modify hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => 4 + }); + } + """) + + page = await context.new_page() + + try: + # Go to the file view page + logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) + await page.wait_for_load_state('networkidle') + + # Check for any barriers or permissions issues + content = await page.content() + if "the owner has not granted you permission to" in content: + logger.warning("Permission denied error detected") + + # Randomized wait to appear more human-like + await page.wait_for_timeout(random.randint(3000, 7000)) + + # Create temp directory + temp_dir = tempfile.mkdtemp() + + 
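+                # What follows for PDFs: the Drive viewer only renders pages it has
+                # scrolled past, exposing each rendered page as a blob: <img>. The loop
+                # below therefore scrolls (End/PageDown plus random mouse movement)
+                # until the number of blob images stops growing or reaches the estimated
+                # page count, then injects jsPDF (1.5.3 from cdnjs), draws every blob
+                # image onto a canvas, and calls pdf.save() so Playwright's
+                # wait_for_event("download") can capture the generated file. If that
+                # fails, a per-page screenshot fallback assembles a PDF with reportlab.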
# Special handling for PDFs + if file_type.lower() == 'pdf': + # Use the improved scrolling and detection approach + + # Perform some natural mouse movements and scrolling + await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400)) + await page.wait_for_timeout(random.randint(500, 1000)) + + # Estimate number of pages + estimated_pages = await page.evaluate(""" + () => { + // Method 1: Check page counter text + const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { + const text = el.textContent || ''; + return /\\d+\\s*\\/\\s*\\d+/.test(text); + }); + + if (pageCounters.length > 0) { + const text = pageCounters[0].textContent || ''; + const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); + if (match && match[2]) return parseInt(match[2]); + } + + // Method 2: Check actual page elements + const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pageElements.length > 0) return pageElements.length; + + // Method 3: Look for page thumbnails + const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); + if (thumbnails.length > 0) return thumbnails.length; + + // Fallback: conservative guess + return 50; + } + """) + + logger.info(f"Estimated {estimated_pages} pages in PDF") + + # Initial scroll to trigger lazy loading + logger.info("Initial scroll to bottom to trigger lazy loading...") + await page.keyboard.press("End") + await page.wait_for_timeout(3000) + + # Scroll page by page to ensure all pages are loaded + logger.info("Scrolling page by page...") + max_attempts = min(estimated_pages * 3, 300) + attempt = 0 + prev_blob_count = 0 + + while attempt < max_attempts: + blob_count = await page.evaluate(""" + Array.from(document.getElementsByTagName('img')) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .length + """) + + logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") + + if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): + logger.info("All pages appear to be loaded.") + break + + # Alternate between PageDown and End keys for more natural scrolling + if attempt % 3 == 0: + await page.keyboard.press("End") + else: + await page.keyboard.press("PageDown") + + # Randomized wait times + await page.wait_for_timeout(random.randint(1500, 3000)) + + # Move mouse randomly to appear more human-like + if attempt % 4 == 0: + await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) + + prev_blob_count = blob_count + attempt += 1 + + # Extra wait to ensure everything is loaded + await page.wait_for_timeout(5000) + + # Set up download event listener for the PDF + download_promise = page.wait_for_event("download") + + # Use jsPDF to generate PDF from loaded pages + logger.info("Generating PDF from loaded pages...") + result = await page.evaluate(r''' + (function() { + return new Promise((resolve, reject) => { + let script = document.createElement("script"); + script.onload = function () { + try { + let pdf = new jsPDF(); + let imgs = Array.from(document.getElementsByTagName("img")) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .sort((a, b) => { + const rectA = a.getBoundingClientRect(); + const rectB = b.getBoundingClientRect(); + return rectA.top - rectB.top; + }); + + console.log(`Found ${imgs.length} valid page images to add to PDF`); + + let added = 0; + for (let i = 0; i < imgs.length; i++) { + let img = imgs[i]; + let canvas = document.createElement("canvas"); + let ctx = 
canvas.getContext("2d"); + canvas.width = img.width; + canvas.height = img.height; + ctx.drawImage(img, 0, 0, img.width, img.height); + let imgData = canvas.toDataURL("image/jpeg", 1.0); + + if (added > 0) { + pdf.addPage(); + } + + pdf.addImage(imgData, 'JPEG', 0, 0); + added++; + } + + pdf.save("download.pdf"); + resolve({success: true, pageCount: added}); + } catch (error) { + reject({success: false, error: error.toString()}); + } + }; + + script.onerror = function() { + reject({success: false, error: "Failed to load jsPDF library"}); + }; + + script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; + document.body.appendChild(script); + }); + })(); + ''') + + if not result.get('success', False): + logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") + + # Try fallback approach - screenshot method + logger.info("Trying fallback screenshot method...") + + # Navigate back to the first page + await page.evaluate(""" + () => { + // Find and click the "first page" button if available + const buttons = Array.from(document.querySelectorAll('button')); + const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); + if (firstPageBtn) firstPageBtn.click(); + } + """) + await page.wait_for_timeout(1000); + + # Create a PDF by taking screenshots of each page + screenshots = [] + current_page = 1 + max_pages = estimated_pages + + # Create a PDF using the reportlab package + while current_page <= max_pages: + screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") + + # Try to find the current page element + page_elem = await page.query_selector('.drive-viewer-paginated-page') + if page_elem: + await page_elem.screenshot(path=screenshot_path) + else: + # Fallback to full page screenshot + await page.screenshot(path=screenshot_path) + + screenshots.append(screenshot_path) + + # Try to navigate to next page + next_btn = await page.query_selector('button[aria-label="Next page"]') + if next_btn: + is_disabled = await next_btn.get_attribute('disabled') + if is_disabled: + logger.info(f"Reached end of document at page {current_page}") + break + + await next_btn.click() + await page.wait_for_timeout(1000) + current_page += 1 + else: + break + + # Create PDF from screenshots + if screenshots: + first_img = Image.open(screenshots[0]) + width, height = first_img.size + + c = canvas.Canvas(save_path, pagesize=(width, height)) + for screenshot in screenshots: + img = Image.open(screenshot) + c.drawImage(screenshot, 0, 0, width, height) + c.showPage() + c.save() + + # Clean up screenshots + for screenshot in screenshots: + os.remove(screenshot) + + return save_path + + return None + + logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") + + # Wait for the download and save it + download = await download_promise + await download.save_as(save_path) + + # Clean up temp directory + try: + os.rmdir(temp_dir) + except: + pass + + else: + # Non-PDF file handling + screenshot_path = os.path.join(temp_dir, "file.png") + await page.screenshot(path=screenshot_path) + + if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: + # For document types, try to export directly + await self.export_google_doc(file_id, file_type, save_path) + else: + # For other types, save the screenshot with appropriate extension + shutil.copy(screenshot_path, save_path) + + os.remove(screenshot_path) + + # Close browser + await browser.close() + + # Verify file exists and has content + if os.path.exists(save_path) and 
os.path.getsize(save_path) > 1000: + logger.info(f"Successfully downloaded file to {save_path}") + return save_path + else: + logger.error(f"Generated file is too small or missing: {save_path}") + return None + + except Exception as e: + logger.error(f"Error during force download: {e}") + if browser: + await browser.close() + return None + + except Exception as e: + logger.error(f"Force download preparation failed: {e}") + return None + + async def download_from_google_drive(self, url, save_path): + """Enhanced method to download from Google Drive with multiple fallback approaches""" + # Extract the file ID from different URL formats + file_id = None + url_patterns = [ + r'drive\.google\.com/file/d/([^/]+)', + r'drive\.google\.com/open\?id=([^&]+)', + r'docs\.google\.com/\w+/d/([^/]+)', + r'id=([^&]+)', + r'drive\.google\.com/uc\?id=([^&]+)', + ] + + for pattern in url_patterns: + match = re.search(pattern, url) + if match: + file_id = match.group(1) + break + + if not file_id: + logger.error(f"Could not extract file ID from URL: {url}") + return False + + # Determine file type first (important for handling different file types) + file_type, is_view_only = await self.get_google_drive_file_info(file_id) + logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") + + base, ext = os.path.splitext(save_path) + if not ext and file_type: + # Add the correct extension if missing + save_path = f"{base}.{file_type}" + + # For view-only files, use specialized approaches + if is_view_only: + # Approach 1: For PDFs, use the JS method + if file_type == 'pdf': + success = await self.download_viewonly_pdf_with_js(file_id, save_path) + if success: + return True + + # Approach 2: For Google Docs, Sheets, etc., use export API + if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: + success = await self.export_google_doc(file_id, file_type, save_path) + if success: + return True + + # Approach 3: Try the direct screenshot method for any view-only file + success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type) + if success: + return True + + # Try standard approaches for non-view-only files + try: + # Try direct download link first (fastest) + direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" + + # Add anti-bot headers + headers = { + 'User-Agent': get_random_user_agent(), + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Referer': 'https://drive.google.com/', + 'DNT': '1' + } + + # Try with streaming to handle larger files + with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: + if r.status_code == 200: + # Check if we got HTML instead of the file + content_type = r.headers.get('Content-Type', '') + if 'text/html' in content_type and not file_id.endswith('.html'): + logger.warning("Received HTML instead of file, trying with session cookies") + else: + # Looks like we got the actual file + with open(save_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + # Verify file exists and has content + if os.path.exists(save_path) and os.path.getsize(save_path) > 0: + logger.info("Direct download successful") + return True + + # Try with requests and session cookies + session = requests.Session() + session.headers.update({'User-Agent': get_random_user_agent()}) + + # Visit the page first to get cookies + session.get(f"https://drive.google.com/file/d/{file_id}/view", 
timeout=30) + + # Try download + url = f"https://drive.google.com/uc?id={file_id}&export=download" + response = session.get(url, stream=True, timeout=30) + + # Check for confirmation token + confirmation_token = None + for k, v in response.cookies.items(): + if k.startswith('download_warning'): + confirmation_token = v + break + + # Use confirmation token if found + if confirmation_token: + url = f"{url}&confirm={confirmation_token}" + response = session.get(url, stream=True, timeout=60) + + # Check if we're getting HTML instead of the file + content_type = response.headers.get('Content-Type', '') + if 'text/html' in content_type: + logger.warning("Received HTML instead of file - likely download restriction") + else: + with open(save_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=1024*1024): + if chunk: + f.write(chunk) + + if os.path.exists(save_path) and os.path.getsize(save_path) > 0: + with open(save_path, 'rb') as f: + content = f.read(100) + if b'<!DOCTYPE html>' not in content: + logger.info("Successfully downloaded with requests session") + return True + except Exception as e: + logger.warning(f"Requests session download failed: {e}") + + # Try browser-based approach as last resort + try: + async with self.context.new_page() as page: + # Visit the file view page first to get cookies + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) + await page.wait_for_timeout(3000) + + # Set up download event listener + download_promise = page.wait_for_event("download") + + # Try to trigger the download button click + download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') + if download_button: + await download_button.click() + + # Wait for download to start + try: + download = await download_promise + await download.save_as(save_path) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + except Exception as e: + logger.error(f"Error during browser download: {e}") + return False + else: + # Try the export download URL + await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) + + # Look for and click any download buttons or links + download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') + for elem in download_elements: + try: + await elem.click() + # Wait a bit to see if download starts + try: + download = await download_promise + await download.save_as(save_path) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + except: + pass + except: + continue + except Exception as e: + logger.error(f"Browser-based download attempt failed: {e}") + + logger.warning("All standard download methods failed") + return False + + async def download_viewonly_pdf_with_js(self, file_id, save_path): + """Download view-only PDF using the enhanced blob image caching technique""" + try: + # Create a dedicated browser instance with stealth capabilities + browser_args = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-web-security', + '--disable-blink-features=AutomationControlled' # Anti-detection + ] + + browser = await self.playwright.chromium.launch( + headless=True, + args=browser_args + ) + + # Setup stealth context + context = await browser.new_context( + viewport={'width': 1600, 'height': 1200}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 
Safari/537.36", + accept_downloads=True, # Critical for handling the download event + ignore_https_errors=True + ) + + # Add stealth script + await context.add_init_script(""" + () => { + Object.defineProperty(navigator, 'webdriver', { + get: () => false, + }); + + // Change plugins and languages to appear more human + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5].map(() => ({ + lengthComputable: true, + loaded: 100, + total: 100 + })) + }); + + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en', 'es'] + }); + } + """) + + page = await context.new_page() + + try: + # Step 1: Navigate to the file with human-like behavior + logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) + await page.wait_for_load_state('networkidle') + + # Perform human-like interactions + await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) + await page.wait_for_timeout(random.randint(2000, 5000)) + + # Step 2: Estimate the number of pages + estimated_pages = await page.evaluate(""" + () => { + // Look for page counter in the interface + const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { + const text = el.textContent || ''; + return /\\d+\\s*\\/\\s*\\d+/.test(text); + }); + + if (pageCounters.length > 0) { + const text = pageCounters[0].textContent || ''; + const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); + if (match && match[2]) return parseInt(match[2]); + } + + // If we can't find a counter, check actual pages + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pages.length > 0) return pages.length; + + // Default to a reasonable number if we can't determine + return 50; + } + """) + + logger.info(f"Estimated number of pages: {estimated_pages}") + + # Step 3: Initial scroll to trigger loading + logger.info("Initial scroll to bottom to trigger lazy loading...") + await page.keyboard.press("End") + await page.wait_for_timeout(3000) + + # Step 4: Wait for all pages to load with better feedback and randomization + logger.info("Scrolling through document to load all pages...") + max_attempts = min(estimated_pages * 3, 300) + attempt = 0 + prev_blob_count = 0 + consecutive_same_count = 0 + + while attempt < max_attempts: + # Count blob images (which are the PDF pages) + blob_count = await page.evaluate(""" + Array.from(document.getElementsByTagName('img')) + .filter(img => img.src.startsWith('blob:') && img.width > 100) + .length + """) + + logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") + + # Check if we've loaded all pages or if we're stuck + if blob_count >= estimated_pages: + logger.info(f"All {estimated_pages} pages appear to be loaded.") + break + + if blob_count == prev_blob_count: + consecutive_same_count += 1 + if consecutive_same_count >= 5 and blob_count > 0: + logger.info(f"No new pages loaded after {consecutive_same_count} attempts. 
Assuming all available pages ({blob_count}) are loaded.") + break + else: + consecutive_same_count = 0 + + # Mix up the scrolling approach for more human-like behavior + scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) + + if scroll_action == "PageDown": + await page.keyboard.press("PageDown") + elif scroll_action == "End": + await page.keyboard.press("End") + elif scroll_action == "ArrowDown": + # Press arrow down multiple times + for _ in range(random.randint(5, 15)): + await page.keyboard.press("ArrowDown") + await page.wait_for_timeout(random.randint(50, 150)) + else: # mouse + # Scroll using mouse wheel + current_y = random.randint(300, 700) + await page.mouse.move(x=random.randint(300, 800), y=current_y) + await page.mouse.wheel(0, random.randint(300, 800)) + + # Random wait between scrolls + await page.wait_for_timeout(random.randint(1000, 3000)) + + prev_blob_count = blob_count + attempt += 1 + + # Extra wait to ensure everything is fully loaded + await page.wait_for_timeout(5000) + + # Step 5: Set up a download event listener + download_promise = page.wait_for_event("download") + + # Step 6: Inject the jsPDF script to generate PDF + logger.info("Generating PDF from loaded pages...") + result = await page.evaluate(r''' + (function() { + return new Promise((resolve, reject) => { + let script = document.createElement("script"); + script.onload = function () { + try { + let pdf = new jsPDF(); + let imgs = document.getElementsByTagName("img"); + let validImages = []; + + // First collect all valid blob images + for (let i = 0; i < imgs.length; i++) { + let img = imgs[i]; + if (!/^blob:/.test(img.src)) continue; + if (img.width < 100 || img.height < 100) continue; + validImages.push(img); + } + + // Sort by position in the document + validImages.sort((a, b) => { + const rectA = a.getBoundingClientRect(); + const rectB = b.getBoundingClientRect(); + return rectA.top - rectB.top; + }); + + console.log(`Found ${validImages.length} valid page images to add to PDF`); + + let added = 0; + // Process each image as a page + for (let i = 0; i < validImages.length; i++) { + let img = validImages[i]; + let canvas = document.createElement("canvas"); + let ctx = canvas.getContext("2d"); + canvas.width = img.width; + canvas.height = img.height; + ctx.drawImage(img, 0, 0, img.width, img.height); + let imgData = canvas.toDataURL("image/jpeg", 1.0); + + if (added > 0) { + pdf.addPage(); + } + + pdf.addImage(imgData, 'JPEG', 0, 0); + added++; + } + + pdf.save("download.pdf"); + resolve({success: true, pageCount: added}); + } catch (error) { + reject({success: false, error: error.toString()}); + } + }; + + script.onerror = function() { + reject({success: false, error: "Failed to load jsPDF library"}); + }; + + // Use a reliable CDN + script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; + document.body.appendChild(script); + }); + })(); + ''') + + if not result.get('success'): + logger.error(f"Error in PDF generation: {result.get('error')}") + return False + + logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") + + # Step 7: Wait for the download to complete and save the file + download = await download_promise + + # Step 8: Save the downloaded file to the specified path + await download.save_as(save_path) + logger.info(f"Successfully saved PDF to {save_path}") + + return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 + + finally: + await browser.close() + + except Exception as e: + logger.error(f"Error in 
viewonly PDF download process: {e}") + return False + + async def download_viewonly_with_screenshots(self, file_id, save_path, file_type): + """Download any view-only file by taking screenshots""" + try: + async with self.context.new_page() as page: + # Set high-resolution viewport + await page.set_viewport_size({"width": 1600, "height": 1200}) + + # Navigate to the file + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000) + + # Make sure the file is loaded + await page.wait_for_load_state('networkidle') + await page.wait_for_timeout(3000) # Extra time for rendering + + # Create directory for screenshots if multiple pages + base_dir = os.path.dirname(save_path) + base_name = os.path.splitext(os.path.basename(save_path))[0] + screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots") + os.makedirs(screenshots_dir, exist_ok=True) + + # Check if it's a multi-page document + is_multi_page = await page.evaluate(""" + () => { + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + return pages.length > 1; + } + """) + + if is_multi_page and file_type == 'pdf': + # For multi-page PDFs, take screenshots of each page + page_count = await page.evaluate(""" + async () => { + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + const container = document.querySelector('.drive-viewer-paginated-scrollable'); + + if (!container || pages.length === 0) return 0; + + // Scroll through to make sure all pages are loaded + const scrollHeight = container.scrollHeight; + const viewportHeight = container.clientHeight; + const scrollStep = viewportHeight; + + for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) { + container.scrollTo(0, scrollPos); + await delay(300); + } + + // Scroll back to top + container.scrollTo(0, 0); + await delay(300); + + return pages.length; + } + """) + + logger.info(f"Found {page_count} pages in document") + + # Take screenshots of each page + screenshots = [] + for i in range(page_count): + # Scroll to page + await page.evaluate(f""" + async () => {{ + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const pages = document.querySelectorAll('.drive-viewer-paginated-page'); + if (pages.length <= {i}) return false; + + pages[{i}].scrollIntoView(); + await delay(500); + return true; + }} + """) + + # Take screenshot + screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") + await page.screenshot(path=screenshot_path, clip={ + 'x': 0, + 'y': 0, + 'width': 1600, + 'height': 1200 + }) + screenshots.append(screenshot_path) + + # Combine screenshots into PDF + c = canvas.Canvas(save_path) + for screenshot in screenshots: + img = Image.open(screenshot) + width, height = img.size + + # Add page to PDF + c.setPageSize((width, height)) + c.drawImage(screenshot, 0, 0, width, height) + c.showPage() + + c.save() + + # Clean up screenshots + for screenshot in screenshots: + os.remove(screenshot) + os.rmdir(screenshots_dir) + + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + else: + # For single-page or non-PDF files, just take one screenshot + screenshot_path = os.path.join(screenshots_dir, "screenshot.png") + await page.screenshot(path=screenshot_path, fullPage=True) + + # Convert to requested format if needed + if file_type == 'pdf': + # Create PDF from screenshot + img = Image.open(screenshot_path) + width, height = img.size + + c = 
canvas.Canvas(save_path, pagesize=(width, height)) + c.drawImage(screenshot_path, 0, 0, width, height) + c.save() + else: + # Just copy the screenshot to the destination with proper extension + shutil.copy(screenshot_path, save_path) + + # Clean up + os.remove(screenshot_path) + os.rmdir(screenshots_dir) + + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + + except Exception as e: + logger.error(f"Error taking screenshots: {e}") + return False + + async def export_google_doc(self, file_id, file_type, save_path): + """Export Google Docs/Sheets/Slides to downloadable formats""" + try: + # Map file types to export formats + export_formats = { + 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx + 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx + 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx + 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + 'pdf': 'application/pdf', + } + + export_format = export_formats.get(file_type, 'application/pdf') + export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}" + + if 'sheet' in file_type or 'xlsx' in file_type: + export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" + elif 'ppt' in file_type or 'presentation' in file_type: + export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx" + elif file_type == 'pdf': + export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf" + + async with self.context.new_page() as page: + # Get cookies from the main view page first + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') + + # Now try the export + response = await page.goto(export_url, wait_until='networkidle') + + if response.status == 200: + content = await response.body() + with open(save_path, 'wb') as f: + f.write(content) + return os.path.exists(save_path) and os.path.getsize(save_path) > 0 + else: + logger.warning(f"Export failed with status {response.status}") + return False + + except Exception as e: + logger.error(f"Error exporting Google Doc: {e}") + return False + + async def get_google_drive_file_info(self, file_id): + """Get file type and view-only status from Google Drive""" + file_type = None + is_view_only = False + + try: + async with self.context.new_page() as page: + await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) + + # Check if view-only + view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') + is_view_only = view_only_text is not None + + # Check for Google Docs viewer + gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') + gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') + gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') + + if gdocs_viewer: + file_type = 'docx' + elif gsheets_viewer: + file_type = 'xlsx' + elif gslides_viewer: + file_type = 'pptx' + else: + # Check for PDF viewer + pdf_viewer = await page.query_selector('embed[type="application/pdf"]') + if pdf_viewer: + file_type = 'pdf' + else: + # Check for image viewer + img_viewer = await 
page.query_selector('img[src*="googleusercontent.com"]') + if img_viewer: + # Get image type from src + img_src = await img_viewer.get_attribute('src') + if 'jpg' in img_src or 'jpeg' in img_src: + file_type = 'jpg' + elif 'png' in img_src: + file_type = 'png' + else: + file_type = 'jpg' # Default to jpg + else: + # Generic file type fallback + file_type = 'pdf' # Default to PDF + + # If still no type, check filename + if not file_type: + title_element = await page.query_selector('div[role="heading"]') + if title_element: + title = await title_element.text_content() + if title: + ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) + if ext_match: + file_type = ext_match.group(1).lower() + + except Exception as e: + logger.error(f"Error getting Google Drive file info: {e}") + file_type = 'pdf' # Default to PDF if we can't determine + + return file_type, is_view_only + + # IMPROVED: Enhanced sublink extraction method + async def get_sublinks(self, url, limit=10000): + """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" + links = set() + try: + logger.info(f"Fetching sublinks from: {url}") + + # Check if this is a direct download link + if is_download_link(url): + logger.info(f"URL appears to be a direct download link: {url}") + links.add(url) + return list(links)[:limit] + + # Skip if we've already visited this URL + normalized_url = normalize_download_url(url) + if normalized_url in self.visited_urls: + logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}") + return list(links)[:limit] + + # Add to visited URLs + self.visited_urls.add(normalized_url) + + # Special handling for educational sites like phsms.cloud.ncnu.edu.tw + if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in + ["exam", "test", "pastpaper", "eduexp"]): + logger.info("Using specialized exam site sublink extraction") + edu_links = await self.get_edu_exam_links(url) + for link in edu_links: + links.add(link) + + # If we found a good number of links with the specialized method, return them + if len(links) > 5: + logger.info(f"Found {len(links)} sublinks with specialized method") + return list(links)[:limit] + + # Rotate proxy if needed + await self.rotate_proxy_if_needed() + + # Standard sublink extraction for all sites + try: + await self.page.goto(url, timeout=30000, wait_until='networkidle') + except Exception as e: + logger.warning(f"Error navigating to URL for sublink extraction: {e}") + # Continue with what we have, we'll try to extract links anyway + + # Get base URL for resolving relative links + parsed_base = urlparse(url) + base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" + path_base = os.path.dirname(parsed_base.path) + + # Perform initial scrolling to load lazy content + await self.page.evaluate(""" + async () => { + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const height = document.body.scrollHeight; + const step = Math.floor(window.innerHeight / 2); + + for (let i = 0; i < height; i += step) { + window.scrollTo(0, i); + await delay(150); + } + + window.scrollTo(0, 0); + } + """) + await self.page.wait_for_timeout(1000) + + # Check if page has ASP.NET elements which might need special handling + is_aspnet = await self.page.evaluate(''' + () => { + return document.querySelector('form#aspnetForm') !== null || + document.querySelector('input[name="__VIEWSTATE"]') !== null; + } + ''') + + if is_aspnet: + logger.info("Detected ASP.NET page, using enhanced extraction 
method") + + # Try to interact with ASP.NET controls that might reveal more links + # Look for dropdowns, buttons, and grid elements + dropdowns = await self.page.query_selector_all('select') + buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') + + # Try interacting with dropdowns first + for dropdown in dropdowns: + try: + # Get all options + options = await self.page.evaluate(''' + (dropdown) => { + return Array.from(dropdown.options).map(o => o.value); + } + ''', dropdown) + + # Try selecting each option + for option in options: + if option: + await dropdown.select_option(value=option) + await self.page.wait_for_timeout(1000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Extract any new links that appeared + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error interacting with dropdown: {e}") + + # Try clicking buttons (but avoid dangerous ones like "delete") + safe_buttons = [] + for button in buttons: + button_text = await button.text_content() or "" + button_value = await button.get_attribute("value") or "" + button_id = await button.get_attribute("id") or "" + combined_text = (button_text + button_value + button_id).lower() + + # Skip potentially destructive buttons + if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): + continue + + # Prioritize buttons that might show more content + if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): + safe_buttons.append(button) + + # Click the safe buttons + for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks + try: + await button.click() + await self.page.wait_for_timeout(1000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Extract any new links that appeared + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error clicking button: {e}") + + # Extract links from the initial page state + await self.extract_all_link_types(links, base_url, path_base) + + # Look specifically for links inside grid/table views which are common in ASP.NET applications + grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') + for cell in grid_cells: + try: + href = await cell.get_attribute('href') + if href: + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links.add(full_url) + except Exception as e: + logger.warning(f"Error extracting grid link: {e}") + + # Extract links from onclick attributes and javascript:__doPostBack calls + postback_links = await self.page.evaluate(''' + () => { + const results = []; + // Find elements with onclick containing __doPostBack + const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); + for (const el of elements) { + // Extract the postback target + const onclick = el.getAttribute('onclick') || ''; + const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); + if (match && match[1]) { + // Get the visible text to use as description + const text = el.innerText || el.textContent || 'Link'; + results.push({ + id: match[1], + text: text.trim() + }); + } + } + return results; + } + ''') + + # Try interacting with some of the postback links + for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions + try: + logger.info(f"Trying postback link: 
{postback['text']} ({postback['id']})") + await self.page.evaluate(f''' + () => {{ + if (typeof __doPostBack === 'function') {{ + __doPostBack('{postback["id"]}', ''); + }} + }} + ''') + await self.page.wait_for_timeout(1500) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Extract any new links that appeared + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error with postback: {e}") + + # Look for pagination controls and try to navigate through them + pagination_elements = await self.page.query_selector_all( + 'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]' + ) + + # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops) + for i in range(min(5, len(pagination_elements))): + try: + # Focus on elements that look like "next page" buttons + el = pagination_elements[i] + el_text = await el.text_content() or "" + + # Only click if this looks like a pagination control + if "next" in el_text.lower() or ">" == el_text.strip() or "โ" == el_text.strip(): + logger.info(f"Clicking pagination control: {el_text}") + await el.click() + await self.page.wait_for_timeout(2000) + await self.page.wait_for_load_state('networkidle', timeout=5000) + + # Get new links from this page + await self.extract_all_link_types(links, base_url, path_base) + except Exception as e: + logger.warning(f"Error clicking pagination: {e}") + + # Check for hidden links that might be revealed by JavaScript + hidden_links = await self.page.evaluate(""" + () => { + // Try to execute common JavaScript patterns that reveal hidden content + try { + // Common patterns used in websites to initially hide content + const hiddenContainers = document.querySelectorAll( + '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' + ); + + // Attempt to make them visible + hiddenContainers.forEach(el => { + el.style.display = 'block'; + el.style.visibility = 'visible'; + el.classList.remove('hidden', 'hide'); + }); + + // Return any newly visible links + return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); + } catch (e) { + return []; + } + } + """) + + # Add any newly discovered links + for href in hidden_links: + if href and not href.startswith('javascript:'): + links.add(href) + + # Find all download links + download_links = await self.page.evaluate(""" + () => { + return Array.from(document.querySelectorAll('a[href]')) + .filter(a => { + const href = a.href.toLowerCase(); + return href.includes('download') || + href.includes('file') || + href.includes('get') || + href.includes('view.php') || + href.includes('action=') || + href.includes('fname='); + }) + .map(a => a.href); + } + """) + + for download_link in download_links: + links.add(download_link) + + # Also check for hidden links in JavaScript, iframes, or dynamic content + js_links = await self.discover_hidden_links(self.page) + for link in js_links: + links.add(link) + + logger.info(f"Found {len(links)} sublinks") + + # Prioritize download links + prioritized_links = [] + normal_links = [] + + for link in links: + if is_download_link(link): + prioritized_links.append(link) + else: + normal_links.append(link) + + # Return prioritized links first, then normal links, up to the limit + result = prioritized_links + normal_links + return result[:limit] + + except Exception as e: + logger.error(f"Error getting sublinks from {url}: {e}") + return list(links)[:limit] # Return what we have so far + + async def 
extract_all_link_types(self, links_set, base_url, path_base): + """Extract all types of links from the current page""" + # Get all <a> tag links + a_links = await self.page.query_selector_all('a[href]') + for a in a_links: + try: + href = await a.get_attribute('href') + if href and not href.startswith('javascript:') and not href.startswith('#'): + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Get iframe sources + iframes = await self.page.query_selector_all('iframe[src]') + for iframe in iframes: + try: + src = await iframe.get_attribute('src') + if src and not src.startswith('javascript:') and not src.startswith('about:'): + full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Get links from onclick attributes that reference URLs + onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') + for el in onclick_elements: + try: + onclick = await el.get_attribute('onclick') + urls = re.findall(r'(https?://[^\'"]+)', onclick) + for url in urls: + links_set.add(url) + except Exception: + pass + + # Look for URLs in data-* attributes + data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') + for el in data_elements: + for attr in ['data-url', 'data-href', 'data-src']: + try: + value = await el.get_attribute(attr) + if value and not value.startswith('javascript:'): + full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Look for special anchor links that might not have href attributes + special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') + for anchor in special_anchors: + try: + href = await anchor.get_attribute('href') + if href and not href.startswith('javascript:') and not href.startswith('#'): + full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) + links_set.add(full_url) + except Exception: + pass + + # Extract links from JSON data embedded in the page + script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') + for script in script_elements: + try: + script_content = await script.text_content() + if script_content: + # Look for URLs in the JSON content + urls = re.findall(r'(https?://[^\'"]+)', script_content) + for url in urls: + links_set.add(url) + except Exception: + pass + + def resolve_relative_url(self, relative_url, base_url, path_base): + """Properly resolve relative URLs considering multiple formats""" + if relative_url.startswith('/'): + # Absolute path relative to domain + return f"{base_url}{relative_url}" + elif relative_url.startswith('./'): + # Explicit relative path + return f"{base_url}{path_base}/{relative_url[2:]}" + elif relative_url.startswith('../'): + # Parent directory + parent_path = '/'.join(path_base.split('/')[:-1]) + return f"{base_url}{parent_path}/{relative_url[3:]}" + else: + # Regular relative path + return f"{base_url}{path_base}/{relative_url}" + + async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): + if not custom_ext_list: + custom_ext_list = [] + progress_text = st.empty() + progress_bar = st.progress(0) + file_count_text = st.empty() + 
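+ # Outline of the deep search flow implemented in the try-block below:
+ #   1. Reset the visited-URL set and load the main page, detecting ASP.NET forms (__VIEWSTATE).
+ #   2. If the URL itself looks like a direct download, return it as a single result.
+ #   3. Extract downloadable files from the main page, then collect sublinks.
+ #   4. Visit each sublink; direct download links are recorded without a full page load.
+ #   5. Deduplicate the collected files by URL before returning.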
+ try: + # Reset the visited URLs for a fresh deep search + self.visited_urls = set() + + progress_text.text("Analyzing main page...") + # Special handling for ASP.NET pages + is_aspnet = False + try: + await self.page.goto(url, timeout=30000, wait_until='networkidle') + is_aspnet = await self.page.evaluate(''' + () => { + return document.querySelector('form#aspnetForm') !== null || + document.querySelector('input[name="__VIEWSTATE"]') !== null; + } + ''') + except Exception: + pass + + # Check if this URL is a direct download + if is_download_link(url): + progress_text.text("URL appears to be a direct download. Analyzing...") + + # Try to extract file directly + normalized_url = normalize_download_url(url) + file_info = { + 'url': normalized_url, + 'download_url': normalized_url, + 'filename': os.path.basename(urlparse(normalized_url).path) or 'download', + 'size': 'Unknown Size', + 'metadata': {} + } + + # Add to visited URLs + self.visited_urls.add(normalized_url) + progress_bar.progress(1.0) + return [file_info] + + # Extract files from main page + main_files = await self.extract_downloadable_files(url, custom_ext_list) + initial_count = len(main_files) + file_count_text.text(f"Found {initial_count} files on main page") + + # Get sublinks with enhanced method + progress_text.text("Getting sublinks...") + sublinks = await self.get_sublinks(url, sublink_limit) + total_links = len(sublinks) + progress_text.text(f"Found {total_links} sublinks to process") + + # Always include files from the main page, regardless of sublinks + all_files = main_files + + if not sublinks: + progress_bar.progress(1.0) + return all_files + + # Process each sublink + for i, sublink in enumerate(sublinks, 1): + progress = i / total_links + progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") + progress_bar.progress(progress) + + try: + # Check if this is a direct download link + if is_download_link(sublink): + # For download links, just add the link directly + normalized_url = normalize_download_url(sublink) + + # Skip if already visited + if normalized_url in self.visited_urls: + continue + + # Mark as visited + self.visited_urls.add(normalized_url) + + # Get file size if possible + size_str = await self.get_file_size(normalized_url) + + # Get filename, with fallback to domain-based name + filename = os.path.basename(urlparse(normalized_url).path) + if not filename or filename == '/' or '?' 
in filename: + domain = get_domain(normalized_url) + ext = '.pdf' # Default extension + for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']: + if common_ext in normalized_url.lower(): + ext = common_ext + break + filename = f"file_from_{domain}{ext}" + + # Add file to results + all_files.append({ + 'url': normalized_url, + 'download_url': normalized_url, + 'filename': filename, + 'size': size_str, + 'metadata': {} + }) + file_count_text.text(f"Found {len(all_files)} total files") + continue + + # For regular links, use a longer timeout for ASP.NET pages which can be slower + sub_timeout = timeout * 2 if is_aspnet else timeout + + # Skip already visited URLs + if sublink in self.visited_urls: + continue + + # Extract files from sublink + sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) + all_files.extend(sub_files) + file_count_text.text(f"Found {len(all_files)} total files") + except Exception as e: + logger.warning(f"Error processing sublink {sublink}: {e}") + + # Deduplicate files + seen_urls = set() + unique_files = [] + for f in all_files: + if f['url'] not in seen_urls: + seen_urls.add(f['url']) + unique_files.append(f) + + final_count = len(unique_files) + progress_text.text(f"Deep search complete!") + file_count_text.text(f"Found {final_count} unique files") + progress_bar.progress(1.0) + return unique_files + + except Exception as e: + logger.error(f"Deep search error: {e}") + progress_text.text(f"Error during deep search: {str(e)}") + return [] + + finally: + await asyncio.sleep(2) + if not st.session_state.get('keep_progress', False): + progress_text.empty() + progress_bar.empty() + +# -------------------- Main App -------------------- +def main(): + + # Custom CSS for better appearance + st.markdown(""" + <style> + .stTabs [data-baseweb="tab-list"] { + gap: 10px; + } + .stTabs [data-baseweb="tab"] { + height: 50px; + white-space: pre-wrap; + border-radius: 4px 4px 0px 0px; + padding: 10px 16px; + background-color: #f0f2f6; + } + .stTabs [aria-selected="true"] { + background-color: #ffffff !important; + border-bottom: 2px solid #4c78a8; + } + .stFileUploader > div > div > button { + width: 100%; + } + .main-header { + font-size: 2.5rem; + font-weight: 700; + margin-bottom: 10px; + } + .section-subheader { + font-size: 1.3rem; + font-weight: 600; + margin-top: 20px; + margin-bottom: 10px; + } + .info-text { + color: #6c757d; + font-size: 0.9rem; + } + .stButton>button { + width: 100%; + } + .result-card { + background-color: #f8f9fa; + border-radius: 6px; + padding: 16px; + margin-bottom: 12px; + border-left: 4px solid #4c78a8; + } + .sidebar-header { + font-size: 1.2rem; + font-weight: 600; + margin-bottom: 10px; + } + .sidebar-section { + margin-bottom: 20px; + } + </style> + """, unsafe_allow_html=True) + + # Initialize session state for storing files if 'files' not in st.session_state: st.session_state.files = [] if 'downloaded_paths' not in st.session_state: @@ -66,15 +3887,976 @@ def initialize_session_state(): st.session_state.proxy_string = None if 'stealth_mode' not in st.session_state: st.session_state.stealth_mode = True + + # ============================ + # SIDEBAR + # ============================ + with st.sidebar: + st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50) + st.markdown("<p class='sidebar-header'>Advanced File Downloader</p>", unsafe_allow_html=True) + + # Mode Selection + st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) + st.markdown("<p 
class='sidebar-header'>Mode</p>", unsafe_allow_html=True) + mode = st.radio( + "Select Mode", + ["Standard", "Education Mode", "Research Mode", "Media Mode"], + label_visibility="collapsed", + index=["Standard", "Education Mode", "Research Mode", "Media Mode"].index(st.session_state.mode), + horizontal=False + ) + + if mode != st.session_state.mode: + st.session_state.mode = mode + # Update mode-specific settings + if mode == "Education Mode": + st.session_state.custom_extensions = ".pdf,.doc,.docx,.ppt,.pptx" + st.session_state.prioritize_pdfs = True + elif mode == "Research Mode": + st.session_state.custom_extensions = ".pdf,.txt,.csv,.json,.xlsx" + st.session_state.prioritize_pdfs = True + elif mode == "Media Mode": + st.session_state.custom_extensions = ".jpg,.png,.mp3,.mp4,.avi,.mov" + st.session_state.prioritize_pdfs = False + + st.markdown(f"<div class='info-text'>Current: <b>{st.session_state.mode}</b></div>", unsafe_allow_html=True) + st.markdown("</div>", unsafe_allow_html=True) + + # Quick Settings + st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) + st.markdown("<p class='sidebar-header'>Quick Settings</p>", unsafe_allow_html=True) + + stealth_mode = st.checkbox("Stealth Mode", value=st.session_state.stealth_mode) + if stealth_mode != st.session_state.stealth_mode: + st.session_state.stealth_mode = stealth_mode + + use_proxy = st.checkbox("Use Proxy", value=st.session_state.use_proxy) + if use_proxy != st.session_state.use_proxy: + st.session_state.use_proxy = use_proxy + + if use_proxy: + proxy_string = st.text_input("Proxy Address", + placeholder="e.g., http://user:pass@host:port", + value=st.session_state.proxy_string or "") + if proxy_string != st.session_state.proxy_string: + st.session_state.proxy_string = proxy_string + + st.markdown("</div>", unsafe_allow_html=True) + + # Google Drive Integration + st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) + st.markdown("<p class='sidebar-header'>Google Drive</p>", unsafe_allow_html=True) + + if st.session_state.google_credentials: + st.success("โ Connected") + + drive_folder = st.text_input("Drive Folder", + value="File Downloader" if 'drive_folder' not in st.session_state else st.session_state.drive_folder) + if 'drive_folder' not in st.session_state or drive_folder != st.session_state.drive_folder: + st.session_state.drive_folder = drive_folder + + if st.button("Disconnect Drive"): + st.session_state.google_credentials = None + st.rerun() + else: + st.warning("โ ๏ธ Not Connected") + if st.button("Connect Google Drive"): + auth_url = get_google_auth_url() + st.markdown(f"[Click here to authorize]({auth_url})") + auth_code = st.text_input("Enter authorization code:") + + if auth_code: + with st.spinner("Connecting to Google Drive..."): + credentials, status_msg = exchange_code_for_credentials(auth_code) + if credentials: + st.session_state.google_credentials = credentials + st.success(status_msg) + st.rerun() + else: + st.error(status_msg) + + st.markdown("</div>", unsafe_allow_html=True) + + # Preset buttons for common EDU sites + if st.session_state.mode == "Education Mode": + st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) + st.markdown("<p class='sidebar-header'>Quick Access</p>", unsafe_allow_html=True) + st.markdown("<div class='info-text'>Common Educational Sites</div>", unsafe_allow_html=True) + + if st.button("Past Exam Papers"): + st.session_state.preset_url = "https://pastpapers.example.edu" + st.session_state.search_method = "Exam Site Mode" + st.rerun() + 
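+ # NOTE: the preset URLs in this Quick Access section are placeholders
+ # (example.edu / example.org) and need to point at real sites to be useful.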
+ if st.button("Open Course Materials"): + st.session_state.preset_url = "https://opencourseware.example.edu" + st.session_state.search_method = "Deep Search" + st.rerun() + + if st.button("Research Papers"): + st.session_state.preset_url = "https://papers.example.org" + st.session_state.search_method = "Deep Search" + st.rerun() + + st.markdown("</div>", unsafe_allow_html=True) + + # Tool status + st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True) + st.markdown("<p class='sidebar-header'>System Status</p>", unsafe_allow_html=True) + + col1, col2 = st.columns(2) + with col1: + st.markdown("<div class='info-text'>Search</div>", unsafe_allow_html=True) + st.markdown("<div style='color: green; font-weight: bold;'>Active</div>", unsafe_allow_html=True) + with col2: + st.markdown("<div class='info-text'>Browser</div>", unsafe_allow_html=True) + st.markdown("<div style='color: green; font-weight: bold;'>Ready</div>", unsafe_allow_html=True) + + if st.button("Install Dependencies"): + with st.spinner("Installing Playwright dependencies..."): + install_playwright_dependencies() + + st.markdown("</div>", unsafe_allow_html=True) + + # App info + st.markdown("<div class='sidebar-section' style='position: absolute; bottom: 20px; width: 90%;'>", unsafe_allow_html=True) + st.markdown("<div class='info-text' style='text-align: center;'>Version 2.0 โข March 2025</div>", unsafe_allow_html=True) + st.markdown("</div>", unsafe_allow_html=True) -# Import the UI code while keeping the modular structure -from ui import setup_ui, create_sidebar, display_file_results, handle_downloads, handle_google_drive_upload -from main import main as app_main - -# Set up and run the application -def main(): - initialize_session_state() - app_main() + # ============================ + # MAIN CONTENT AREA + # ============================ + + # Header section + col1, col2 = st.columns([5, 1]) + with col1: + st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True) + with col2: + st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70) + + mode_descriptions = { + "Standard": "A versatile tool for discovering and downloading files from any website.", + "Education Mode": "Optimized for educational resources, exams, and academic materials.", + "Research Mode": "Focused on research papers, datasets, and academic publications.", + "Media Mode": "Enhanced for finding and downloading images, videos, and audio files." 
+ } + + st.markdown(f"<p class='info-text'>{mode_descriptions[st.session_state.mode]}</p>", unsafe_allow_html=True) + + # Main tabs + tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"]) + + # Tab 1: Search & Download + with tabs[0]: + st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True) + + col1, col2 = st.columns([3, 1]) + with col1: + url = st.text_input("Enter a URL to search for downloadable files:", + placeholder="e.g., https://example.com/resources", + value=st.session_state.get('preset_url', '')) + with col2: + # Initialize search_method with either session state or default value + initial_search_method = st.session_state.get('search_method', "Deep Search") + search_method = st.selectbox("Search Method", + ["Deep Search", "Quick Search", "Exam Site Mode"], + index=["Deep Search", "Quick Search", "Exam Site Mode"].index(initial_search_method)) + # Update session state when changed + if search_method != st.session_state.get('search_method'): + st.session_state.search_method = search_method + + # Advanced options in an expander + with st.expander("Search Options", expanded=False): + col1, col2, col3 = st.columns(3) + with col1: + depth = st.slider("Search Depth", min_value=1, max_value=5, value=2, + help="Higher values will search more links but take longer") + prioritize_pdfs = st.checkbox("Prioritize PDFs", + value=st.session_state.get('prioritize_pdfs', True), + help="Focus on finding PDF files first") + with col2: + timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60) + follow_subdomains = st.checkbox("Follow Subdomains", value=True, + help="Include links from subdomains in the search") + with col3: + # Default extensions based on mode + default_extensions = { + "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip", + "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx", + "Research Mode": ".pdf,.txt,.csv,.json,.xlsx", + "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov" + } + + custom_extensions = st.text_area( + "Custom File Extensions", + value=st.session_state.get('custom_extensions', default_extensions[st.session_state.mode]), + help="Comma-separated list of file extensions to look for" + ) + + # Update session state when extensions changed + if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions: + st.session_state.custom_extensions = custom_extensions + + search_col1, search_col2 = st.columns([4, 1]) + with search_col1: + search_button = st.button("๐ Start Search", use_container_width=True) + with search_col2: + clear_button = st.button("๐งน Clear Results", use_container_width=True) + + # File results section + if st.session_state.files: + st.markdown("<h3 class='section-subheader'>Found Files</h3>", unsafe_allow_html=True) + + # File filtering options + filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1]) + with filter_col1: + file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.") + with filter_col2: + sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"]) + with filter_col3: + show_only_pdfs = st.checkbox("PDFs Only", value=False) + + # Sort files based on selection + sorted_files = list(st.session_state.files) + if sort_option == "Name": + sorted_files.sort(key=lambda x: x['filename']) + elif sort_option == "Size (Largest)": + # Convert size strings to comparable values + def parse_size(size_str): + if 'Unknown' in size_str: + 
return 0 + try: + value = float(size_str.split(' ')[0]) + unit = size_str.split(' ')[1] + multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4} + return value * multipliers.get(unit, 0) + except: + return 0 + + sorted_files.sort(key=lambda x: parse_size(x['size']), reverse=True) + elif sort_option == "Size (Smallest)": + def parse_size(size_str): + if 'Unknown' in size_str: + return float('inf') + try: + value = float(size_str.split(' ')[0]) + unit = size_str.split(' ')[1] + multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4} + return value * multipliers.get(unit, 0) + except: + return float('inf') + + sorted_files.sort(key=lambda x: parse_size(x['size'])) + + # File list with selection + file_container = st.container() + with file_container: + selected_files = [] + displayed_files = [] + + for i, file in enumerate(sorted_files): + # Apply filters + if file_filter and file_filter.lower() not in file['filename'].lower(): + continue + if show_only_pdfs and not file['filename'].lower().endswith('.pdf'): + continue + + displayed_files.append(i) + with st.container(): + col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1]) + with col1: + selected = st.checkbox("Select", key=f"select_{i}", value=True, label_visibility="collapsed") + if selected: + # Store the index into st.session_state.files, since sorted_files may be reordered + selected_files.append(st.session_state.files.index(file)) + with col2: + file_icon = "📄" + if file['filename'].lower().endswith('.pdf'): + file_icon = "📕" + elif file['filename'].lower().endswith(('.doc', '.docx')): + file_icon = "📝" + elif file['filename'].lower().endswith(('.xls', '.xlsx')): + file_icon = "📊" + elif file['filename'].lower().endswith(('.ppt', '.pptx')): + file_icon = "🖼️" + elif file['filename'].lower().endswith(('.jpg', '.png', '.gif')): + file_icon = "🖼️" + elif file['filename'].lower().endswith(('.mp3', '.wav')): + file_icon = "🎵" + elif file['filename'].lower().endswith(('.mp4', '.avi', '.mov')): + file_icon = "🎬" + + st.markdown(f"**{file_icon} {file['filename']}**") + st.markdown(f"<span class='info-text'>{file['url'][:60]}...</span>", unsafe_allow_html=True) + with col3: + st.markdown(f"**Size:** {file['size']}") + with col4: + st.button("Preview", key=f"preview_{i}") + + st.divider() + + if not displayed_files: + st.info("No files match your current filters. 
Try adjusting your search criteria.") + + # Download options + if selected_files: + col1, col2 = st.columns(2) + with col1: + download_dir = st.text_input("Download Directory", value="downloads") + with col2: + download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True) + + download_col1, download_col2, download_col3 = st.columns([3, 1, 1]) + with download_col1: + download_button = st.button("โฌ๏ธ Download Selected Files", use_container_width=True) + with download_col2: + google_drive_button = st.button("๐ค Upload to Drive", + use_container_width=True, + disabled=not st.session_state.google_credentials) + with download_col3: + select_all = st.button("Select All Files", use_container_width=True) + + # Handle select all button + if select_all: + for i in displayed_files: + st.session_state[f"select_{i}"] = True + st.rerun() + + # Download progress/results + if st.session_state.download_complete: + st.success(f"โ Downloaded {len(st.session_state.downloaded_paths)} files successfully!") + download_links = [] + for path in st.session_state.downloaded_paths: + with open(path, "rb") as f: + file_content = f.read() + file_name = os.path.basename(path) + download_links.append((file_name, file_content)) + + if len(download_links) > 0: + if download_option == "ZIP Archive": + # Create ZIP archive for download + zip_path = create_zip_file(st.session_state.downloaded_paths, download_dir) + with open(zip_path, "rb") as f: + zip_content = f.read() + st.download_button("๐ฆ Download ZIP Archive", + zip_content, + file_name=os.path.basename(zip_path), + mime="application/zip") + else: + # Show individual file download links + st.markdown("<h4>Download Files</h4>", unsafe_allow_html=True) + + # Create a grid of download buttons + cols = st.columns(3) + for idx, (name, content) in enumerate(download_links): + mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream' + with cols[idx % 3]: + st.download_button( + f"๐ {name}", + content, + file_name=name, + mime=mime_type, + key=f"dl_{name}", + use_container_width=True + ) + + # Tab 2: Local File Search + with tabs[1]: + st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True) + st.write("Upload files to search through their content with AI-powered semantic search.") + + # File upload + uploaded_files = st.file_uploader("Upload documents for search", + accept_multiple_files=True, + type=['pdf', 'docx', 'txt', 'csv', 'json']) + + if uploaded_files: + # Build search index on upload + col1, col2 = st.columns([4, 1]) + with col1: + use_transformer = st.checkbox("Use AI Transformer Model", value=HAVE_TRANSFORMERS, + help="Uses advanced AI for more accurate semantic search (if available)") + with col2: + if st.button("Build Search Index", use_container_width=True): + with st.spinner("Processing files and building search index..."): + files_added = 0 + for uploaded_file in uploaded_files: + file_info = { + 'filename': uploaded_file.name, + 'url': f'local://{uploaded_file.name}', + 'size': humanize_file_size(uploaded_file.size) + } + success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info) + if success: + files_added += 1 + + if files_added > 0: + index_built = st.session_state.rag_search.build_index() + if index_built: + st.success(f"โ Successfully indexed {files_added} files!") + else: + st.error("Failed to build search index.") + else: + st.warning("No valid text could be extracted from the files.") + + # Search interface + st.markdown("<h3 
class='section-subheader'>Search Files</h3>", unsafe_allow_html=True) + + col1, col2 = st.columns([4, 1]) + with col1: + query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change") + with col2: + expand_query = st.checkbox("Auto-expand query", value=True, + help="Automatically add related terms to your search") + + col1, col2 = st.columns([4, 1]) + with col2: + # Read the result limit before running the search so it is actually honored + num_results = st.number_input("Max results", min_value=1, max_value=20, value=5) + with col1: + if st.button("🔍 Search Documents", use_container_width=True): + if not query: + st.warning("Please enter a search query") + else: + with st.spinner("Searching..."): + results = st.session_state.rag_search.search(query, top_k=num_results, search_chunks=True) + + if results: + st.markdown(f"**Found {len(results)} relevant documents:**") + for i, result in enumerate(results): + with st.container(): + st.markdown("<div class='result-card'>", unsafe_allow_html=True) + st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})") + + if result.get('chunk_preview'): + st.markdown("**Matching content:**") + st.text(result['chunk_preview']) + + st.markdown("</div>", unsafe_allow_html=True) + else: + st.info("No matching documents found. Try a different query.") + + # Quick search tips + with st.expander("Search Tips", expanded=False): + st.markdown(""" + ### Effective Search Tips + + - **Be specific** with your queries for more accurate results + - **Try different phrasings** if you don't get the results you expect + - Use **quotation marks** for exact phrase matching + - For **complex topics**, break down your search into multiple queries + - **Combine related terms** to improve recall + + The search engine uses advanced algorithms to understand the semantic meaning of your query, + not just keyword matching. 
+ """) + + # Tab 3: Advanced Configuration + with tabs[2]: + st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True) + + config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"]) + + # Browser Settings tab + with config_tabs[0]: + col1, col2 = st.columns(2) + with col1: + use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode, + help="Makes browser harder to detect as automated, but may be slower") + + handle_captchas = st.checkbox("Handle Captchas Automatically", value=False, + help="Attempt to solve simple captchas automatically") + + download_timeout = st.slider("Download Timeout (seconds)", + min_value=30, max_value=600, value=300, + help="Maximum time to wait for downloads to complete") + with col2: + user_agent = st.selectbox("User Agent", USER_AGENTS, index=0, + help="Browser identity to use when accessing websites") + + save_screenshots = st.checkbox("Save Browser Screenshots", value=False, + help="Save screenshots when errors occur for debugging") + + browser_lang = st.selectbox("Browser Language", + ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"], + index=0) + + if st.button("Update Browser Settings"): + st.session_state.stealth_mode = use_stealth + st.success("Browser settings updated!") + + # Dependency installation section + st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True) + if st.button("Install Playwright Dependencies"): + with st.spinner("Installing dependencies..."): + install_playwright_dependencies() + + # Proxy Configuration tab + with config_tabs[1]: + proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy, + help="Route requests through a proxy server for anonymity or bypassing restrictions") + + if proxy_enabled: + proxy_col1, proxy_col2 = st.columns(2) + with proxy_col1: + proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"]) + proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1") + with proxy_col2: + proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080") + proxy_auth = st.text_input("Proxy Authentication (optional)", + placeholder="username:password", type="password") + + st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True) + use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False, + help="Automatically rotate between multiple proxies for better anonymity") + + if use_proxy_rotation: + proxy_list = st.text_area("Proxy List (one per line)", + placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080") + rotation_interval = st.slider("Rotation Interval (requests)", + min_value=1, max_value=50, value=10, + help="How often to switch proxies") + + if st.button("Save Proxy Configuration"): + # Construct the proxy string + proxy_string = None + if proxy_enabled and proxy_host and proxy_port: + proxy_prefix = f"{proxy_type.lower()}://" + proxy_auth_str = f"{proxy_auth}@" if proxy_auth else "" + proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}" + + # Update session state + st.session_state.use_proxy = proxy_enabled + st.session_state.proxy_string = proxy_string + + # Configure proxy rotation if enabled + if use_proxy_rotation and proxy_list: + PROXY_ROTATION_CONFIG["enabled"] = True + PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval + PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()] + + 
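# The proxy string assembled above takes the form scheme://[user:pass@]host:port,
+ # e.g. http://user:pass@127.0.0.1:8080 (illustrative values only).
+ 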
st.success("Proxy configuration updated!") + + # Download Options tab + with config_tabs[2]: + col1, col2 = st.columns(2) + with col1: + st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True) + + skip_existing = st.checkbox("Skip Existing Files", value=True, + help="Don't download files that already exist locally") + + auto_rename = st.checkbox("Auto-Rename Duplicates", value=True, + help="Automatically rename files instead of overwriting") + + verify_downloads = st.checkbox("Verify Downloads", value=True, + help="Check file integrity after download") + + max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3, + help="Number of times to retry failed downloads") + + with col2: + st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True) + + auto_organize = st.checkbox("Auto-Organize Files", value=True, + help="Automatically organize files by type") + + default_dir = st.text_input("Default Download Directory", value="downloads", + help="Default location to save downloaded files") + + org_by_domain = st.checkbox("Organize by Domain", value=False, + help="Create subdirectories based on source domains") + + org_by_type = st.checkbox("Organize by File Type", value=False, + help="Create subdirectories based on file types") + + if st.button("Save Download Settings"): + st.session_state.download_settings = { + "skip_existing": skip_existing, + "auto_rename": auto_rename, + "verify_downloads": verify_downloads, + "max_retries": max_retries, + "auto_organize": auto_organize, + "default_dir": default_dir, + "org_by_domain": org_by_domain, + "org_by_type": org_by_type + } + st.success("Download settings saved!") + + # System tab + with config_tabs[3]: + col1, col2 = st.columns(2) + with col1: + st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True) + + max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3, + help="Maximum number of simultaneous downloads") + + memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024, + help="Maximum memory to use for file processing") + + processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2, + help="Number of threads to use for file processing") + + with col2: + st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True) + + log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1, + help="Detail level for application logs") + + save_debug_info = st.checkbox("Save Debug Information", value=False, + help="Save detailed information about program execution") + + log_dir = st.text_input("Log Directory", value="logs", + help="Directory to save log files") + + if st.button("Apply System Settings"): + st.session_state.system_settings = { + "max_concurrent": max_concurrent, + "memory_limit": memory_limit, + "processing_threads": processing_threads, + "log_level": log_level, + "save_debug_info": save_debug_info, + "log_dir": log_dir + } + # Update logging configuration + log_level_num = getattr(logging, log_level) + logging.getLogger().setLevel(log_level_num) + st.success("System settings applied!") + + # Reset application button + st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True) + reset_col1, reset_col2 = st.columns([1, 3]) + with reset_col1: + if st.button("Reset Application", use_container_width=True): + for key in list(st.session_state.keys()): 
+ if key != 'google_credentials': # Preserve Google auth + del st.session_state[key] + st.success("Application has been reset!") + st.rerun() + with reset_col2: + st.info("This will clear all search results, downloaded files, and reset settings to defaults.") + + # Advanced cleanup options + st.markdown("<h4 class='section-subheader'>Advanced Options</h4>", unsafe_allow_html=True) + + adv_col1, adv_col2 = st.columns(2) + with adv_col1: + clear_cache = st.button("Clear Cache", use_container_width=True) + if clear_cache: + # Clear cached files and temporary data + temp_dir = tempfile.gettempdir() + try: + for f in os.listdir(temp_dir): + if f.startswith("playwright") or f.startswith("download"): + try: + os.remove(os.path.join(temp_dir, f)) + except: + pass + st.success("Cache cleared successfully!") + except Exception as e: + st.error(f"Error clearing cache: {e}") + + with adv_col2: + export_settings = st.button("Export Settings", use_container_width=True) + if export_settings: + # Export current settings to JSON + settings = { + "mode": st.session_state.mode, + "stealth_mode": st.session_state.stealth_mode, + "use_proxy": st.session_state.use_proxy, + "proxy_string": st.session_state.proxy_string, + "custom_extensions": st.session_state.get("custom_extensions", ""), + "prioritize_pdfs": st.session_state.get("prioritize_pdfs", True), + "system_settings": st.session_state.get("system_settings", {}), + "download_settings": st.session_state.get("download_settings", {}) + } + + settings_json = json.dumps(settings, indent=2) + b64 = base64.b64encode(settings_json.encode()).decode() + href = f'data:application/json;base64,{b64}' + st.markdown(f'<a href="{href}" download="file_downloader_settings.json">Download Settings File</a>', unsafe_allow_html=True) + + # Tab 4: Help + with tabs[3]: + st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True) + + help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"]) + + with help_tabs[0]: + st.markdown(""" + ### Getting Started + + 1. **Enter a URL** on the Search & Download tab + 2. Select a **Search Method**: + - **Deep Search**: Thorough but slower + - **Quick Search**: Fast but may miss some files + - **Exam Site Mode**: Optimized for educational resource sites + 3. Click **Start Search** to find downloadable files + 4. Select files you want to download + 5. Click **Download Selected Files** + + #### Using Different Modes + + Select a mode from the sidebar to optimize the tool for different use cases: + + - **Standard Mode**: Balanced for general use + - **Education Mode**: Optimized for finding academic materials + - **Research Mode**: Better for research papers and datasets + - **Media Mode**: Enhanced for finding images, videos, and audio + + For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials. 
+ """) + + with help_tabs[1]: + st.markdown(""" + ### Advanced Features + + - **Local File Search**: Upload files and search through their content using the enhanced RAG search + - **Custom Extensions**: Specify additional file types to look for beyond the default set + - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers + - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity + - **Google Drive Integration**: Upload downloaded files directly to your Google Drive + + #### Search Tips + + - For educational sites, include specific terms like "exam", "test", "paper" in the URL + - When using Local File Search, try different variations of your query for better results + - Use filtering and sorting options to find the most relevant files quickly + + #### File Organization + + You can configure automatic file organization in the Advanced Configuration tab: + + - **Organize by Domain**: Creates folders based on the source website + - **Organize by File Type**: Separates files into folders by their extension + - **Auto-Rename**: Prevents overwriting existing files with same names + """) + + with help_tabs[2]: + st.markdown(""" + ### Troubleshooting + + #### Common Issues + + - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions + - **Downloads failing**: Check if the site requires authentication or uses captchas + - **Slow performance**: Reduce search depth or disable stealth mode for faster results + - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings + + #### Captcha Issues + + Some websites use captchas to prevent automated access. If you encounter captchas: + + 1. Try using a different proxy + 2. Enable "Handle Captchas Automatically" for simple captchas + 3. For complex captchas, you may need to manually access the site first + + #### Proxy Problems + + If you're having issues with proxies: + + 1. Verify your proxy is working with an external tool + 2. Check that you've entered the correct format (http://host:port) + 3. Some websites may block known proxy IPs + + #### Memory Usage + + If the application is using too much memory: + + 1. Reduce the "Memory Limit" in System settings + 2. Process fewer files at once + 3. Use lower search depth values + """) + + with help_tabs[3]: + st.markdown(""" + ### About This Tool + + **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources. + + #### Key Features + + - **Smart Discovery**: Finds downloadable files even when they're not directly linked + - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques + - **Educational Focus**: Specialized detection for exam papers and academic resources + - **Stealth Capabilities**: Avoids detection by anti-scraping measures + + #### Technical Details + + This tool uses: + + - **Playwright**: For browser automation and stealth capabilities + - **Sentence Transformers**: For AI-powered semantic search + - **Streamlit**: For the user interface + - **Google Drive API**: For cloud integration + + #### Credits + + Created with Python, Streamlit, Playwright, and various AI libraries. + + For issues or suggestions, please contact the developer. 
+ + Version 2.0 - March 2025 + """) + + # Handle search and download actions + if search_button and url: + # Reset files and downloaded paths + st.session_state.files = [] + st.session_state.downloaded_paths = [] + st.session_state.download_complete = False + + # Clear the preset URL if it was used + if 'preset_url' in st.session_state: + st.session_state.preset_url = '' + + # Prepare custom extensions + custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()] + + # Configure proxy from session state + proxy_string = st.session_state.proxy_string if st.session_state.use_proxy else None + + # Set up proxy rotation if enabled + if 'use_proxy_rotation' in locals() and use_proxy_rotation and proxy_list: + PROXY_ROTATION_CONFIG["enabled"] = True + PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval + PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()] + + # Configure search parameters based on method + sublink_limit = 5000 if search_method == "Deep Search" else 1000 + search_depth = depth if search_method == "Deep Search" else 1 + is_exam_site = search_method == "Exam Site Mode" + + # Execute the search asynchronously + async def run_search(): + async with DownloadManager( + use_proxy=st.session_state.use_proxy, + proxy=proxy_string, + use_stealth=st.session_state.stealth_mode, + proxy_rotation=PROXY_ROTATION_CONFIG["enabled"] + ) as manager: + # For exam sites, use specialized approach + if is_exam_site: + st.session_state.keep_progress = True + edu_links = await manager.get_edu_exam_links(url) + all_files = [] + + progress_text = st.empty() + progress_bar = st.progress(0) + + # Process each exam link + for i, link in enumerate(edu_links): + progress = (i+1) / max(1, len(edu_links)) + progress_text.text(f"Processing exam link {i+1}/{len(edu_links)}: {link}") + progress_bar.progress(progress) + + files = await manager.extract_downloadable_files(link, custom_ext_list) + all_files.extend(files) + + st.session_state.files = all_files + progress_text.empty() + progress_bar.empty() + st.session_state.keep_progress = False + + else: + # Use general search method + files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout) + st.session_state.files = files + + # Run the search + asyncio.run(run_search()) + st.rerun() + + # Handle download button + if 'download_button' in locals() and download_button and selected_files: + # Create download directory + os.makedirs(download_dir, exist_ok=True) + + # Reset download state + st.session_state.downloaded_paths = [] + st.session_state.download_complete = False + + # Get selected files + files_to_download = [st.session_state.files[i] for i in selected_files] + + # Execute the download asynchronously + async def run_download(): + async with DownloadManager( + use_proxy=st.session_state.use_proxy, + proxy=st.session_state.proxy_string, + use_stealth=st.session_state.stealth_mode + ) as manager: + download_progress = st.progress(0) + status_text = st.empty() + + for i, file_info in enumerate(files_to_download): + progress = (i) / len(files_to_download) + status_text.text(f"Downloading {i+1}/{len(files_to_download)}: {file_info['filename']}") + download_progress.progress(progress) + + downloaded_path = await manager.download_file( + file_info, + download_dir, + get_domain(file_info['url']) + ) + + if downloaded_path: + st.session_state.downloaded_paths.append(downloaded_path) + + download_progress.progress(1.0) + status_text.text(f"Downloaded 
{len(st.session_state.downloaded_paths)}/{len(files_to_download)} files successfully!") + st.session_state.download_complete = True + + # Run the download + asyncio.run(run_download()) + st.rerun() + + # Handle Google Drive upload + if 'google_drive_button' in locals() and google_drive_button and st.session_state.google_credentials and st.session_state.downloaded_paths: + with st.spinner("Uploading to Google Drive..."): + drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_credentials) + + # Create folder if it doesn't exist + folder_id = None + folder_name = st.session_state.drive_folder if 'drive_folder' in st.session_state else "File Downloader" + + # Check if folder exists + query = f"name='{folder_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false" + results = drive_service.files().list(q=query, spaces='drive', fields='files(id)').execute() + items = results.get('files', []) + + if not items: + # Create folder + folder_id = create_drive_folder(drive_service, folder_name) + else: + folder_id = items[0]['id'] + + # Upload each file + upload_progress = st.progress(0) + status_text = st.empty() + uploaded_count = 0 + + for i, path in enumerate(st.session_state.downloaded_paths): + progress = i / len(st.session_state.downloaded_paths) + status_text.text(f"Uploading {i+1}/{len(st.session_state.downloaded_paths)}: {os.path.basename(path)}") + upload_progress.progress(progress) + + result = google_drive_upload(path, st.session_state.google_credentials, folder_id) + if isinstance(result, str) and not result.startswith("Error"): + uploaded_count += 1 + + upload_progress.progress(1.0) + status_text.text(f"Uploaded {uploaded_count}/{len(st.session_state.downloaded_paths)} files to Google Drive folder '{folder_name}'") + + st.success(f"โ Files uploaded to Google Drive successfully!") + + # Handle clear button + if clear_button: + st.session_state.files = [] + st.session_state.downloaded_paths = [] + st.session_state.download_complete = False + if 'preset_url' in st.session_state: + st.session_state.preset_url = '' + st.rerun() if __name__ == "__main__": main() \ No newline at end of file