diff --git "a/app.py" "b/app.py" deleted file mode 100644--- "a/app.py" +++ /dev/null @@ -1,4816 +0,0 @@ -import streamlit as st -st.set_page_config(page_title="Advanced File Downloader", layout="wide", page_icon="๐Ÿ“") - - -# Core imports -import os -import subprocess -from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError -import asyncio -import logging -from urllib.parse import urlparse, urljoin, unquote, parse_qs, quote -import re -from pathlib import Path -from io import BytesIO -import random -from bs4 import BeautifulSoup -from PyPDF2 import PdfReader -import zipfile -import tempfile -import mimetypes -import requests -import datetime -import traceback -import base64 -import shutil -import json -import time -from PIL import Image -from reportlab.lib.pagesizes import letter -from reportlab.pdfgen import canvas -import google_auth_oauthlib.flow -import googleapiclient.discovery -import google.auth.transport.requests -import googleapiclient.http - -# Enhanced RAG search imports -import nltk -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity -import numpy as np -import docx2txt - -# Try to import sentence-transformers for better embeddings -try: - from sentence_transformers import SentenceTransformer - HAVE_TRANSFORMERS = True -except ImportError: - HAVE_TRANSFORMERS = False - -# Try to download NLTK data if not already present -try: - nltk.data.find('tokenizers/punkt') -except LookupError: - try: - nltk.download('punkt', quiet=True) - except: - pass - -try: - nltk.data.find('corpora/stopwords') -except LookupError: - try: - nltk.download('stopwords', quiet=True) - from nltk.corpus import stopwords - STOPWORDS = set(stopwords.words('english')) - except: - STOPWORDS = set(['the', 'and', 'a', 'in', 'to', 'of', 'is', 'it', 'that', 'for', 'with', 'as', 'on', 'by']) - -# -------------------- Logging Setup -------------------- -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# -------------------- Google OAuth Config -------------------- -GOOGLE_OAUTH_CONFIG = { - "web": { - "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com", - "project_id": "huggingface-449214", - "auth_uri": "https://accounts.google.com/o/oauth2/auth", - "token_uri": "https://oauth2.googleapis.com/token", - "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", - "client_secret": "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", - "redirect_uris": ["https://euler314-craw-web.hf.space/"] - } -} - -# -------------------- Stealth and UA Settings -------------------- -# Extended user agent list for better variety -USER_AGENTS = [ - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54', - 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', - 'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) 
AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0' -] - -# Stealth browser settings -STEALTH_SETTINGS = { - # Hardware features to modify/disable - "hardware_concurrency": 4, - "device_memory": 8, - # Browser features to enable/disable - "webgl_vendor": "Google Inc. (Intel)", - "webgl_renderer": "Intel Iris OpenGL Engine", - "languages": ["en-US", "en"], - "disable_webrtc": True, - # Additional timing randomization - "navigator_platform": "Win32", - "touch_support": False -} - -# Proxy rotation configuration (if using multiple proxies) -PROXY_ROTATION_CONFIG = { - "enabled": False, # Set to True to enable rotation - "rotation_interval": 10, # Rotate every 10 requests - "proxies": [] # Will be populated from the UI if needed -} - -# -------------------- Enhanced RAG Search with Small LLM -------------------- -class EnhancedRAGSearch: - def __init__(self): - self.file_texts = [] - self.chunks = [] # Document chunks for more targeted search - self.chunk_metadata = [] # Metadata for each chunk - self.file_metadata = [] - self.languages = [] - self.model = None - - # Try to load the sentence transformer model if available - if HAVE_TRANSFORMERS: - try: - # Use a small, efficient model - self.model = SentenceTransformer('all-MiniLM-L6-v2') - self.use_transformer = True - logger.info("Using sentence-transformers for RAG") - except Exception as e: - logger.warning(f"Error loading sentence-transformer: {e}") - self.use_transformer = False - else: - self.use_transformer = False - - # Fallback to TF-IDF if transformers not available - if not self.use_transformer: - self.vectorizer = TfidfVectorizer( - stop_words='english', - ngram_range=(1, 2), # Use bigrams for better context - max_features=15000, # Use more features for better representation - min_df=1 # Include rare terms - ) - - self.vectors = None - self.chunk_vectors = None - - def add_file(self, file_data, file_info): - """Add a file to the search index with improved processing""" - file_ext = os.path.splitext(file_info['filename'])[1].lower() - text = self.extract_text(file_data, file_ext) - - if text: - # Store the whole document text - self.file_texts.append(text) - self.file_metadata.append(file_info) - - # Try to detect language - try: - # Simple language detection based on stopwords - words = re.findall(r'\b\w+\b', text.lower()) - english_stopwords_ratio = len([w for w in words[:100] if w in STOPWORDS]) / max(1, len(words[:100])) - lang = 'en' if english_stopwords_ratio > 0.2 else 'unknown' - self.languages.append(lang) - except: - self.languages.append('en') # Default to English - - # Create chunks for more granular search - chunks = self.create_chunks(text) - for chunk in chunks: - self.chunks.append(chunk) - self.chunk_metadata.append({ - 'file_info': file_info, - 'chunk_size': len(chunk), - 'file_index': len(self.file_texts) - 1 - }) - - return True - return False - - def create_chunks(self, text, chunk_size=1000, overlap=200): - """Split text into overlapping chunks for better search precision""" - # Try to use NLTK for sentence-aware chunking - try: - sentences = nltk.sent_tokenize(text) - chunks = [] - current_chunk = "" - - for sentence in sentences: - if len(current_chunk) + len(sentence) <= chunk_size: - current_chunk += sentence + " " - else: - # 
Add current chunk if it has content
-                    if current_chunk:
-                        chunks.append(current_chunk.strip())
-
-                    # Start new chunk with overlap from previous chunk
-                    if len(current_chunk) > overlap:
-                        # Find the last space within the overlap region
-                        overlap_text = current_chunk[-overlap:]
-                        last_space = overlap_text.rfind(' ')
-                        if last_space != -1:
-                            current_chunk = current_chunk[-(overlap-last_space):] + sentence + " "
-                        else:
-                            current_chunk = sentence + " "
-                    else:
-                        current_chunk = sentence + " "
-
-            # Add the last chunk if it has content
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-
-            return chunks
-        except:
-            # Fallback to simpler chunking approach
-            chunks = []
-            for i in range(0, len(text), chunk_size - overlap):
-                chunk = text[i:i + chunk_size]
-                if chunk:
-                    chunks.append(chunk)
-            return chunks
-
-    def extract_text(self, file_data, file_ext):
-        """Extract text from different file types with enhanced support"""
-        try:
-            if file_ext.lower() == '.pdf':
-                reader = PdfReader(BytesIO(file_data))
-                text = ""
-                for page in reader.pages:
-                    extracted = page.extract_text()
-                    if extracted:
-                        text += extracted + "\n"
-                # If text extraction fails, try to OCR (would need extra libraries)
-                return text
-            elif file_ext.lower() in ['.docx', '.doc']:
-                return docx2txt.process(BytesIO(file_data))
-            elif file_ext.lower() in ['.txt', '.csv', '.json', '.html', '.htm']:
-                # Handle both UTF-8 and other common encodings
-                try:
-                    return file_data.decode('utf-8', errors='ignore')
-                except:
-                    encodings = ['latin-1', 'iso-8859-1', 'windows-1252']
-                    for enc in encodings:
-                        try:
-                            return file_data.decode(enc, errors='ignore')
-                        except:
-                            pass
-                    # Last resort fallback
-                    return file_data.decode('utf-8', errors='ignore')
-            elif file_ext.lower() in ['.pptx', '.ppt', '.xlsx', '.xls']:
-                # For these types, we would need additional libraries
-                # For now, return a placeholder message
-                return f"[Content of {file_ext} file - install additional libraries for full text extraction]"
-            else:
-                return ""
-        except Exception as e:
-            logger.error(f"Error extracting text: {e}")
-            return ""
-
-    def build_index(self):
-        """Build both document and chunk search indices"""
-        if not self.file_texts:
-            return False
-
-        try:
-            if self.use_transformer:
-                # Use sentence transformer models for embeddings
-                logger.info("Building document and chunk embeddings with transformer model...")
-                self.vectors = self.model.encode(self.file_texts, show_progress_bar=False)
-
-                # Build chunk-level index if we have chunks
-                if self.chunks:
-                    # Process in batches to avoid memory issues
-                    batch_size = 32
-                    chunk_vectors = []
-                    for i in range(0, len(self.chunks), batch_size):
-                        batch = self.chunks[i:i+batch_size]
-                        batch_vectors = self.model.encode(batch, show_progress_bar=False)
-                        chunk_vectors.append(batch_vectors)
-                    self.chunk_vectors = np.vstack(chunk_vectors)
-            else:
-                # Build document-level index
-                self.vectors = self.vectorizer.fit_transform(self.file_texts)
-
-                # Build chunk-level index if we have chunks
-                if self.chunks:
-                    self.chunk_vectors = self.vectorizer.transform(self.chunks)
-
-            return True
-        except Exception as e:
-            logger.error(f"Error building search index: {e}")
-            return False
-
-    def expand_query(self, query):
-        """Add related terms to query for better recall - mini LLM function"""
-        # Dictionary of related terms for common keywords
-        expansions = {
-            "exam": ["test", "assessment", "quiz", "paper", "exam paper", "past paper", "past exam"],
-            "test": ["exam", "quiz", "assessment", "paper"],
-            "document": ["file", "paper",
"report", "doc", "documentation"], - "manual": ["guide", "instruction", "documentation", "handbook"], - "tutorial": ["guide", "instructions", "how-to", "lesson"], - "article": ["paper", "publication", "journal", "research"], - "research": ["study", "investigation", "paper", "analysis"], - "book": ["textbook", "publication", "volume", "edition"], - "thesis": ["dissertation", "paper", "research", "study"], - "report": ["document", "paper", "analysis", "summary"], - "assignment": ["homework", "task", "project", "work"], - "lecture": ["class", "presentation", "talk", "lesson"], - "notes": ["annotations", "summary", "outline", "study material"], - "syllabus": ["curriculum", "course outline", "program", "plan"], - "paper": ["document", "article", "publication", "exam", "test"], - "question": ["problem", "query", "exercise", "inquiry"], - "solution": ["answer", "resolution", "explanation", "result"], - "reference": ["source", "citation", "bibliography", "resource"], - "analysis": ["examination", "study", "evaluation", "assessment"], - "guide": ["manual", "instruction", "handbook", "tutorial"], - "worksheet": ["exercise", "activity", "handout", "practice"], - "review": ["evaluation", "assessment", "critique", "feedback"], - "material": ["resource", "content", "document", "information"], - "data": ["information", "statistics", "figures", "numbers"] - } - - # Enhanced query expansion simulating a mini-LLM - query_words = re.findall(r'\b\w+\b', query.lower()) - expanded_terms = set() - - # Directly add expansions from our dictionary - for word in query_words: - if word in expansions: - expanded_terms.update(expansions[word]) - - # Add common academic file formats if not already included - if any(term in query.lower() for term in ["file", "document", "download", "paper"]): - if not any(ext in query.lower() for ext in ["pdf", "docx", "ppt", "excel"]): - expanded_terms.update(["pdf", "docx", "pptx", "xlsx"]) - - # Add special academic terms when the query seems related to education - if any(term in query.lower() for term in ["course", "university", "college", "school", "class"]): - expanded_terms.update(["syllabus", "lecture", "notes", "textbook"]) - - # Return original query plus expanded terms - if expanded_terms: - expanded_query = f"{query} {' '.join(expanded_terms)}" - logger.info(f"Expanded query: '{query}' -> '{expanded_query}'") - return expanded_query - return query - - def search(self, query, top_k=5, search_chunks=True): - """Enhanced search with both document and chunk-level search""" - if self.vectors is None: - return [] - - # Simulate a small LLM by expanding the query with related terms - expanded_query = self.expand_query(query) - - try: - results = [] - - if self.use_transformer: - # Transform the query to embedding - query_vector = self.model.encode([expanded_query])[0] - - # First search at document level for higher-level matches - if self.vectors is not None: - # Compute similarities between query and documents - doc_similarities = cosine_similarity( - query_vector.reshape(1, -1), - self.vectors - ).flatten() - - top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] - - for i, idx in enumerate(top_doc_indices): - if doc_similarities[idx] > 0.2: # Threshold to exclude irrelevant results - results.append({ - 'file_info': self.file_metadata[idx], - 'score': float(doc_similarities[idx]), - 'rank': i+1, - 'match_type': 'document', - 'language': self.languages[idx] if idx < len(self.languages) else 'unknown' - }) - - # Then search at chunk level for more specific matches if enabled 
- if search_chunks and self.chunk_vectors is not None: - # Compute similarities between query and chunks - chunk_similarities = cosine_similarity( - query_vector.reshape(1, -1), - self.chunk_vectors - ).flatten() - - top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results - - # Use a set to avoid duplicate file results - seen_files = set(r['file_info']['url'] for r in results) - - for i, idx in enumerate(top_chunk_indices): - if chunk_similarities[idx] > 0.25: # Higher threshold for chunks - file_index = self.chunk_metadata[idx]['file_index'] - file_info = self.file_metadata[file_index] - - # Only add if we haven't already included this file - if file_info['url'] not in seen_files: - seen_files.add(file_info['url']) - results.append({ - 'file_info': file_info, - 'score': float(chunk_similarities[idx]), - 'rank': len(results) + 1, - 'match_type': 'chunk', - 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', - 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx] - }) - - # Stop after we've found enough results - if len(results) >= top_k*1.5: - break - else: - # Fallback to TF-IDF if transformers not available - query_vector = self.vectorizer.transform([expanded_query]) - - # First search at document level - if self.vectors is not None: - doc_similarities = cosine_similarity(query_vector, self.vectors).flatten() - top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] - - for i, idx in enumerate(top_doc_indices): - if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results - results.append({ - 'file_info': self.file_metadata[idx], - 'score': float(doc_similarities[idx]), - 'rank': i+1, - 'match_type': 'document', - 'language': self.languages[idx] if idx < len(self.languages) else 'unknown' - }) - - # Then search at chunk level if enabled - if search_chunks and self.chunk_vectors is not None: - chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten() - top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] - - # Avoid duplicates - seen_files = set(r['file_info']['url'] for r in results) - - for i, idx in enumerate(top_chunk_indices): - if chunk_similarities[idx] > 0.15: - file_index = self.chunk_metadata[idx]['file_index'] - file_info = self.file_metadata[file_index] - - if file_info['url'] not in seen_files: - seen_files.add(file_info['url']) - results.append({ - 'file_info': file_info, - 'score': float(chunk_similarities[idx]), - 'rank': len(results) + 1, - 'match_type': 'chunk', - 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', - 'chunk_preview': self.chunks[idx][:200] + "..." 
if len(self.chunks[idx]) > 200 else self.chunks[idx] - }) - - if len(results) >= top_k*1.5: - break - - # Sort combined results by score - results.sort(key=lambda x: x['score'], reverse=True) - - # Re-rank and truncate - for i, result in enumerate(results[:top_k]): - result['rank'] = i+1 - - return results[:top_k] - except Exception as e: - logger.error(f"Error during search: {e}") - return [] - -# -------------------- Utility Functions -------------------- -def get_random_user_agent(): - return random.choice(USER_AGENTS) - -def sizeof_fmt(num, suffix='B'): - for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: - if abs(num) < 1024.0: - return f"{num:3.1f}{unit}{suffix}" - num /= 1024.0 - return f"{num:.1f}Y{suffix}" - -def create_zip_file(file_paths, output_dir): - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip") - with zipfile.ZipFile(zip_path, 'w') as zipf: - for file_path in file_paths: - zipf.write(file_path, os.path.basename(file_path)) - return zip_path - -def get_file_extension(url, default='.pdf'): - """Extract file extension from URL or filename""" - path = urlparse(url).path - ext = os.path.splitext(path)[1].lower() - if not ext: - return default - return ext - -def humanize_file_size(size_bytes): - """Format file size in human-readable format""" - if size_bytes < 1024: - return f"{size_bytes} bytes" - for unit in ['KB', 'MB', 'GB', 'TB']: - size_bytes /= 1024.0 - if size_bytes < 1024.0: - return f"{size_bytes:.1f} {unit}" - return f"{size_bytes:.1f} PB" - -def get_domain(url): - """Extract domain from URL""" - parsed = urlparse(url) - return parsed.netloc - -def is_valid_file_url(url, extensions): - """Check if URL is a valid file URL based on extension""" - return any(url.lower().endswith(ext) for ext in extensions) - -def detect_captcha(html_content): - """Detect common captcha patterns in HTML content""" - captcha_patterns = [ - 'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile', - 'challenge', 'solve the following', 'verify you are human' - ] - html_lower = html_content.lower() - return any(pattern in html_lower for pattern in captcha_patterns) - -def is_download_link(url): - """Enhanced function to detect if a URL is likely a download link""" - # Check for obvious download indicators in URL - url_lower = url.lower() - - # Check for common download-related terms in the URL - download_terms = [ - 'download', 'dl', 'get', 'file', 'attachment', 'export', 'view', - 'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document' - ] - if any(term in url_lower for term in download_terms): - return True - - # Check for common download script patterns - script_patterns = [ - 'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php', - 'download.aspx', 'getfile.aspx', 'file.aspx', - 'downloadhandler', 'filehandler', 'filedownload', - 'download.jsp', 'download.cgi', 'download.do', - 'download-file', 'get-file', - 'downloadfile', 'getfile', 'viewfile', - 'Action=downloadfile', 'action=download', 'action=view', - 'download?', 'file?', 'get?', 'view?' 
- ] - if any(pattern in url_lower for pattern in script_patterns): - return True - - # Check for common file extensions in the URL path or parameters - path = urlparse(url).path - common_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', - '.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg', - '.png', '.gif', '.mp3', '.mp4', '.avi', '.mov'] - - if any(ext in path.lower() for ext in common_extensions): - return True - - # Check for file ID or file parameters in URL - params = parse_qs(urlparse(url).query) - param_keys = params.keys() - file_param_indicators = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid'] - if any(key.lower() in file_param_indicators for key in param_keys): - return True - - # Check for complex encoding patterns like in the example URL - if 'Action=downloadfile' in url or 'fname=' in url: - return True - - return False - -def normalize_download_url(url): - """Normalize download URLs to handle various formats and encodings""" - try: - # Handle common URL shorteners and redirections - parsed = urlparse(url) - - # Handle phpMyAdmin-style encoded URLs - if 'Action=downloadfile' in url and 'file=' in url: - # Extract the encoded file parameter - params = parse_qs(parsed.query) - if 'file' in params: - # This is just a placeholder - in a real implementation, - # you would need to handle the specific encoding used - encoded_file = params['file'][0] - # Keep the URL as is for now, since we'll handle it during download - return url - - # Handle URLs with fname parameter (like in the example) - if 'fname=' in url: - # Keep as is - we'll handle this specially during download - return url - - # For other URLs, make sure they are properly quoted - path = parsed.path - # Only quote the path portion if needed - if '%' not in path and ' ' in path: - path = quote(path) - - # Reconstruct the URL - normalized = parsed._replace(path=path).geturl() - return normalized - except Exception as e: - logger.error(f"Error normalizing URL {url}: {e}") - return url - -# -------------------- Google Drive Functions -------------------- -def get_google_auth_url(): - client_config = GOOGLE_OAUTH_CONFIG["web"] - flow = google_auth_oauthlib.flow.Flow.from_client_config( - {"web": client_config}, - scopes=["https://www.googleapis.com/auth/drive.file"] - ) - flow.redirect_uri = client_config["redirect_uris"][0] - authorization_url, _ = flow.authorization_url( - access_type="offline", - include_granted_scopes="true", - prompt="consent" - ) - return authorization_url - -def exchange_code_for_credentials(auth_code): - if not auth_code.strip(): - return None, "No code provided." - try: - client_config = GOOGLE_OAUTH_CONFIG["web"] - flow = google_auth_oauthlib.flow.Flow.from_client_config( - {"web": client_config}, - scopes=["https://www.googleapis.com/auth/drive.file"] - ) - flow.redirect_uri = client_config["redirect_uris"][0] - flow.fetch_token(code=auth_code.strip()) - creds = flow.credentials - if not creds or not creds.valid: - return None, "Could not validate credentials. Check code and try again." - return creds, "Google Sign-In successful!" 
- except Exception as e: - return None, f"Error during token exchange: {e}" - -def google_drive_upload(file_path, credentials, folder_id=None): - try: - drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials) - file_metadata = {'name': os.path.basename(file_path)} - if folder_id: - file_metadata['parents'] = [folder_id] - media = googleapiclient.http.MediaFileUpload(file_path, resumable=True) - created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute() - return created.get("id", "") - except Exception as e: - return f"Error uploading to Drive: {str(e)}" - -def create_drive_folder(drive_service, name): - folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'} - folder = drive_service.files().create(body=folder_metadata, fields='id').execute() - return folder.get('id') - -# -------------------- Playwright Setup -------------------- -def install_playwright_dependencies(): - try: - # Set environment variable for Playwright browsers path - os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright") - - # Install system dependencies - subprocess.run(['apt-get', 'update', '-y'], check=True) - packages = [ - 'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0', - 'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1', - 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0' - ] - subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True) - - # Install Playwright and dependencies - subprocess.run(['pip', 'install', 'playwright'], check=True) - subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True) - - st.success("Playwright dependencies installed successfully!") - except Exception as e: - st.error(f"Error installing Playwright dependencies: {e}") - st.info("You may need to manually install dependencies. 
Check console for details.") - logger.error(f"Playwright setup error: {e}") - traceback.print_exc() - -# -------------------- Download Manager Class -------------------- -class DownloadManager: - def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False): - self.use_proxy = use_proxy - self.proxy = proxy - self.query = query - self.num_results = num_results - self.playwright = None - self.browser = None - self.context = None - self.page = None - self.use_stealth = use_stealth - self.proxy_rotation = proxy_rotation - self.request_count = 0 - self.captcha_detected = False - self.download_timeout = 300 # 5 minutes timeout for downloads - # Track visited URLs to avoid revisiting the same URL multiple times - self.visited_urls = set() - # Track successfully downloaded files to avoid redownloading - self.downloaded_files = set() - - async def __aenter__(self): - self.playwright = await async_playwright().start() - - # Prepare browser args with stealth settings - browser_args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - '--no-zygote', - '--single-process', - '--disable-web-security', - '--disable-features=IsolateOrigins', - '--disable-site-isolation-trials' - ] - - # Add stealth-specific args - if self.use_stealth: - browser_args.extend([ - '--disable-blink-features=AutomationControlled', - '--disable-features=IsolateOrigins,site-per-process', - '--disable-webgl', - '--disable-webrtc' - ]) - - # Setup browser options - opts = { - "headless": True, - "args": browser_args - } - - # Configure proxy if specified - if self.use_proxy and self.proxy: - opts["proxy"] = {"server": self.proxy} - - # Launch browser with options - self.browser = await self.playwright.chromium.launch(**opts) - - # Setup browser context with enhanced settings - context_opts = { - "user_agent": get_random_user_agent(), - "viewport": {"width": 1920, "height": 1080}, - "device_scale_factor": 1, - "has_touch": False, - "is_mobile": False, - "ignore_https_errors": True, - "accept_downloads": True - } - - # Apply stealth-specific settings to the context - if self.use_stealth: - # Apply JS-injection for enhanced stealth - context_opts["bypass_csp"] = True - self.context = await self.browser.new_context(**context_opts) - - # Execute stealth JS to avoid detection - await self.context.add_init_script(""" - () => { - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - - // Change navigator properties - const newProto = navigator.__proto__; - delete newProto.webdriver; - - // Overwrite the plugins - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ - lengthComputable: true, - loaded: 100, - total: 100 - })) - }); - - // Handle languages more naturally - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en', 'es'] - }); - - // Modify hardware concurrency - Object.defineProperty(navigator, 'hardwareConcurrency', { - get: () => 4 - }); - - // Modify deviceMemory - Object.defineProperty(navigator, 'deviceMemory', { - get: () => 8 - }); - - // WebGL modifications - const getParameter = WebGLRenderingContext.prototype.getParameter; - WebGLRenderingContext.prototype.getParameter = function(parameter) { - if (parameter === 37445) { - return 'Intel Inc.'; - } - if (parameter === 37446) { - return 'Intel Iris OpenGL Engine'; - } - return getParameter.apply(this, arguments); - }; - } - """) - else: - # Regular context without stealth - self.context = await 
self.browser.new_context(**context_opts) - - # Create page with enhanced headers - self.page = await self.context.new_page() - await self.page.set_extra_http_headers({ - 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Cache-Control': 'max-age=0', - 'DNT': '1', # Do Not Track - 'Referer': 'https://www.google.com/', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'cross-site', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1' - }) - - # Add delay for mouse movements to simulate human behavior - if self.use_stealth: - await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500)) - await self.page.wait_for_timeout(random.randint(200, 500)) - - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - if self.browser: - await self.browser.close() - if self.playwright: - await self.playwright.stop() - - async def rotate_proxy_if_needed(self): - """Rotate proxy if proxy rotation is enabled and threshold is reached""" - if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]: - self.request_count += 1 - if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]: - # Get next proxy from the pool - next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0) - PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list - - # Close existing context and create new one with the new proxy - if self.context: - await self.context.close() - - # Create new context with the new proxy - context_opts = { - "user_agent": get_random_user_agent(), - "proxy": {"server": next_proxy}, - "accept_downloads": True - } - self.context = await self.browser.new_context(**context_opts) - self.page = await self.context.new_page() - - # Reset counter - self.request_count = 0 - logger.info(f"Rotated to new proxy: {next_proxy}") - - async def handle_captcha(self, page): - """Detect and handle captchas if possible""" - # Check for common captcha patterns - content = await page.content() - if detect_captcha(content): - self.captcha_detected = True - logger.warning("Captcha detected on page") - - # Strategies for handling captchas: - # 1. For simple captchas, try to extract the image and solve it - captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]') - if captcha_img: - logger.info("Found captcha image, attempting to capture") - - # Take screenshot of the captcha - captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png") - await captcha_img.screenshot(path=captcha_path) - - # In a real implementation, you would send this to a captcha solving service - # For now, just log the detection - logger.info(f"Captcha image saved to {captcha_path}") - - # For demonstration, we'll notify the user but not actually solve it - return False - - # 2. For reCAPTCHA, special handling would be required - recaptcha = await page.query_selector('iframe[src*="recaptcha"]') - if recaptcha: - logger.warning("reCAPTCHA detected, would require external solving service") - return False - - # 3. 
Try to perform human-like actions that might bypass simple bot checks - await self.perform_human_actions(page) - - # Check if captcha is still present - content = await page.content() - if detect_captcha(content): - logger.warning("Captcha still present after human-like actions") - return False - else: - logger.info("Captcha appears to be resolved") - return True - - return True # No captcha detected - - async def perform_human_actions(self, page): - """Perform human-like actions on the page to possibly bypass simple bot checks""" - try: - # 1. Slowly scroll down the page - for i in range(3): - await page.evaluate(f"window.scrollTo(0, {i * 300})") - await page.wait_for_timeout(random.randint(300, 700)) - - # 2. Random mouse movements - for _ in range(3): - x = random.randint(100, 800) - y = random.randint(100, 600) - await page.mouse.move(x=x, y=y) - await page.wait_for_timeout(random.randint(200, 500)) - - # 3. Click on a non-essential part of the page - try: - await page.click("body", position={"x": 50, "y": 50}) - except: - pass - - # 4. Wait a bit before continuing - await page.wait_for_timeout(1000) - - except Exception as e: - logger.warning(f"Error during human-like actions: {e}") - - async def search_bing(self): - urls = [] - try: - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - search_url = f"https://www.bing.com/search?q={self.query}" - await self.page.goto(search_url, timeout=30000) - await self.page.wait_for_load_state('networkidle') - - # Check for captchas - if not await self.handle_captcha(self.page): - logger.warning("Captcha detected during search, results may be limited") - - # More natural scrolling behavior - for i in range(3): - await self.page.evaluate(f"window.scrollTo(0, {i * 400})") - await self.page.wait_for_timeout(random.randint(300, 800)) - - # Extract search results - links = await self.page.query_selector_all("li.b_algo h2 a") - for link in links[:self.num_results]: - href = await link.get_attribute('href') - if href: - urls.append(href) - - # If we didn't find enough results, try an alternative selector - if len(urls) < self.num_results: - alt_links = await self.page.query_selector_all(".b_caption a") - for link in alt_links: - href = await link.get_attribute('href') - if href and href not in urls: - urls.append(href) - if len(urls) >= self.num_results: - break - - return urls - except Exception as e: - logger.error(f"Error searching Bing: {e}") - return [] - - async def get_file_size(self, url): - try: - await self.rotate_proxy_if_needed() - - # For complex download URLs, we need to be careful with HEAD requests - if '?' 
in url or 'Action=downloadfile' in url or 'fname=' in url: - # For these URLs, we'll try a more reliable approach using range headers - headers = { - 'User-Agent': get_random_user_agent(), - 'Range': 'bytes=0-0' # Just request the first byte to check headers - } - - try: - with requests.get(url, headers=headers, stream=True, timeout=10) as r: - if 'Content-Range' in r.headers: - content_range = r.headers['Content-Range'] - match = re.search(r'bytes 0-0/(\d+)', content_range) - if match: - size = int(match.group(1)) - return sizeof_fmt(size) - - if 'Content-Length' in r.headers: - size = int(r.headers['Content-Length']) - # If size is 1, it's likely just our single requested byte - if size > 1: - return sizeof_fmt(size) - except Exception as e: - logger.warning(f"Error getting file size with Range request: {e}") - - # Fallback to browser approach - try: - async with self.context.new_page() as page: - response = await page.request.head(url, timeout=15000) - length = response.headers.get('Content-Length', None) - if length: - return sizeof_fmt(int(length)) - except Exception as e: - logger.warning(f"Error getting file size with browser: {e}") - - return "Unknown Size" - else: - # Standard approach for normal URLs - async with self.context.new_page() as page: - response = await page.request.head(url, timeout=15000) - length = response.headers.get('Content-Length', None) - if length: - return sizeof_fmt(int(length)) - else: - return "Unknown Size" - except Exception as e: - logger.warning(f"Error getting file size: {e}") - return "Unknown Size" - - async def get_pdf_metadata(self, url): - try: - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - resp = await page.request.get(url, timeout=15000) - if resp.ok: - content = await resp.body() - pdf = BytesIO(content) - reader = PdfReader(pdf) - return { - 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', - 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', - 'Pages': len(reader.pages), - } - else: - return {} - except Exception as e: - logger.warning(f"Error reading PDF metadata: {e}") - return {} - - async def extract_real_download_url(self, url): - """Enhanced method to extract real download URL, handling complex URLs""" - try: - # Check if this is a complex download URL that needs special handling - if 'Action=downloadfile' in url or 'fname=' in url: - logger.info(f"Complex download URL detected: {url}") - - # For these special cases, we'll use the browser to navigate and intercept redirects - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - # Set up request interception to capture redirects - await page.route('**', lambda route: route.continue_()) - - # Listen for all responses - responses = [] - page.on('response', lambda response: responses.append(response)) - - try: - # Go to the URL - await page.goto(url, wait_until='networkidle', timeout=30000) - - # Check all responses for potential downloads - for response in responses: - # Look for content-disposition headers indicating a download - content_disposition = response.headers.get('Content-Disposition', '') - if 'attachment' in content_disposition or 'filename=' in content_disposition: - return response.url - - # Look for content-type headers indicating a file - content_type = response.headers.get('Content-Type', '') - if content_type and content_type != 'text/html' and not content_type.startswith('text/'): - return response.url - - # If no clear download was detected, 
return the final URL - return page.url - except Exception as e: - logger.warning(f"Error extracting real download URL: {e}") - return url - else: - # Standard approach for normal URLs - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - response = await page.goto(url, wait_until='networkidle', timeout=30000) - if response and response.headers.get('location'): - return response.headers['location'] - return page.url - except Exception as e: - logger.error(f"Error extracting real download URL: {e}") - return url - - # IMPROVED: Enhanced exam links extraction method - async def get_edu_exam_links(self, url): - """Specialized method for educational exam websites that follows a common pattern.""" - try: - logger.info(f"Fetching exam links from {url}") - links = set() - - # First try with direct requests for speed (but with proper headers) - headers = { - "User-Agent": get_random_user_agent(), - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Referer": "https://www.google.com/", - "DNT": "1" - } - - try: - response = requests.get(url, headers=headers, timeout=30) - - if response.status_code == 200: - # Parse with BeautifulSoup first for efficiency - soup = BeautifulSoup(response.text, "html.parser") - parsed_base = urlparse(url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - - # Look for all links - for a in soup.find_all("a", href=True): - href = a["href"] - full_url = urljoin(url, href) - - # Look for text clues - link_text = a.get_text().lower() - - # Special patterns for exam sites (expanded list) - url_patterns = [ - "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", - "/test/", "/download/", "/files/", "/assignments/", - "paper_", "question_", "exam_", "test_", "past_", - "assignment_", "sample_", "study_material", "notes_", - "/resource/", "/subject/", "/course/", "/material/" - ] - - text_patterns = [ - "exam", "paper", "test", "question", "past", "download", - "assignment", "sample", "study", "material", "notes", - "subject", "course", "resource", "pdf", "document", - "view", "open", "get", "solution", "answer" - ] - - # Check URL for patterns - if any(pattern in full_url.lower() for pattern in url_patterns): - links.add(full_url) - continue - - # Check link text for patterns - if any(pattern in link_text for pattern in text_patterns): - links.add(full_url) - continue - - # Check for common file extensions - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(full_url) - - # Check for download script parameters - if "Action=downloadfile" in url or "fname=" in url: - links.add(url) # Add the URL itself as it's a download link - except Exception as e: - logger.warning(f"Request-based extraction failed: {e}") - - # Browser-based approach for more thorough extraction or if initial approach was inadequate - try: - # Check if we need to proceed with browser-based extraction - if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url: - logger.info("Using browser for enhanced link extraction") - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # Navigate to the page with more natural timing - await self.page.goto(url, timeout=45000, wait_until='networkidle') - await self.page.wait_for_timeout(random.randint(1000, 2000)) - - # Handle captchas if present - if not await self.handle_captcha(self.page): - logger.warning("Captcha detected, 
extraction may be limited") - - # Get base URL for resolving relative links - parsed_base = urlparse(url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - - # Perform natural scrolling to trigger lazy-loaded content - page_height = await self.page.evaluate("document.body.scrollHeight") - viewport_height = await self.page.evaluate("window.innerHeight") - - for scroll_pos in range(0, page_height, viewport_height // 2): - await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})") - await self.page.wait_for_timeout(random.randint(300, 800)) - - # Scroll back to top - await self.page.evaluate("window.scrollTo(0, 0)") - await self.page.wait_for_timeout(500) - - # Extract all links with Playwright (better than just anchor tags) - all_links = await self.page.evaluate(""" - () => { - const results = []; - - // Get all anchor tags - const anchors = document.querySelectorAll('a[href]'); - for (const a of anchors) { - if (a.href) { - results.push({ - href: a.href, - text: a.innerText || a.textContent || '', - isButton: a.classList.contains('btn') || a.role === 'button' - }); - } - } - - // Get buttons that might contain links - const buttons = document.querySelectorAll('button'); - for (const btn of buttons) { - const onclick = btn.getAttribute('onclick') || ''; - if (onclick.includes('window.location') || onclick.includes('download')) { - results.push({ - href: '#button', - text: btn.innerText || btn.textContent || '', - isButton: true, - onclick: onclick - }); - } - } - - return results; - } - """) - - # Process the extracted links - for link_info in all_links: - href = link_info.get('href', '') - text = link_info.get('text', '').lower() - - if href and href != '#button': - # Check URL patterns - url_patterns = [ - "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", - "/test/", "/download/", "/files/", "/assignments/", - "paper_", "question_", "exam_", "test_", "past_", - "assignment_", "sample_", "study_material", "notes_" - ] - - # Check text patterns - text_patterns = [ - "exam", "paper", "test", "question", "past", "download", - "assignment", "sample", "study", "material", "notes", - "pdf", "document", "view", "open", "solution" - ] - - if any(pattern in href.lower() for pattern in url_patterns) or \ - any(pattern in text for pattern in text_patterns) or \ - any(href.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(href) - - # Check for download links in the page - download_links = await self.page.evaluate(""" - () => { - // Find all links that might be download links - const links = Array.from(document.querySelectorAll('a[href]')); - return links - .filter(a => { - const href = a.href.toLowerCase(); - return href.includes('download') || - href.includes('getfile') || - href.includes('view.php') || - href.includes('action=downloadfile') || - href.includes('fname='); - }) - .map(a => a.href); - } - """) - - for dl_link in download_links: - links.add(dl_link) - - # Check for ASP.NET specific elements that might contain exam links - grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive') - for grid in grid_elements: - grid_links = await grid.query_selector_all('a[href]') - for a in grid_links: - href = await a.get_attribute('href') - text = await a.text_content() - - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - links.add(full_url) - - # Try clicking pagination controls to reveal more content - pagination_buttons = 
await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a') - for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons - try: - # Check if this is a numeric pagination button (more likely to be useful) - button_text = await button.text_content() - if button_text and button_text.strip().isdigit(): - logger.info(f"Clicking pagination button: {button_text}") - await button.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Extract links from this page - new_page_links = await self.page.evaluate(""" - () => { - return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); - } - """) - - for href in new_page_links: - if href and not href.startswith('javascript:'): - if any(pattern in href.lower() for pattern in url_patterns) or \ - any(href.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(href) - except Exception as e: - logger.warning(f"Error clicking pagination button: {e}") - - # Try clicking any controls that might reveal more exam links (more focused approach) - show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn') - for button in show_buttons: - button_text = (await button.text_content() or "").lower() - button_value = (await button.get_attribute("value") or "").lower() - button_id = (await button.get_attribute("id") or "").lower() - - # Look for buttons that seem likely to reveal file lists - promising_terms = ["show", "view", "display", "list", "exam", "paper", "test", - "download", "resource", "material", "browse", "file"] - - if any(term in button_text or term in button_value or term in button_id - for term in promising_terms): - try: - logger.info(f"Clicking button: {button_text or button_value}") - await button.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Get any new links that appeared - new_links = await self.page.query_selector_all('a[href]') - for a in new_links: - href = await a.get_attribute('href') - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - - # Focus on file extensions and patterns - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \ - any(pattern in full_url.lower() for pattern in url_patterns): - links.add(full_url) - except Exception as e: - logger.warning(f"Error clicking button: {e}") - - # Special handling for ASP.NET PostBack links - try: - # Find and interact with ASP.NET __doPostBack elements - postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]') - for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks - try: - onclick = await element.get_attribute('onclick') - if onclick and '__doPostBack' in onclick: - element_text = await element.text_content() - - # Only interact with elements that seem likely to contain exam links - promising_terms = ["show", "view", "list", "exam", "paper", "test", - "download", "resource", "material"] - - if any(term in element_text.lower() for term in promising_terms): - logger.info(f"Clicking ASP.NET postback element: {element_text}") - - # Click the element - await element.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=10000) - - # Extract any new links - new_links = await 
self.page.query_selector_all('a[href]') - for a in new_links: - href = await a.get_attribute('href') - if href: - full_url = href if href.startswith('http') else urljoin(url, href) - if any(full_url.lower().endswith(ext) for ext in - ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - links.add(full_url) - except Exception as e: - logger.warning(f"Error interacting with postback element: {e}") - except Exception as e: - logger.warning(f"Error during postback handling: {e}") - - except Exception as e: - logger.error(f"Browser-based extraction failed: {e}") - - # Filter links to likely contain exam documents - filtered_links = [] - for link in links: - # Common file extensions for exam documents - if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']): - filtered_links.append(link) - continue - - # Common paths for exam documents - if any(pattern in link.lower() for pattern in [ - "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", - "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/", - "/resource/", "/material/", "/notes/", "/subjectmaterial/" - ]): - filtered_links.append(link) - continue - - # Check for download links (these may not have obvious extensions) - if is_download_link(link): - filtered_links.append(link) - - logger.info(f"Found {len(filtered_links)} potential exam document links") - return filtered_links - - except Exception as e: - logger.error(f"Error getting exam links: {e}") - return [] - - async def discover_hidden_links(self, page): - """Discover hidden links that might be in JavaScript, iframes, or dynamic content""" - hidden_links = set() - - # Execute JavaScript to find links in script tags and data attributes - js_links = await page.evaluate(""" - () => { - const links = new Set(); - - // Extract URLs from script tags - const scripts = document.querySelectorAll('script'); - for (const script of scripts) { - const content = script.textContent || ''; - const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || []; - for (let match of urlMatches) { - links.add(match.replace(/["']/g, '')); - } - } - - // Look for download-related variables in scripts - for (const script of scripts) { - const content = script.textContent || ''; - // Look for common patterns for file URLs in JavaScript - if (content.includes('downloadURL') || content.includes('fileURL') || - content.includes('pdfURL') || content.includes('documentURL')) { - - // Extract potential URLs - const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || []; - for (let match of potentialUrls) { - const url = match.replace(/["']/g, ''); - // Try to resolve relative URLs - if (url.startsWith('/') || !url.includes('://')) { - if (url.startsWith('/')) { - links.add(window.location.origin + url); - } else { - // Handle relative paths more carefully - const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); - links.add(base + url); - } - } else if (url.startsWith('http')) { - links.add(url); - } - } - } - } - - // Check for links in data attributes - const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]'); - for (const el of elements) { - for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) { - const val = el.getAttribute(attr); - if (val) { - // Try to resolve relative URLs - if (val.startsWith('/')) { - 
links.add(window.location.origin + val); - } else if (val.startsWith('http')) { - links.add(val); - } else if (!val.startsWith('javascript:') && !val.startsWith('#')) { - // Handle relative paths - const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); - links.add(base + val); - } - } - } - } - - // Look for URLs in inline event handlers - const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]'); - for (const el of clickableElements) { - for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) { - const val = el.getAttribute(attr); - if (val) { - // Check for JavaScript URLs with window.location - if (val.includes('window.location') || val.includes('document.location')) { - const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/); - if (urlMatch && urlMatch[1]) { - const url = urlMatch[1]; - if (url.startsWith('/')) { - links.add(window.location.origin + url); - } else if (url.startsWith('http')) { - links.add(url); - } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { - const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); - links.add(base + url); - } - } - } - - // Check for direct URLs in attributes - const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || []; - for (let match of urlMatches) { - links.add(match.replace(/["']/g, '')); - } - - // Check for download.php and similar patterns - if (val.includes('download.php') || val.includes('getfile.php') || - val.includes('Action=downloadfile') || val.includes('viewfile.php')) { - - // Handle both onclick handlers and direct hrefs - let url = ''; - if (attr === 'href') { - url = val; - } else { - // Extract URL from JavaScript - const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i); - if (jsUrlMatch) { - url = jsUrlMatch[1]; - } - } - - // Resolve URL if needed - if (url) { - if (url.startsWith('/')) { - links.add(window.location.origin + url); - } else if (url.startsWith('http')) { - links.add(url); - } else if (!url.startsWith('javascript:') && !url.startsWith('#')) { - const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1); - links.add(base + url); - } - } - } - } - } - } - - // Find PHP/ASP file download links - const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]'); - for (const link of fileLinks) { - links.add(link.href); - } - - return Array.from(links); - } - """) - - for link in js_links: - hidden_links.add(link) - - # Extract links from iframes - iframes = await page.query_selector_all('iframe') - for iframe in iframes: - try: - frame = await iframe.content_frame() - if frame: - iframe_links = await frame.evaluate(""" - () => { - return Array.from(document.querySelectorAll('a[href]')) - .map(a => a.href) - .filter(href => href.startsWith('http')); - } - """) - for link in iframe_links: - hidden_links.add(link) - except Exception as e: - logger.warning(f"Could not extract links from iframe: {e}") - - # Look for links in shadow DOM (used in modern web components) - shadow_links = await page.evaluate(""" - () => { - const links = new Set(); - - // Helper function to recursively process shadow roots - function processShadowRoot(root) { - if (!root) return; - - // Get links in this shadow root - const shadowLinks = root.querySelectorAll('a[href]'); - 
for (const link of shadowLinks) { - if (link.href && link.href.startsWith('http')) { - links.add(link.href); - } - } - - // Process nested shadow roots - const elements = root.querySelectorAll('*'); - for (const el of elements) { - if (el.shadowRoot) { - processShadowRoot(el.shadowRoot); - } - } - } - - // Find all shadow roots in the document - const elements = document.querySelectorAll('*'); - for (const el of elements) { - if (el.shadowRoot) { - processShadowRoot(el.shadowRoot); - } - } - - return Array.from(links); - } - """) - - for link in shadow_links: - hidden_links.add(link) - - # Look for download links in forms - form_links = await page.evaluate(""" - () => { - const links = new Set(); - - // Check for form actions that might be download endpoints - const forms = document.querySelectorAll('form'); - for (const form of forms) { - const action = form.action || ''; - if (action && ( - action.includes('download') || - action.includes('getfile') || - action.includes('viewfile') || - action.includes('Action=downloadfile') - )) { - // Collect input values that might be needed for the download - const inputs = {}; - const formInputs = form.querySelectorAll('input[name]'); - for (const input of formInputs) { - inputs[input.name] = input.value; - } - - // Store both the form action and any important inputs - links.add(action); - } - } - - return Array.from(links); - } - """) - - for link in form_links: - hidden_links.add(link) - - return hidden_links - - async def extract_downloadable_files(self, url, custom_ext_list): - found_files = [] - try: - # Normalize the URL to handle special cases - normalized_url = normalize_download_url(url) - - # Skip if we've already visited this URL - if normalized_url in self.visited_urls: - logger.info(f"Skipping already visited URL: {normalized_url}") - return [] - - # Mark this URL as visited - self.visited_urls.add(normalized_url) - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # First check if this is a direct download link (Action=downloadfile or fname parameter) - if is_download_link(normalized_url): - logger.info(f"Processing potential direct download link: {normalized_url}") - - # Try to extract the real download URL if needed - real_url = await self.extract_real_download_url(normalized_url) - - # Determine filename - for complex URLs this can be tricky - filename = os.path.basename(urlparse(real_url).path) - - # Handle URL-encoded filenames - if '%' in filename: - try: - filename = unquote(filename) - except Exception: - pass - - # For URLs with download parameters, try to extract filename from query - if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): - # Look for file parameter - params = parse_qs(urlparse(normalized_url).query) - - # Check common filename parameters - for param in ['file', 'filename', 'name', 'fname', 'f']: - if param in params and params[param]: - potential_filename = params[param][0] - if potential_filename and '/' not in potential_filename and '\\' not in potential_filename: - filename = os.path.basename(potential_filename) - break - - # If still no valid filename, use domain-based fallback - if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'): - domain = get_domain(real_url) - # Try to determine file type from content-type or extension hints in URL - ext = '.pdf' # Default - for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']: - if common_ext in normalized_url.lower(): - ext = 
common_ext - break - filename = f"file_from_{domain}{ext}" - - # Get file size - size_str = await self.get_file_size(real_url) - - # Add to found files - found_files.append({ - 'url': real_url, - 'filename': filename, - 'size': size_str, - 'metadata': {}, - 'download_url': normalized_url # Keep original URL for downloading - }) - - # For direct download links, we can return early - if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)): - return found_files - - # Special handling for educational exam sites - if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in - ["exam", "test", "pastpaper", "eduexp"]): - logger.info("Using specialized handler for educational exam site") - - # Get direct links to exam files - exam_links = await self.get_edu_exam_links(url) - - for link in exam_links: - # Try to resolve any redirection - real_url = await self.extract_real_download_url(link) - filename = os.path.basename(urlparse(real_url).path) - - # If filename is URL encoded (common with Chinese/international sites) - if '%' in filename: - try: - filename = unquote(filename) - except Exception: - pass - - # If filename is empty or invalid, create a sensible one - if not filename or filename == '/': - domain = get_domain(real_url) - ext = get_file_extension(real_url, '.pdf') - filename = f"file_from_{domain}{ext}" - - # Get file size - size_str = await self.get_file_size(real_url) - - # Get metadata for PDFs - meta = {} - if real_url.lower().endswith('.pdf'): - try: - meta = await self.get_pdf_metadata(real_url) - except Exception: - pass - - found_files.append({ - 'url': real_url, - 'filename': filename, - 'size': size_str, - 'metadata': meta, - 'download_url': link # Store original link for downloading - }) - - # If we found exam files with the specialized method, return them - if found_files: - return found_files - - # Standard extraction method if specialized method didn't find files - response = await self.page.goto(url, timeout=30000, wait_until='networkidle') - if not response: - return [] - - # Check for captchas - if not await self.handle_captcha(self.page): - logger.warning("Captcha detected, file extraction may be limited") - - # Scroll through the page naturally to trigger lazy loading - await self.page.evaluate(""" - (async () => { - const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); - const height = document.body.scrollHeight; - const scrollStep = Math.floor(window.innerHeight / 2); - - for (let i = 0; i < height; i += scrollStep) { - window.scrollTo(0, i); - await delay(100); - } - - window.scrollTo(0, 0); - })() - """) - await self.page.wait_for_timeout(1000) - - final_url = self.page.url - if '.php' in final_url or 'download' in final_url: - real_url = await self.extract_real_download_url(final_url) - if real_url != final_url: - # Try to detect the filename from headers or URL - response = await self.page.request.head(real_url, timeout=15000) - filename = None - - # Try to get from Content-Disposition header - content_disposition = response.headers.get('Content-Disposition', '') - if 'filename=' in content_disposition: - filename_match = re.search(r'filename=["\'](.*?)["\']', content_disposition) - if filename_match: - filename = filename_match.group(1) - - # If not found in headers, use URL basename - if not filename: - filename = os.path.basename(urlparse(real_url).path) - if not filename or filename == '/': - # Generate a name based on domain - domain = get_domain(real_url) - ext = 
get_file_extension(real_url, '.pdf') - filename = f"file_from_{domain}{ext}" - - found_files.append({ - 'url': real_url, - 'filename': filename, - 'size': await self.get_file_size(real_url), - 'metadata': {}, - 'download_url': final_url # Keep original URL for downloading - }) - return found_files - - await self.page.wait_for_load_state('networkidle', timeout=30000) - content = await self.page.content() - soup = BeautifulSoup(content, 'html.parser') - - default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', - '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', - '.pptx', '.odt', '.txt'] - all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) - - parsed_base = urlparse(final_url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - path_base = os.path.dirname(parsed_base.path) - - # Process all anchor tags - for a in soup.find_all('a', href=True): - href = a['href'].strip() - - if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower(): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - real_url = await self.extract_real_download_url(full_url) - if real_url and real_url != full_url: - found_files.append({ - 'url': real_url, - 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', - 'size': await self.get_file_size(real_url), - 'metadata': {}, - 'download_url': full_url # Original URL for download - }) - continue - - if any(href.lower().endswith(ext) for ext in all_exts): - file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - size_str = await self.get_file_size(file_url) - meta = {} - if file_url.lower().endswith('.pdf'): - meta = await self.get_pdf_metadata(file_url) - found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': size_str, - 'metadata': meta, - 'download_url': file_url # Same as URL for direct links - }) - - # Handle Google Drive links - elif ("drive.google.com" in href) or ("docs.google.com" in href): - file_id = None - for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: - match = re.search(pattern, href) - if match: - file_id = match.group(1) - break - if file_id: - # Get file info to determine type and view-only status - file_type, is_view_only = await self.get_google_drive_file_info(file_id) - - # Create a more informative filename based on info - filename = f"gdrive_{file_id}" - if file_type: - filename = f"{filename}.{file_type}" - - size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") - - found_files.append({ - 'url': href, # Use original URL - 'filename': filename, - 'size': size_str, - 'metadata': { - 'view_only': is_view_only, - 'file_type': file_type, - 'file_id': file_id - }, - 'download_url': href # Same as URL for Google Drive - }) - - # Also check for files in other elements (iframe, embed, object, etc.) 
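# --- Illustrative aside (a minimal standalone sketch, not a helper defined in this app): the
# --- embedded-media scan that follows (iframe/embed/object/source tags) can be reproduced with
# --- plain BeautifulSoup. The function name, sample HTML and extension tuple are assumptions.
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def embedded_file_urls(html, page_url, exts=('.pdf', '.mp4', '.docx')):
    """Collect file-like URLs from iframe/embed/object/source tags."""
    urls = []
    for elem in BeautifulSoup(html, 'html.parser').find_all(['iframe', 'embed', 'object', 'source']):
        src = elem.get('src') or elem.get('data')  # <object> carries its URL in the data attribute
        if src and src.lower().split('?')[0].endswith(exts):
            urls.append(urljoin(page_url, src))  # resolve relative paths against the page URL
    return urls

# e.g. embedded_file_urls('<embed src="/files/notes.pdf">', 'https://example.com/page')
# -> ['https://example.com/files/notes.pdf']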
- other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) - for elem in other_elements: - src = elem.get('src') or elem.get('data') - if src and any(src.lower().endswith(ext) for ext in all_exts): - file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) - size_str = await self.get_file_size(file_url) - meta = {} - if file_url.lower().endswith('.pdf'): - meta = await self.get_pdf_metadata(file_url) - found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': size_str, - 'metadata': meta, - 'download_url': file_url - }) - - # Check for file links in onclick attributes - onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') - for elem in onclick_elements: - onclick = await elem.get_attribute('onclick') - urls = re.findall(r'(https?://[^\'"]+)', onclick) - for url_match in urls: - if any(url_match.lower().endswith(ext) for ext in all_exts): - size_str = await self.get_file_size(url_match) - meta = {} - if url_match.lower().endswith('.pdf'): - meta = await self.get_pdf_metadata(url_match) - found_files.append({ - 'url': url_match, - 'filename': os.path.basename(url_match.split('?')[0]), - 'size': size_str, - 'metadata': meta, - 'download_url': url_match - }) - - # Also check for data-src and data-url attributes (common in lazy-loaded sites) - data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]') - for elem in data_elements: - for attr in ['data-src', 'data-url', 'data-href', 'data-download']: - try: - value = await elem.get_attribute(attr) - if value and any(value.lower().endswith(ext) for ext in all_exts): - file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) - found_files.append({ - 'url': file_url, - 'filename': os.path.basename(file_url.split('?')[0]), - 'size': await self.get_file_size(file_url), - 'metadata': {}, - 'download_url': file_url - }) - except: - pass - - # Check script tags for JSON data that might contain file URLs - script_elements = soup.find_all('script', type='application/json') - for script in script_elements: - try: - json_data = json.loads(script.string) - # Look for URL patterns in the JSON data - def extract_urls_from_json(obj, urls_found=None): - if urls_found is None: - urls_found = [] - if isinstance(obj, dict): - for k, v in obj.items(): - # Check if any key contains url-like terms - url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download'] - if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'): - urls_found.append(v) - else: - extract_urls_from_json(v, urls_found) - elif isinstance(obj, list): - for item in obj: - extract_urls_from_json(item, urls_found) - return urls_found - - json_urls = extract_urls_from_json(json_data) - for json_url in json_urls: - if any(json_url.lower().endswith(ext) for ext in all_exts): - found_files.append({ - 'url': json_url, - 'filename': os.path.basename(json_url.split('?')[0]), - 'size': await self.get_file_size(json_url), - 'metadata': {}, - 'download_url': json_url - }) - except: - pass - - # Check for hidden download buttons or forms - hidden_elements = await self.page.evaluate(""" - () => { - const results = []; - - // Check for hidden forms with download actions - const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]'); - for (const form of forms) { - const action = 
form.getAttribute('action') || ''; - results.push({ - type: 'form', - action: action, - inputs: Array.from(form.querySelectorAll('input[name]')).map(input => { - return {name: input.name, value: input.value}; - }) - }); - } - - // Check for hidden download links/buttons - const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => { - const style = window.getComputedStyle(a); - return (style.display === 'none' || style.visibility === 'hidden') && - (a.href.includes('download') || a.href.includes('file')); - }); - - for (const link of hiddenLinks) { - results.push({ - type: 'link', - href: link.href, - text: link.innerText || link.textContent - }); - } - - return results; - } - """) - - # Process hidden elements - for elem in hidden_elements: - if elem['type'] == 'link' and 'href' in elem: - href = elem['href'] - if any(href.lower().endswith(ext) for ext in all_exts): - found_files.append({ - 'url': href, - 'filename': os.path.basename(href.split('?')[0]), - 'size': await self.get_file_size(href), - 'metadata': {}, - 'download_url': href - }) - - # Check for hidden links that might be in JavaScript, iframes, or dynamic content - hidden_links = await self.discover_hidden_links(self.page) - for link in hidden_links: - if any(link.lower().endswith(ext) for ext in all_exts): - found_files.append({ - 'url': link, - 'filename': os.path.basename(link.split('?')[0]), - 'size': await self.get_file_size(link), - 'metadata': {}, - 'download_url': link - }) - - # Deduplicate files by URL - seen_urls = set() - unique_files = [] - for f in found_files: - if f['url'] not in seen_urls: - seen_urls.add(f['url']) - unique_files.append(f) - - return unique_files - except Exception as e: - logger.error(f"Error extracting files from {url}: {e}") - traceback.print_exc() - return [] - - async def download_file(self, file_info, save_dir, referer): - file_url = file_info.get('download_url', file_info['url']) # Use download_url if available - fname = file_info['filename'] - path = os.path.join(save_dir, fname) - base, ext = os.path.splitext(fname) - counter = 1 - while os.path.exists(path): - path = os.path.join(save_dir, f"{base}_{counter}{ext}") - counter += 1 - os.makedirs(save_dir, exist_ok=True) - - # Check if we've already downloaded this file - if file_url in self.downloaded_files: - logger.info(f"File already downloaded: {file_url}") - return None - - try: - # Special handling for Google Drive files - if "drive.google.com" in file_url or "docs.google.com" in file_url: - # Check if it's marked as view-only in metadata - is_view_only = file_info.get('metadata', {}).get('view_only', False) - - # For view-only files, try our most robust approach first - if is_view_only: - logger.info(f"Attempting to download view-only file: {file_url}") - result_path = await self.force_download_viewonly(file_info, path) - if result_path: - self.downloaded_files.add(file_url) - return result_path - - # If that failed, try the regular download approach - logger.info("Primary method failed, trying fallback methods") - - # Try regular download methods - success = await self.download_from_google_drive(file_url, path) - if success: - self.downloaded_files.add(file_url) - return path - - # If all methods failed for Google Drive, try one last approach - logger.warning("All standard methods failed, attempting force download") - result_path = await self.force_download_viewonly(file_info, path) - if result_path: - self.downloaded_files.add(file_url) - return result_path if result_path else None - - # Special handling 
for complex download URLs - if 'Action=downloadfile' in file_url or 'fname=' in file_url: - logger.info(f"Using browser download approach for complex URL: {file_url}") - - # For these URLs, we'll need to navigate to the page and handle the download - await self.rotate_proxy_if_needed() - - async with self.context.new_page() as page: - # Set up download event listener - download_promise = page.wait_for_event("download") - - # Navigate to the URL - await page.goto(file_url, timeout=60000) - - # Wait for the download to start - try: - download = await download_promise - await download.save_as(path) - - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - except Exception as e: - logger.error(f"Browser download failed: {e}") - - # If download didn't start automatically, try to find and click download buttons - download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]') - for button in download_buttons: - try: - await button.click() - try: - download = await download_promise - await download.save_as(path) - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - except: - pass - except: - continue - - # If browser approach failed, try direct request as last resort - logger.info("Browser approach failed, trying direct request") - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # Try with direct requests first (faster) - try: - headers = { - 'User-Agent': get_random_user_agent(), - 'Accept': '*/*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Referer': referer, - 'DNT': '1' - } - - with requests.get(file_url, headers=headers, stream=True, timeout=30) as response: - if response.status_code == 200: - # Check content type to verify it's not HTML/error page - content_type = response.headers.get('Content-Type', '') - if 'text/html' in content_type and not file_url.endswith('.html'): - logger.warning(f"Received HTML instead of expected file: {file_url}") - else: - with open(path, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - # Verify file was downloaded correctly - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - except Exception as e: - logger.warning(f"Direct download failed: {e}, trying browser approach") - - # Original code for non-Google Drive downloads using Playwright - async with self.context.new_page() as page: - headers = { - 'Accept': '*/*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Referer': referer - } - - # Try to download with timeout protection - try: - response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000) - if response.status == 200: - content = await response.body() - with open(path, 'wb') as f: - f.write(content) - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - else: - logger.error(f"Download failed with status {response.status}: {file_url}") - - # Try to extract error information - error_info = await response.text() - logger.debug(f"Error response: {error_info[:200]}...") - - # Check if this might be a captcha or login issue - if detect_captcha(error_info): - logger.warning("Captcha detected during download") - # For HF Spaces, we can't implement browser-based captcha solving here - # Just log the issue for now - except PlaywrightTimeoutError: - logger.error(f"Download 
timed out after {self.download_timeout} seconds: {file_url}") - - # Try an alternative approach - using the browser's download manager - try: - logger.info("Trying browser download manager approach") - download_promise = page.wait_for_event("download") - await page.goto(file_url, timeout=60000) - - # Wait for download to start (with timeout) - download = await download_promise - await download.save_as(path) - - if os.path.exists(path) and os.path.getsize(path) > 0: - self.downloaded_files.add(file_url) - return path - except Exception as e: - logger.error(f"Browser download manager approach failed: {e}") - - return None - except Exception as e: - logger.error(f"Error downloading {file_url}: {e}") - return None - - # IMPROVED: Enhanced view-only document download method - async def force_download_viewonly(self, file_info, save_path): - """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs""" - try: - # Extract file ID - file_id = file_info.get('metadata', {}).get('file_id') - if not file_id: - url = file_info['url'] - for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: - match = re.search(pattern, url) - if match: - file_id = match.group(1) - break - - if not file_id: - logger.error("Could not extract file ID") - return None - - file_type = file_info.get('metadata', {}).get('file_type', 'pdf') - base, ext = os.path.splitext(save_path) - if not ext: - save_path = f"{base}.{file_type}" - - logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") - - # Create a dedicated browser instance with better resolution and stealth - browser_args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-web-security', - '--disable-features=IsolateOrigins,site-per-process', - '--disable-site-isolation-trials', - '--disable-blink-features=AutomationControlled' # Anti-detection - ] - - browser = await self.playwright.chromium.launch( - headless=True, - args=browser_args - ) - - # Use higher resolution for better quality - context = await browser.new_context( - viewport={'width': 1600, 'height': 1200}, - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - device_scale_factor=2.0, - accept_downloads=True # Critical for the download workflow - ) - - # Add anti-detection script - await context.add_init_script(""" - () => { - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - - // Change plugins - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ - lengthComputable: true, - loaded: 100, - total: 100 - })) - }); - - // Handle languages - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en', 'es'] - }); - - // Modify hardware concurrency - Object.defineProperty(navigator, 'hardwareConcurrency', { - get: () => 4 - }); - } - """) - - page = await context.new_page() - - try: - # Go to the file view page - logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) - await page.wait_for_load_state('networkidle') - - # Check for any barriers or permissions issues - content = await page.content() - if "the owner has not granted you permission to" in content: - logger.warning("Permission denied error detected") - - # Randomized wait to appear more human-like - await page.wait_for_timeout(random.randint(3000, 7000)) - - # 
Create temp directory - temp_dir = tempfile.mkdtemp() - - # Special handling for PDFs - if file_type.lower() == 'pdf': - # Use the improved scrolling and detection approach - - # Perform some natural mouse movements and scrolling - await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400)) - await page.wait_for_timeout(random.randint(500, 1000)) - - # Estimate number of pages - estimated_pages = await page.evaluate(""" - () => { - // Method 1: Check page counter text - const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { - const text = el.textContent || ''; - return /\\d+\\s*\\/\\s*\\d+/.test(text); - }); - - if (pageCounters.length > 0) { - const text = pageCounters[0].textContent || ''; - const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); - if (match && match[2]) return parseInt(match[2]); - } - - // Method 2: Check actual page elements - const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pageElements.length > 0) return pageElements.length; - - // Method 3: Look for page thumbnails - const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); - if (thumbnails.length > 0) return thumbnails.length; - - // Fallback: conservative guess - return 50; - } - """) - - logger.info(f"Estimated {estimated_pages} pages in PDF") - - # Initial scroll to trigger lazy loading - logger.info("Initial scroll to bottom to trigger lazy loading...") - await page.keyboard.press("End") - await page.wait_for_timeout(3000) - - # Scroll page by page to ensure all pages are loaded - logger.info("Scrolling page by page...") - max_attempts = min(estimated_pages * 3, 300) - attempt = 0 - prev_blob_count = 0 - - while attempt < max_attempts: - blob_count = await page.evaluate(""" - Array.from(document.getElementsByTagName('img')) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .length - """) - - logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") - - if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10): - logger.info("All pages appear to be loaded.") - break - - # Alternate between PageDown and End keys for more natural scrolling - if attempt % 3 == 0: - await page.keyboard.press("End") - else: - await page.keyboard.press("PageDown") - - # Randomized wait times - await page.wait_for_timeout(random.randint(1500, 3000)) - - # Move mouse randomly to appear more human-like - if attempt % 4 == 0: - await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800)) - - prev_blob_count = blob_count - attempt += 1 - - # Extra wait to ensure everything is loaded - await page.wait_for_timeout(5000) - - # Set up download event listener for the PDF - download_promise = page.wait_for_event("download") - - # Use jsPDF to generate PDF from loaded pages - logger.info("Generating PDF from loaded pages...") - result = await page.evaluate(r''' - (function() { - return new Promise((resolve, reject) => { - let script = document.createElement("script"); - script.onload = function () { - try { - let pdf = new jsPDF(); - let imgs = Array.from(document.getElementsByTagName("img")) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .sort((a, b) => { - const rectA = a.getBoundingClientRect(); - const rectB = b.getBoundingClientRect(); - return rectA.top - rectB.top; - }); - - console.log(`Found ${imgs.length} valid page images to add to PDF`); - - let added = 0; - for (let i = 0; i < imgs.length; i++) { - let img = imgs[i]; - let canvas 
= document.createElement("canvas"); - let ctx = canvas.getContext("2d"); - canvas.width = img.width; - canvas.height = img.height; - ctx.drawImage(img, 0, 0, img.width, img.height); - let imgData = canvas.toDataURL("image/jpeg", 1.0); - - if (added > 0) { - pdf.addPage(); - } - - pdf.addImage(imgData, 'JPEG', 0, 0); - added++; - } - - pdf.save("download.pdf"); - resolve({success: true, pageCount: added}); - } catch (error) { - reject({success: false, error: error.toString()}); - } - }; - - script.onerror = function() { - reject({success: false, error: "Failed to load jsPDF library"}); - }; - - script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; - document.body.appendChild(script); - }); - })(); - ''') - - if not result.get('success', False): - logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}") - - # Try fallback approach - screenshot method - logger.info("Trying fallback screenshot method...") - - # Navigate back to the first page - await page.evaluate(""" - () => { - // Find and click the "first page" button if available - const buttons = Array.from(document.querySelectorAll('button')); - const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page')); - if (firstPageBtn) firstPageBtn.click(); - } - """) - await page.wait_for_timeout(1000); - - # Create a PDF by taking screenshots of each page - screenshots = [] - current_page = 1 - max_pages = estimated_pages - - # Create a PDF using the reportlab package - while current_page <= max_pages: - screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png") - - # Try to find the current page element - page_elem = await page.query_selector('.drive-viewer-paginated-page') - if page_elem: - await page_elem.screenshot(path=screenshot_path) - else: - # Fallback to full page screenshot - await page.screenshot(path=screenshot_path) - - screenshots.append(screenshot_path) - - # Try to navigate to next page - next_btn = await page.query_selector('button[aria-label="Next page"]') - if next_btn: - is_disabled = await next_btn.get_attribute('disabled') - if is_disabled: - logger.info(f"Reached end of document at page {current_page}") - break - - await next_btn.click() - await page.wait_for_timeout(1000) - current_page += 1 - else: - break - - # Create PDF from screenshots - if screenshots: - first_img = Image.open(screenshots[0]) - width, height = first_img.size - - c = canvas.Canvas(save_path, pagesize=(width, height)) - for screenshot in screenshots: - img = Image.open(screenshot) - c.drawImage(screenshot, 0, 0, width, height) - c.showPage() - c.save() - - # Clean up screenshots - for screenshot in screenshots: - os.remove(screenshot) - - return save_path - - return None - - logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") - - # Wait for the download and save it - download = await download_promise - await download.save_as(save_path) - - # Clean up temp directory - try: - os.rmdir(temp_dir) - except: - pass - - else: - # Non-PDF file handling - screenshot_path = os.path.join(temp_dir, "file.png") - await page.screenshot(path=screenshot_path) - - if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: - # For document types, try to export directly - await self.export_google_doc(file_id, file_type, save_path) - else: - # For other types, save the screenshot with appropriate extension - shutil.copy(screenshot_path, save_path) - - os.remove(screenshot_path) - - # Close browser - await browser.close() - - # Verify file exists and has 
content - if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: - logger.info(f"Successfully downloaded file to {save_path}") - return save_path - else: - logger.error(f"Generated file is too small or missing: {save_path}") - return None - - except Exception as e: - logger.error(f"Error during force download: {e}") - if browser: - await browser.close() - return None - - except Exception as e: - logger.error(f"Force download preparation failed: {e}") - return None - - async def download_from_google_drive(self, url, save_path): - """Enhanced method to download from Google Drive with multiple fallback approaches""" - # Extract the file ID from different URL formats - file_id = None - url_patterns = [ - r'drive\.google\.com/file/d/([^/]+)', - r'drive\.google\.com/open\?id=([^&]+)', - r'docs\.google\.com/\w+/d/([^/]+)', - r'id=([^&]+)', - r'drive\.google\.com/uc\?id=([^&]+)', - ] - - for pattern in url_patterns: - match = re.search(pattern, url) - if match: - file_id = match.group(1) - break - - if not file_id: - logger.error(f"Could not extract file ID from URL: {url}") - return False - - # Determine file type first (important for handling different file types) - file_type, is_view_only = await self.get_google_drive_file_info(file_id) - logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") - - base, ext = os.path.splitext(save_path) - if not ext and file_type: - # Add the correct extension if missing - save_path = f"{base}.{file_type}" - - # For view-only files, use specialized approaches - if is_view_only: - # Approach 1: For PDFs, use the JS method - if file_type == 'pdf': - success = await self.download_viewonly_pdf_with_js(file_id, save_path) - if success: - return True - - # Approach 2: For Google Docs, Sheets, etc., use export API - if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: - success = await self.export_google_doc(file_id, file_type, save_path) - if success: - return True - - # Approach 3: Try the direct screenshot method for any view-only file - success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type) - if success: - return True - - # Try standard approaches for non-view-only files - try: - # Try direct download link first (fastest) - direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t" - - # Add anti-bot headers - headers = { - 'User-Agent': get_random_user_agent(), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.9', - 'Referer': 'https://drive.google.com/', - 'DNT': '1' - } - - # Try with streaming to handle larger files - with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r: - if r.status_code == 200: - # Check if we got HTML instead of the file - content_type = r.headers.get('Content-Type', '') - if 'text/html' in content_type and not file_id.endswith('.html'): - logger.warning("Received HTML instead of file, trying with session cookies") - else: - # Looks like we got the actual file - with open(save_path, 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - # Verify file exists and has content - if os.path.exists(save_path) and os.path.getsize(save_path) > 0: - logger.info("Direct download successful") - return True - - # Try with requests and session cookies - session = requests.Session() - session.headers.update({'User-Agent': get_random_user_agent()}) - - # Visit the page first to get cookies - 
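# --- Illustrative aside (a minimal sketch under assumptions, not the app's own helper): the
# --- cookie-priming plus confirm-token flow used below can be written as a small standalone
# --- function; file_id and dest_path are placeholders.
import requests

def fetch_gdrive_public_file(file_id, dest_path, timeout=60):
    """Try Drive's uc?export=download endpoint, resending with the confirm token if asked."""
    session = requests.Session()
    # Prime cookies by visiting the file's view page first, as the code below does.
    session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=timeout)
    url = "https://drive.google.com/uc"
    params = {"id": file_id, "export": "download"}
    resp = session.get(url, params=params, stream=True, timeout=timeout)
    # Large or unscanned files answer with a warning page plus a download_warning cookie.
    token = next((v for k, v in resp.cookies.items() if k.startswith("download_warning")), None)
    if token:
        params["confirm"] = token
        resp = session.get(url, params=params, stream=True, timeout=timeout)
    if "text/html" in resp.headers.get("Content-Type", ""):
        return False  # still an interstitial or permission page, not the file itself
    with open(dest_path, "wb") as fh:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            if chunk:
                fh.write(chunk)
    return True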
session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30) - - # Try download - url = f"https://drive.google.com/uc?id={file_id}&export=download" - response = session.get(url, stream=True, timeout=30) - - # Check for confirmation token - confirmation_token = None - for k, v in response.cookies.items(): - if k.startswith('download_warning'): - confirmation_token = v - break - - # Use confirmation token if found - if confirmation_token: - url = f"{url}&confirm={confirmation_token}" - response = session.get(url, stream=True, timeout=60) - - # Check if we're getting HTML instead of the file - content_type = response.headers.get('Content-Type', '') - if 'text/html' in content_type: - logger.warning("Received HTML instead of file - likely download restriction") - else: - with open(save_path, 'wb') as f: - for chunk in response.iter_content(chunk_size=1024*1024): - if chunk: - f.write(chunk) - - if os.path.exists(save_path) and os.path.getsize(save_path) > 0: - with open(save_path, 'rb') as f: - content = f.read(100) - if b'<html' not in content: - logger.info("Successfully downloaded with requests session") - return True - except Exception as e: - logger.warning(f"Requests session download failed: {e}") - - # Try browser-based approach as last resort - try: - async with self.context.new_page() as page: - # Visit the file view page first to get cookies - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) - await page.wait_for_timeout(3000) - - # Set up download event listener - download_promise = page.wait_for_event("download") - - # Try to trigger the download button click - download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]') - if download_button: - await download_button.click() - - # Wait for download to start - try: - download = await download_promise - await download.save_as(save_path) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - except Exception as e: - logger.error(f"Error during browser download: {e}") - return False - else: - # Try the export download URL - await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000) - - # Look for and click any download buttons or links - download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")') - for elem in download_elements: - try: - await elem.click() - # Wait a bit to see if download starts - try: - download = await download_promise - await download.save_as(save_path) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - except: - pass - except: - continue - except Exception as e: - logger.error(f"Browser-based download attempt failed: {e}") - - logger.warning("All standard download methods failed") - return False - - async def download_viewonly_pdf_with_js(self, file_id, save_path): - """Download view-only PDF using the enhanced blob image caching technique""" - try: - # Create a dedicated browser instance with stealth capabilities - browser_args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-web-security', - '--disable-blink-features=AutomationControlled' # Anti-detection - ] - - browser = await self.playwright.chromium.launch( - headless=True, - args=browser_args - ) - - # Setup stealth context - context = await browser.new_context( - viewport={'width': 1600, 'height': 1200}, - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - accept_downloads=True, # Critical for handling the download event - ignore_https_errors=True - ) - - # Add stealth script - await context.add_init_script(""" - () => { - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - - // Change plugins and languages to appear more human - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5].map(() => ({ - lengthComputable: true, - loaded: 100, - total: 100 - })) - }); - - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en', 'es'] - }); - } - """) - - page = await context.new_page() - - try: - # Step 1: Navigate to the file with human-like behavior - logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) - await page.wait_for_load_state('networkidle') - - # Perform human-like interactions - await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300)) - await page.wait_for_timeout(random.randint(2000, 5000)) - - # Step 2: Estimate the number of pages - estimated_pages = await page.evaluate(""" - () => { - // Look for page counter in the interface - const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { - const text = el.textContent || ''; - return /\\d+\\s*\\/\\s*\\d+/.test(text); - }); - - if (pageCounters.length > 0) { - const text = pageCounters[0].textContent || ''; - const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); - if (match && match[2]) return parseInt(match[2]); - } - - // If we can't find a counter, check actual pages - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pages.length > 0) return pages.length; - - // Default to a reasonable number if we can't determine - return 50; - } - """) - - logger.info(f"Estimated number of pages: {estimated_pages}") - - # Step 3: Initial scroll to trigger loading - logger.info("Initial scroll to bottom to trigger lazy loading...") - await page.keyboard.press("End") - await page.wait_for_timeout(3000) - - # Step 4: Wait for all pages to load with better feedback and randomization - logger.info("Scrolling through document to load all pages...") - max_attempts = min(estimated_pages * 3, 300) - attempt = 0 - prev_blob_count = 0 - consecutive_same_count = 0 - - while attempt < max_attempts: - # Count blob images (which are the PDF pages) - blob_count = await page.evaluate(""" - Array.from(document.getElementsByTagName('img')) - .filter(img => img.src.startsWith('blob:') && img.width > 100) - .length - """) - - logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") - - # Check if we've loaded all pages or if we're stuck - if blob_count >= estimated_pages: - logger.info(f"All {estimated_pages} pages appear to be loaded.") - break - - if blob_count == prev_blob_count: - consecutive_same_count += 1 - if consecutive_same_count >= 5 and blob_count > 0: - logger.info(f"No new pages loaded after {consecutive_same_count} attempts. 
Assuming all available pages ({blob_count}) are loaded.") - break - else: - consecutive_same_count = 0 - - # Mix up the scrolling approach for more human-like behavior - scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"]) - - if scroll_action == "PageDown": - await page.keyboard.press("PageDown") - elif scroll_action == "End": - await page.keyboard.press("End") - elif scroll_action == "ArrowDown": - # Press arrow down multiple times - for _ in range(random.randint(5, 15)): - await page.keyboard.press("ArrowDown") - await page.wait_for_timeout(random.randint(50, 150)) - else: # mouse - # Scroll using mouse wheel - current_y = random.randint(300, 700) - await page.mouse.move(x=random.randint(300, 800), y=current_y) - await page.mouse.wheel(0, random.randint(300, 800)) - - # Random wait between scrolls - await page.wait_for_timeout(random.randint(1000, 3000)) - - prev_blob_count = blob_count - attempt += 1 - - # Extra wait to ensure everything is fully loaded - await page.wait_for_timeout(5000) - - # Step 5: Set up a download event listener - download_promise = page.wait_for_event("download") - - # Step 6: Inject the jsPDF script to generate PDF - logger.info("Generating PDF from loaded pages...") - result = await page.evaluate(r''' - (function() { - return new Promise((resolve, reject) => { - let script = document.createElement("script"); - script.onload = function () { - try { - let pdf = new jsPDF(); - let imgs = document.getElementsByTagName("img"); - let validImages = []; - - // First collect all valid blob images - for (let i = 0; i < imgs.length; i++) { - let img = imgs[i]; - if (!/^blob:/.test(img.src)) continue; - if (img.width < 100 || img.height < 100) continue; - validImages.push(img); - } - - // Sort by position in the document - validImages.sort((a, b) => { - const rectA = a.getBoundingClientRect(); - const rectB = b.getBoundingClientRect(); - return rectA.top - rectB.top; - }); - - console.log(`Found ${validImages.length} valid page images to add to PDF`); - - let added = 0; - // Process each image as a page - for (let i = 0; i < validImages.length; i++) { - let img = validImages[i]; - let canvas = document.createElement("canvas"); - let ctx = canvas.getContext("2d"); - canvas.width = img.width; - canvas.height = img.height; - ctx.drawImage(img, 0, 0, img.width, img.height); - let imgData = canvas.toDataURL("image/jpeg", 1.0); - - if (added > 0) { - pdf.addPage(); - } - - pdf.addImage(imgData, 'JPEG', 0, 0); - added++; - } - - pdf.save("download.pdf"); - resolve({success: true, pageCount: added}); - } catch (error) { - reject({success: false, error: error.toString()}); - } - }; - - script.onerror = function() { - reject({success: false, error: "Failed to load jsPDF library"}); - }; - - // Use a reliable CDN - script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; - document.body.appendChild(script); - }); - })(); - ''') - - if not result.get('success'): - logger.error(f"Error in PDF generation: {result.get('error')}") - return False - - logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") - - # Step 7: Wait for the download to complete and save the file - download = await download_promise - - # Step 8: Save the downloaded file to the specified path - await download.save_as(save_path) - logger.info(f"Successfully saved PDF to {save_path}") - - return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 - - finally: - await browser.close() - - except Exception as e: - logger.error(f"Error in 
viewonly PDF download process: {e}") - return False - - async def download_viewonly_with_screenshots(self, file_id, save_path, file_type): - """Download any view-only file by taking screenshots""" - try: - async with self.context.new_page() as page: - # Set high-resolution viewport - await page.set_viewport_size({"width": 1600, "height": 1200}) - - # Navigate to the file - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000) - - # Make sure the file is loaded - await page.wait_for_load_state('networkidle') - await page.wait_for_timeout(3000) # Extra time for rendering - - # Create directory for screenshots if multiple pages - base_dir = os.path.dirname(save_path) - base_name = os.path.splitext(os.path.basename(save_path))[0] - screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots") - os.makedirs(screenshots_dir, exist_ok=True) - - # Check if it's a multi-page document - is_multi_page = await page.evaluate(""" - () => { - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - return pages.length > 1; - } - """) - - if is_multi_page and file_type == 'pdf': - # For multi-page PDFs, take screenshots of each page - page_count = await page.evaluate(""" - async () => { - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - const container = document.querySelector('.drive-viewer-paginated-scrollable'); - - if (!container || pages.length === 0) return 0; - - // Scroll through to make sure all pages are loaded - const scrollHeight = container.scrollHeight; - const viewportHeight = container.clientHeight; - const scrollStep = viewportHeight; - - for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) { - container.scrollTo(0, scrollPos); - await delay(300); - } - - // Scroll back to top - container.scrollTo(0, 0); - await delay(300); - - return pages.length; - } - """) - - logger.info(f"Found {page_count} pages in document") - - # Take screenshots of each page - screenshots = [] - for i in range(page_count): - # Scroll to page - await page.evaluate(f""" - async () => {{ - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - const pages = document.querySelectorAll('.drive-viewer-paginated-page'); - if (pages.length <= {i}) return false; - - pages[{i}].scrollIntoView(); - await delay(500); - return true; - }} - """) - - # Take screenshot - screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") - await page.screenshot(path=screenshot_path, clip={ - 'x': 0, - 'y': 0, - 'width': 1600, - 'height': 1200 - }) - screenshots.append(screenshot_path) - - # Combine screenshots into PDF - c = canvas.Canvas(save_path) - for screenshot in screenshots: - img = Image.open(screenshot) - width, height = img.size - - # Add page to PDF - c.setPageSize((width, height)) - c.drawImage(screenshot, 0, 0, width, height) - c.showPage() - - c.save() - - # Clean up screenshots - for screenshot in screenshots: - os.remove(screenshot) - os.rmdir(screenshots_dir) - - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - else: - # For single-page or non-PDF files, just take one screenshot - screenshot_path = os.path.join(screenshots_dir, "screenshot.png") - await page.screenshot(path=screenshot_path, full_page=True) - - # Convert to requested format if needed - if file_type == 'pdf': - # Create PDF from screenshot - img = Image.open(screenshot_path) - width, height = img.size - - c = 
canvas.Canvas(save_path, pagesize=(width, height)) - c.drawImage(screenshot_path, 0, 0, width, height) - c.save() - else: - # Just copy the screenshot to the destination with proper extension - shutil.copy(screenshot_path, save_path) - - # Clean up - os.remove(screenshot_path) - os.rmdir(screenshots_dir) - - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - - except Exception as e: - logger.error(f"Error taking screenshots: {e}") - return False - - async def export_google_doc(self, file_id, file_type, save_path): - """Export Google Docs/Sheets/Slides to downloadable formats""" - try: - # Map file types to export formats - export_formats = { - 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx - 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx - 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx - 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'pdf': 'application/pdf', - } - - export_format = export_formats.get(file_type, 'application/pdf') - export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}" - - if 'sheet' in file_type or 'xlsx' in file_type: - export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" - elif 'ppt' in file_type or 'presentation' in file_type: - export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx" - elif file_type == 'pdf': - export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf" - - async with self.context.new_page() as page: - # Get cookies from the main view page first - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') - - # Now try the export - response = await page.goto(export_url, wait_until='networkidle') - - if response.status == 200: - content = await response.body() - with open(save_path, 'wb') as f: - f.write(content) - return os.path.exists(save_path) and os.path.getsize(save_path) > 0 - else: - logger.warning(f"Export failed with status {response.status}") - return False - - except Exception as e: - logger.error(f"Error exporting Google Doc: {e}") - return False - - async def get_google_drive_file_info(self, file_id): - """Get file type and view-only status from Google Drive""" - file_type = None - is_view_only = False - - try: - async with self.context.new_page() as page: - await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) - - # Check if view-only - view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') - is_view_only = view_only_text is not None - - # Check for Google Docs viewer - gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') - gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') - gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') - - if gdocs_viewer: - file_type = 'docx' - elif gsheets_viewer: - file_type = 'xlsx' - elif gslides_viewer: - file_type = 'pptx' - else: - # Check for PDF viewer - pdf_viewer = await page.query_selector('embed[type="application/pdf"]') - if pdf_viewer: - file_type = 'pdf' - else: - # Check for image viewer - img_viewer = await 
page.query_selector('img[src*="googleusercontent.com"]') - if img_viewer: - # Get image type from src - img_src = await img_viewer.get_attribute('src') - if 'jpg' in img_src or 'jpeg' in img_src: - file_type = 'jpg' - elif 'png' in img_src: - file_type = 'png' - else: - file_type = 'jpg' # Default to jpg - else: - # Generic file type fallback - file_type = 'pdf' # Default to PDF - - # If still no type, check filename - if not file_type: - title_element = await page.query_selector('div[role="heading"]') - if title_element: - title = await title_element.text_content() - if title: - ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) - if ext_match: - file_type = ext_match.group(1).lower() - - except Exception as e: - logger.error(f"Error getting Google Drive file info: {e}") - file_type = 'pdf' # Default to PDF if we can't determine - - return file_type, is_view_only - - # IMPROVED: Enhanced sublink extraction method - async def get_sublinks(self, url, limit=10000): - """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" - links = set() - try: - logger.info(f"Fetching sublinks from: {url}") - - # Check if this is a direct download link - if is_download_link(url): - logger.info(f"URL appears to be a direct download link: {url}") - links.add(url) - return list(links)[:limit] - - # Skip if we've already visited this URL - normalized_url = normalize_download_url(url) - if normalized_url in self.visited_urls: - logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}") - return list(links)[:limit] - - # Add to visited URLs - self.visited_urls.add(normalized_url) - - # Special handling for educational sites like phsms.cloud.ncnu.edu.tw - if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in - ["exam", "test", "pastpaper", "eduexp"]): - logger.info("Using specialized exam site sublink extraction") - edu_links = await self.get_edu_exam_links(url) - for link in edu_links: - links.add(link) - - # If we found a good number of links with the specialized method, return them - if len(links) > 5: - logger.info(f"Found {len(links)} sublinks with specialized method") - return list(links)[:limit] - - # Rotate proxy if needed - await self.rotate_proxy_if_needed() - - # Standard sublink extraction for all sites - try: - await self.page.goto(url, timeout=30000, wait_until='networkidle') - except Exception as e: - logger.warning(f"Error navigating to URL for sublink extraction: {e}") - # Continue with what we have, we'll try to extract links anyway - - # Get base URL for resolving relative links - parsed_base = urlparse(url) - base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" - path_base = os.path.dirname(parsed_base.path) - - # Perform initial scrolling to load lazy content - await self.page.evaluate(""" - async () => { - const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); - const height = document.body.scrollHeight; - const step = Math.floor(window.innerHeight / 2); - - for (let i = 0; i < height; i += step) { - window.scrollTo(0, i); - await delay(150); - } - - window.scrollTo(0, 0); - } - """) - await self.page.wait_for_timeout(1000) - - # Check if page has ASP.NET elements which might need special handling - is_aspnet = await self.page.evaluate(''' - () => { - return document.querySelector('form#aspnetForm') !== null || - document.querySelector('input[name="__VIEWSTATE"]') !== null; - } - ''') - - if is_aspnet: - logger.info("Detected ASP.NET page, using enhanced extraction 
method") - - # Try to interact with ASP.NET controls that might reveal more links - # Look for dropdowns, buttons, and grid elements - dropdowns = await self.page.query_selector_all('select') - buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') - - # Try interacting with dropdowns first - for dropdown in dropdowns: - try: - # Get all options - options = await self.page.evaluate(''' - (dropdown) => { - return Array.from(dropdown.options).map(o => o.value); - } - ''', dropdown) - - # Try selecting each option - for option in options: - if option: - await dropdown.select_option(value=option) - await self.page.wait_for_timeout(1000) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Extract any new links that appeared - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error interacting with dropdown: {e}") - - # Try clicking buttons (but avoid dangerous ones like "delete") - safe_buttons = [] - for button in buttons: - button_text = await button.text_content() or "" - button_value = await button.get_attribute("value") or "" - button_id = await button.get_attribute("id") or "" - combined_text = (button_text + button_value + button_id).lower() - - # Skip potentially destructive buttons - if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): - continue - - # Prioritize buttons that might show more content - if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): - safe_buttons.append(button) - - # Click the safe buttons - for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks - try: - await button.click() - await self.page.wait_for_timeout(1000) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Extract any new links that appeared - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error clicking button: {e}") - - # Extract links from the initial page state - await self.extract_all_link_types(links, base_url, path_base) - - # Look specifically for links inside grid/table views which are common in ASP.NET applications - grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') - for cell in grid_cells: - try: - href = await cell.get_attribute('href') - if href: - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - links.add(full_url) - except Exception as e: - logger.warning(f"Error extracting grid link: {e}") - - # Extract links from onclick attributes and javascript:__doPostBack calls - postback_links = await self.page.evaluate(''' - () => { - const results = []; - // Find elements with onclick containing __doPostBack - const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); - for (const el of elements) { - // Extract the postback target - const onclick = el.getAttribute('onclick') || ''; - const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); - if (match && match[1]) { - // Get the visible text to use as description - const text = el.innerText || el.textContent || 'Link'; - results.push({ - id: match[1], - text: text.trim() - }); - } - } - return results; - } - ''') - - # Try interacting with some of the postback links - for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions - try: - logger.info(f"Trying postback link: 
{postback['text']} ({postback['id']})") - await self.page.evaluate(f''' - () => {{ - if (typeof __doPostBack === 'function') {{ - __doPostBack('{postback["id"]}', ''); - }} - }} - ''') - await self.page.wait_for_timeout(1500) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Extract any new links that appeared - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error with postback: {e}") - - # Look for pagination controls and try to navigate through them - pagination_elements = await self.page.query_selector_all( - 'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]' - ) - - # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops) - for i in range(min(5, len(pagination_elements))): - try: - # Focus on elements that look like "next page" buttons - el = pagination_elements[i] - el_text = await el.text_content() or "" - - # Only click if this looks like a pagination control - if "next" in el_text.lower() or ">" == el_text.strip() or "โ†’" == el_text.strip(): - logger.info(f"Clicking pagination control: {el_text}") - await el.click() - await self.page.wait_for_timeout(2000) - await self.page.wait_for_load_state('networkidle', timeout=5000) - - # Get new links from this page - await self.extract_all_link_types(links, base_url, path_base) - except Exception as e: - logger.warning(f"Error clicking pagination: {e}") - - # Check for hidden links that might be revealed by JavaScript - hidden_links = await self.page.evaluate(""" - () => { - // Try to execute common JavaScript patterns that reveal hidden content - try { - // Common patterns used in websites to initially hide content - const hiddenContainers = document.querySelectorAll( - '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]' - ); - - // Attempt to make them visible - hiddenContainers.forEach(el => { - el.style.display = 'block'; - el.style.visibility = 'visible'; - el.classList.remove('hidden', 'hide'); - }); - - // Return any newly visible links - return Array.from(document.querySelectorAll('a[href]')).map(a => a.href); - } catch (e) { - return []; - } - } - """) - - # Add any newly discovered links - for href in hidden_links: - if href and not href.startswith('javascript:'): - links.add(href) - - # Find all download links - download_links = await self.page.evaluate(""" - () => { - return Array.from(document.querySelectorAll('a[href]')) - .filter(a => { - const href = a.href.toLowerCase(); - return href.includes('download') || - href.includes('file') || - href.includes('get') || - href.includes('view.php') || - href.includes('action=') || - href.includes('fname='); - }) - .map(a => a.href); - } - """) - - for download_link in download_links: - links.add(download_link) - - # Also check for hidden links in JavaScript, iframes, or dynamic content - js_links = await self.discover_hidden_links(self.page) - for link in js_links: - links.add(link) - - logger.info(f"Found {len(links)} sublinks") - - # Prioritize download links - prioritized_links = [] - normal_links = [] - - for link in links: - if is_download_link(link): - prioritized_links.append(link) - else: - normal_links.append(link) - - # Return prioritized links first, then normal links, up to the limit - result = prioritized_links + normal_links - return result[:limit] - - except Exception as e: - logger.error(f"Error getting sublinks from {url}: {e}") - return list(links)[:limit] # Return what we have so far - - async def 
extract_all_link_types(self, links_set, base_url, path_base): - """Extract all types of links from the current page""" - # Get all tag links - a_links = await self.page.query_selector_all('a[href]') - for a in a_links: - try: - href = await a.get_attribute('href') - if href and not href.startswith('javascript:') and not href.startswith('#'): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Get iframe sources - iframes = await self.page.query_selector_all('iframe[src]') - for iframe in iframes: - try: - src = await iframe.get_attribute('src') - if src and not src.startswith('javascript:') and not src.startswith('about:'): - full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Get links from onclick attributes that reference URLs - onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') - for el in onclick_elements: - try: - onclick = await el.get_attribute('onclick') - urls = re.findall(r'(https?://[^\'"]+)', onclick) - for url in urls: - links_set.add(url) - except Exception: - pass - - # Look for URLs in data-* attributes - data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') - for el in data_elements: - for attr in ['data-url', 'data-href', 'data-src']: - try: - value = await el.get_attribute(attr) - if value and not value.startswith('javascript:'): - full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Look for special anchor links that might not have href attributes - special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') - for anchor in special_anchors: - try: - href = await anchor.get_attribute('href') - if href and not href.startswith('javascript:') and not href.startswith('#'): - full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) - links_set.add(full_url) - except Exception: - pass - - # Extract links from JSON data embedded in the page - script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]') - for script in script_elements: - try: - script_content = await script.text_content() - if script_content: - # Look for URLs in the JSON content - urls = re.findall(r'(https?://[^\'"]+)', script_content) - for url in urls: - links_set.add(url) - except Exception: - pass - - def resolve_relative_url(self, relative_url, base_url, path_base): - """Properly resolve relative URLs considering multiple formats""" - if relative_url.startswith('/'): - # Absolute path relative to domain - return f"{base_url}{relative_url}" - elif relative_url.startswith('./'): - # Explicit relative path - return f"{base_url}{path_base}/{relative_url[2:]}" - elif relative_url.startswith('../'): - # Parent directory - parent_path = '/'.join(path_base.split('/')[:-1]) - return f"{base_url}{parent_path}/{relative_url[3:]}" - else: - # Regular relative path - return f"{base_url}{path_base}/{relative_url}" - - async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): - if not custom_ext_list: - custom_ext_list = [] - progress_text = st.empty() - progress_bar = st.progress(0) - file_count_text = st.empty() - - 
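        # Flow of the deep search below: detect ASP.NET pages (form#aspnetForm /
        # __VIEWSTATE) so their sublinks get a longer timeout, short-circuit URLs that
        # already look like direct downloads, collect files from the main page, then walk
        # each sublink and de-duplicate the combined results by URL before returning.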
try: - # Reset the visited URLs for a fresh deep search - self.visited_urls = set() - - progress_text.text("Analyzing main page...") - # Special handling for ASP.NET pages - is_aspnet = False - try: - await self.page.goto(url, timeout=30000, wait_until='networkidle') - is_aspnet = await self.page.evaluate(''' - () => { - return document.querySelector('form#aspnetForm') !== null || - document.querySelector('input[name="__VIEWSTATE"]') !== null; - } - ''') - except Exception: - pass - - # Check if this URL is a direct download - if is_download_link(url): - progress_text.text("URL appears to be a direct download. Analyzing...") - - # Try to extract file directly - normalized_url = normalize_download_url(url) - file_info = { - 'url': normalized_url, - 'download_url': normalized_url, - 'filename': os.path.basename(urlparse(normalized_url).path) or 'download', - 'size': 'Unknown Size', - 'metadata': {} - } - - # Add to visited URLs - self.visited_urls.add(normalized_url) - progress_bar.progress(1.0) - return [file_info] - - # Extract files from main page - main_files = await self.extract_downloadable_files(url, custom_ext_list) - initial_count = len(main_files) - file_count_text.text(f"Found {initial_count} files on main page") - - # Get sublinks with enhanced method - progress_text.text("Getting sublinks...") - sublinks = await self.get_sublinks(url, sublink_limit) - total_links = len(sublinks) - progress_text.text(f"Found {total_links} sublinks to process") - - # Always include files from the main page, regardless of sublinks - all_files = main_files - - if not sublinks: - progress_bar.progress(1.0) - return all_files - - # Process each sublink - for i, sublink in enumerate(sublinks, 1): - progress = i / total_links - progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") - progress_bar.progress(progress) - - try: - # Check if this is a direct download link - if is_download_link(sublink): - # For download links, just add the link directly - normalized_url = normalize_download_url(sublink) - - # Skip if already visited - if normalized_url in self.visited_urls: - continue - - # Mark as visited - self.visited_urls.add(normalized_url) - - # Get file size if possible - size_str = await self.get_file_size(normalized_url) - - # Get filename, with fallback to domain-based name - filename = os.path.basename(urlparse(normalized_url).path) - if not filename or filename == '/' or '?' 
in filename: - domain = get_domain(normalized_url) - ext = '.pdf' # Default extension - for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']: - if common_ext in normalized_url.lower(): - ext = common_ext - break - filename = f"file_from_{domain}{ext}" - - # Add file to results - all_files.append({ - 'url': normalized_url, - 'download_url': normalized_url, - 'filename': filename, - 'size': size_str, - 'metadata': {} - }) - file_count_text.text(f"Found {len(all_files)} total files") - continue - - # For regular links, use a longer timeout for ASP.NET pages which can be slower - sub_timeout = timeout * 2 if is_aspnet else timeout - - # Skip already visited URLs - if sublink in self.visited_urls: - continue - - # Extract files from sublink - sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) - all_files.extend(sub_files) - file_count_text.text(f"Found {len(all_files)} total files") - except Exception as e: - logger.warning(f"Error processing sublink {sublink}: {e}") - - # Deduplicate files - seen_urls = set() - unique_files = [] - for f in all_files: - if f['url'] not in seen_urls: - seen_urls.add(f['url']) - unique_files.append(f) - - final_count = len(unique_files) - progress_text.text(f"Deep search complete!") - file_count_text.text(f"Found {final_count} unique files") - progress_bar.progress(1.0) - return unique_files - - except Exception as e: - logger.error(f"Deep search error: {e}") - progress_text.text(f"Error during deep search: {str(e)}") - return [] - - finally: - await asyncio.sleep(2) - if not st.session_state.get('keep_progress', False): - progress_text.empty() - progress_bar.empty() - -# -------------------- Main App -------------------- -def main(): - - # Custom CSS for better appearance - st.markdown(""" - - """, unsafe_allow_html=True) - - # Initialize session state for storing files - if 'files' not in st.session_state: - st.session_state.files = [] - if 'downloaded_paths' not in st.session_state: - st.session_state.downloaded_paths = [] - if 'download_complete' not in st.session_state: - st.session_state.download_complete = False - if 'selected_tab' not in st.session_state: - st.session_state.selected_tab = 0 - if 'rag_search' not in st.session_state: - st.session_state.rag_search = EnhancedRAGSearch() - if 'keep_progress' not in st.session_state: - st.session_state.keep_progress = False - if 'google_credentials' not in st.session_state: - st.session_state.google_credentials = None - if 'mode' not in st.session_state: - st.session_state.mode = "Standard" - if 'use_proxy' not in st.session_state: - st.session_state.use_proxy = False - if 'proxy_string' not in st.session_state: - st.session_state.proxy_string = None - if 'stealth_mode' not in st.session_state: - st.session_state.stealth_mode = True - - # ============================ - # SIDEBAR - # ============================ - with st.sidebar: - st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50) - st.markdown("", unsafe_allow_html=True) - - # Mode Selection - st.markdown("", unsafe_allow_html=True) - - # Quick Settings - st.markdown("", unsafe_allow_html=True) - - # Google Drive Integration - st.markdown("", unsafe_allow_html=True) - - # Preset buttons for common EDU sites - if st.session_state.mode == "Education Mode": - st.markdown("", unsafe_allow_html=True) - - # Tool status - st.markdown("", unsafe_allow_html=True) - - # App info - st.markdown("", unsafe_allow_html=True) - - # ============================ - # MAIN CONTENT AREA - # 
============================ - - # Header section - col1, col2 = st.columns([5, 1]) - with col1: - st.markdown("
Advanced File Downloader
", unsafe_allow_html=True) - with col2: - st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70) - - mode_descriptions = { - "Standard": "A versatile tool for discovering and downloading files from any website.", - "Education Mode": "Optimized for educational resources, exams, and academic materials.", - "Research Mode": "Focused on research papers, datasets, and academic publications.", - "Media Mode": "Enhanced for finding and downloading images, videos, and audio files." - } - - st.markdown(f"
{mode_descriptions[st.session_state.mode]}
", unsafe_allow_html=True) - - # Main tabs - tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"]) - - # Tab 1: Search & Download - with tabs[0]: - st.markdown("
Find and Download Files
", unsafe_allow_html=True) - - col1, col2 = st.columns([3, 1]) - with col1: - url = st.text_input("Enter a URL to search for downloadable files:", - placeholder="e.g., https://example.com/resources", - value=st.session_state.get('preset_url', '')) - with col2: - # Initialize search_method with either session state or default value - initial_search_method = st.session_state.get('search_method', "Deep Search") - search_method = st.selectbox("Search Method", - ["Deep Search", "Quick Search", "Exam Site Mode"], - index=["Deep Search", "Quick Search", "Exam Site Mode"].index(initial_search_method)) - # Update session state when changed - if search_method != st.session_state.get('search_method'): - st.session_state.search_method = search_method - - # Advanced options in an expander - with st.expander("Search Options", expanded=False): - col1, col2, col3 = st.columns(3) - with col1: - depth = st.slider("Search Depth", min_value=1, max_value=5, value=2, - help="Higher values will search more links but take longer") - prioritize_pdfs = st.checkbox("Prioritize PDFs", - value=st.session_state.get('prioritize_pdfs', True), - help="Focus on finding PDF files first") - with col2: - timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60) - follow_subdomains = st.checkbox("Follow Subdomains", value=True, - help="Include links from subdomains in the search") - with col3: - # Default extensions based on mode - default_extensions = { - "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip", - "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx", - "Research Mode": ".pdf,.txt,.csv,.json,.xlsx", - "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov" - } - - custom_extensions = st.text_area( - "Custom File Extensions", - value=st.session_state.get('custom_extensions', default_extensions[st.session_state.mode]), - help="Comma-separated list of file extensions to look for" - ) - - # Update session state when extensions changed - if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions: - st.session_state.custom_extensions = custom_extensions - - search_col1, search_col2 = st.columns([4, 1]) - with search_col1: - search_button = st.button("๐Ÿ” Start Search", use_container_width=True) - with search_col2: - clear_button = st.button("๐Ÿงน Clear Results", use_container_width=True) - - # File results section - if st.session_state.files: - st.markdown("
Found Files
", unsafe_allow_html=True) - - # File filtering options - filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1]) - with filter_col1: - file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.") - with filter_col2: - sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"]) - with filter_col3: - show_only_pdfs = st.checkbox("PDFs Only", value=False) - - # Sort files based on selection - sorted_files = list(st.session_state.files) - if sort_option == "Name": - sorted_files.sort(key=lambda x: x['filename']) - elif sort_option == "Size (Largest)": - # Convert size strings to comparable values - def parse_size(size_str): - if 'Unknown' in size_str: - return 0 - try: - value = float(size_str.split(' ')[0]) - unit = size_str.split(' ')[1] - multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4} - return value * multipliers.get(unit, 0) - except: - return 0 - - sorted_files.sort(key=lambda x: parse_size(x['size']), reverse=True) - elif sort_option == "Size (Smallest)": - def parse_size(size_str): - if 'Unknown' in size_str: - return float('inf') - try: - value = float(size_str.split(' ')[0]) - unit = size_str.split(' ')[1] - multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4} - return value * multipliers.get(unit, 0) - except: - return float('inf') - - sorted_files.sort(key=lambda x: parse_size(x['size'])) - - # File list with selection - file_container = st.container() - with file_container: - selected_files = [] - displayed_files = [] - - for i, file in enumerate(sorted_files): - # Apply filters - if file_filter and file_filter.lower() not in file['filename'].lower(): - continue - if show_only_pdfs and not file['filename'].lower().endswith('.pdf'): - continue - - displayed_files.append(i) - with st.container(): - col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1]) - with col1: - selected = st.checkbox("", key=f"select_{i}", value=True) - if selected: - selected_files.append(i) - with col2: - file_icon = "๐Ÿ“„" - if file['filename'].lower().endswith('.pdf'): - file_icon = "๐Ÿ“" - elif file['filename'].lower().endswith(('.doc', '.docx')): - file_icon = "๐Ÿ“‹" - elif file['filename'].lower().endswith(('.xls', '.xlsx')): - file_icon = "๐Ÿ“Š" - elif file['filename'].lower().endswith(('.ppt', '.pptx')): - file_icon = "๐Ÿ–ผ๏ธ" - elif file['filename'].lower().endswith(('.jpg', '.png', '.gif')): - file_icon = "๐Ÿ–ผ๏ธ" - elif file['filename'].lower().endswith(('.mp3', '.wav')): - file_icon = "๐Ÿ”Š" - elif file['filename'].lower().endswith(('.mp4', '.avi', '.mov')): - file_icon = "๐ŸŽฌ" - - st.markdown(f"**{file_icon} {file['filename']}**") - st.markdown(f"{file['url'][:60]}...", unsafe_allow_html=True) - with col3: - st.markdown(f"**Size:** {file['size']}") - with col4: - st.button("Preview", key=f"preview_{i}") - - st.divider() - - if not displayed_files: - st.info("No files match your current filters. 
Try adjusting your search criteria.") - - # Download options - if selected_files: - col1, col2 = st.columns(2) - with col1: - download_dir = st.text_input("Download Directory", value="downloads") - with col2: - download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True) - - download_col1, download_col2, download_col3 = st.columns([3, 1, 1]) - with download_col1: - download_button = st.button("โฌ‡๏ธ Download Selected Files", use_container_width=True) - with download_col2: - google_drive_button = st.button("๐Ÿ“ค Upload to Drive", - use_container_width=True, - disabled=not st.session_state.google_credentials) - with download_col3: - select_all = st.button("Select All Files", use_container_width=True) - - # Handle select all button - if select_all: - for i in displayed_files: - st.session_state[f"select_{i}"] = True - st.rerun() - - # Download progress/results - if st.session_state.download_complete: - st.success(f"โœ… Downloaded {len(st.session_state.downloaded_paths)} files successfully!") - download_links = [] - for path in st.session_state.downloaded_paths: - with open(path, "rb") as f: - file_content = f.read() - file_name = os.path.basename(path) - download_links.append((file_name, file_content)) - - if len(download_links) > 0: - if download_option == "ZIP Archive": - # Create ZIP archive for download - zip_path = create_zip_file(st.session_state.downloaded_paths, download_dir) - with open(zip_path, "rb") as f: - zip_content = f.read() - st.download_button("๐Ÿ“ฆ Download ZIP Archive", - zip_content, - file_name=os.path.basename(zip_path), - mime="application/zip") - else: - # Show individual file download links - st.markdown("
Download Files
", unsafe_allow_html=True) - - # Create a grid of download buttons - cols = st.columns(3) - for idx, (name, content) in enumerate(download_links): - mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream' - with cols[idx % 3]: - st.download_button( - f"๐Ÿ“„ {name}", - content, - file_name=name, - mime=mime_type, - key=f"dl_{name}", - use_container_width=True - ) - - # Tab 2: Local File Search - with tabs[1]: - st.markdown("
Search Downloaded Files
", unsafe_allow_html=True) - st.write("Upload files to search through their content with AI-powered semantic search.") - - # File upload - uploaded_files = st.file_uploader("Upload documents for search", - accept_multiple_files=True, - type=['pdf', 'docx', 'txt', 'csv', 'json']) - - if uploaded_files: - # Build search index on upload - col1, col2 = st.columns([4, 1]) - with col1: - use_transformer = st.checkbox("Use AI Transformer Model", value=HAVE_TRANSFORMERS, - help="Uses advanced AI for more accurate semantic search (if available)") - with col2: - if st.button("Build Search Index", use_container_width=True): - with st.spinner("Processing files and building search index..."): - files_added = 0 - for uploaded_file in uploaded_files: - file_info = { - 'filename': uploaded_file.name, - 'url': f'local://{uploaded_file.name}', - 'size': humanize_file_size(uploaded_file.size) - } - success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info) - if success: - files_added += 1 - - if files_added > 0: - index_built = st.session_state.rag_search.build_index() - if index_built: - st.success(f"โœ… Successfully indexed {files_added} files!") - else: - st.error("Failed to build search index.") - else: - st.warning("No valid text could be extracted from the files.") - - # Search interface - st.markdown("
Search Files
", unsafe_allow_html=True) - - col1, col2 = st.columns([4, 1]) - with col1: - query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change") - with col2: - expand_query = st.checkbox("Auto-expand query", value=True, - help="Automatically add related terms to your search") - - col1, col2 = st.columns([4, 1]) - with col1: - if st.button("๐Ÿ” Search Documents", use_container_width=True): - if not query: - st.warning("Please enter a search query") - else: - with st.spinner("Searching..."): - results = st.session_state.rag_search.search(query, top_k=5, search_chunks=True) - - if results: - st.markdown(f"**Found {len(results)} relevant documents:**") - for i, result in enumerate(results): - with st.container(): - st.markdown(f"
", unsafe_allow_html=True) - st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})") - - if result.get('chunk_preview'): - st.markdown("**Matching content:**") - st.text(result['chunk_preview']) - - st.markdown("
", unsafe_allow_html=True) - else: - st.info("No matching documents found. Try a different query.") - with col2: - num_results = st.number_input("Max results", min_value=1, max_value=20, value=5) - - # Quick search tips - with st.expander("Search Tips", expanded=False): - st.markdown(""" - ### Effective Search Tips - - - **Be specific** with your queries for more accurate results - - **Try different phrasings** if you don't get the results you expect - - Use **quotation marks** for exact phrase matching - - For **complex topics**, break down your search into multiple queries - - **Combine related terms** to improve recall - - The search engine uses advanced algorithms to understand the semantic meaning of your query, - not just keyword matching. - """) - - # Tab 3: Advanced Configuration - with tabs[2]: - st.markdown("
Advanced Settings
", unsafe_allow_html=True) - - config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"]) - - # Browser Settings tab - with config_tabs[0]: - col1, col2 = st.columns(2) - with col1: - use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode, - help="Makes browser harder to detect as automated, but may be slower") - - handle_captchas = st.checkbox("Handle Captchas Automatically", value=False, - help="Attempt to solve simple captchas automatically") - - download_timeout = st.slider("Download Timeout (seconds)", - min_value=30, max_value=600, value=300, - help="Maximum time to wait for downloads to complete") - with col2: - user_agent = st.selectbox("User Agent", USER_AGENTS, index=0, - help="Browser identity to use when accessing websites") - - save_screenshots = st.checkbox("Save Browser Screenshots", value=False, - help="Save screenshots when errors occur for debugging") - - browser_lang = st.selectbox("Browser Language", - ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"], - index=0) - - if st.button("Update Browser Settings"): - st.session_state.stealth_mode = use_stealth - st.success("Browser settings updated!") - - # Dependency installation section - st.markdown("
Dependencies
", unsafe_allow_html=True) - if st.button("Install Playwright Dependencies"): - with st.spinner("Installing dependencies..."): - install_playwright_dependencies() - - # Proxy Configuration tab - with config_tabs[1]: - proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy, - help="Route requests through a proxy server for anonymity or bypassing restrictions") - - if proxy_enabled: - proxy_col1, proxy_col2 = st.columns(2) - with proxy_col1: - proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"]) - proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1") - with proxy_col2: - proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080") - proxy_auth = st.text_input("Proxy Authentication (optional)", - placeholder="username:password", type="password") - - st.markdown("

Proxy Rotation
", unsafe_allow_html=True) - use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False, - help="Automatically rotate between multiple proxies for better anonymity") - - if use_proxy_rotation: - proxy_list = st.text_area("Proxy List (one per line)", - placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080") - rotation_interval = st.slider("Rotation Interval (requests)", - min_value=1, max_value=50, value=10, - help="How often to switch proxies") - - if st.button("Save Proxy Configuration"): - # Construct the proxy string - proxy_string = None - if proxy_enabled and proxy_host and proxy_port: - proxy_prefix = f"{proxy_type.lower()}://" - proxy_auth_str = f"{proxy_auth}@" if proxy_auth else "" - proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}" - - # Update session state - st.session_state.use_proxy = proxy_enabled - st.session_state.proxy_string = proxy_string - - # Configure proxy rotation if enabled - if use_proxy_rotation and proxy_list: - PROXY_ROTATION_CONFIG["enabled"] = True - PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval - PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()] - - st.success("Proxy configuration updated!")
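# Illustrative sketch (an assumption, not code from this app): one way a caller could
# consume the PROXY_ROTATION_CONFIG values saved above, switching to the next proxy
# every "rotation_interval" requests.
def pick_rotated_proxy(request_count):
    if not PROXY_ROTATION_CONFIG["enabled"] or not PROXY_ROTATION_CONFIG["proxies"]:
        return None
    interval = max(1, PROXY_ROTATION_CONFIG["rotation_interval"])
    idx = (request_count // interval) % len(PROXY_ROTATION_CONFIG["proxies"])
    return PROXY_ROTATION_CONFIG["proxies"][idx]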
# Download Options tab - with config_tabs[2]: - col1, col2 = st.columns(2) - with col1: - st.markdown("Download Behavior
", unsafe_allow_html=True) - - skip_existing = st.checkbox("Skip Existing Files", value=True, - help="Don't download files that already exist locally") - - auto_rename = st.checkbox("Auto-Rename Duplicates", value=True, - help="Automatically rename files instead of overwriting") - - verify_downloads = st.checkbox("Verify Downloads", value=True, - help="Check file integrity after download") - - max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3, - help="Number of times to retry failed downloads")
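# Illustrative sketch (an assumption, not code from this app): how a download routine
# could honor the "Skip Existing Files" and "Auto-Rename Duplicates" options above,
# using the os module already imported at the top of the file.
def resolve_target_path(directory, filename, skip_existing=True, auto_rename=True):
    path = os.path.join(directory, filename)
    if not os.path.exists(path):
        return path
    if skip_existing:
        return None  # caller should skip this download
    if auto_rename:
        stem, ext = os.path.splitext(filename)
        counter = 1
        while os.path.exists(path):
            path = os.path.join(directory, f"{stem}_{counter}{ext}")
            counter += 1
    return path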
with col2: - st.markdown("File Organization
", unsafe_allow_html=True) - - auto_organize = st.checkbox("Auto-Organize Files", value=True, - help="Automatically organize files by type") - - default_dir = st.text_input("Default Download Directory", value="downloads", - help="Default location to save downloaded files") - - org_by_domain = st.checkbox("Organize by Domain", value=False, - help="Create subdirectories based on source domains") - - org_by_type = st.checkbox("Organize by File Type", value=False, - help="Create subdirectories based on file types") - - if st.button("Save Download Settings"): - st.session_state.download_settings = { - "skip_existing": skip_existing, - "auto_rename": auto_rename, - "verify_downloads": verify_downloads, - "max_retries": max_retries, - "auto_organize": auto_organize, - "default_dir": default_dir, - "org_by_domain": org_by_domain, - "org_by_type": org_by_type - } - st.success("Download settings saved!") - - # System tab - with config_tabs[3]: - col1, col2 = st.columns(2) - with col1: - st.markdown("

Memory & Performance
", unsafe_allow_html=True) - - max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3, - help="Maximum number of simultaneous downloads") - - memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024, - help="Maximum memory to use for file processing") - - processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2, - help="Number of threads to use for file processing")
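# Illustrative sketch (an assumption, not code from this app): applying the
# "Max Concurrent Downloads" value with an asyncio.Semaphore, if the sequential
# download loop further below were made concurrent. manager.download_file and
# get_domain are the app's own helpers, called the same way in the download handler.
async def download_all(manager, files, directory, max_concurrent):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def download_one(file_info):
        async with semaphore:
            return await manager.download_file(file_info, directory, get_domain(file_info['url']))

    return await asyncio.gather(*(download_one(f) for f in files))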
with col2: - st.markdown("Logs & Diagnostics
", unsafe_allow_html=True) - - log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1, - help="Detail level for application logs") - - save_debug_info = st.checkbox("Save Debug Information", value=False, - help="Save detailed information about program execution") - - log_dir = st.text_input("Log Directory", value="logs", - help="Directory to save log files") - - if st.button("Apply System Settings"): - st.session_state.system_settings = { - "max_concurrent": max_concurrent, - "memory_limit": memory_limit, - "processing_threads": processing_threads, - "log_level": log_level, - "save_debug_info": save_debug_info, - "log_dir": log_dir - } - # Update logging configuration - log_level_num = getattr(logging, log_level) - logging.getLogger().setLevel(log_level_num) - st.success("System settings applied!")
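# Illustrative sketch (an assumption, not code from this app): the handler above only
# changes the root logger's level; if the "Log Directory" value is also meant to receive
# a log file, a file handler could be attached like this.
def attach_file_handler(log_dir, level_name):
    os.makedirs(log_dir, exist_ok=True)
    handler = logging.FileHandler(os.path.join(log_dir, "app.log"))
    handler.setLevel(getattr(logging, level_name))
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logging.getLogger().addHandler(handler)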
# Reset application button - st.markdown("Application Control
", unsafe_allow_html=True) - reset_col1, reset_col2 = st.columns([1, 3]) - with reset_col1: - if st.button("Reset Application", use_container_width=True): - for key in list(st.session_state.keys()): - if key != 'google_credentials': # Preserve Google auth - del st.session_state[key] - st.success("Application has been reset!") - st.rerun() - with reset_col2: - st.info("This will clear all search results, downloaded files, and reset settings to defaults.") - - # Tab 4: Help - with tabs[3]: - st.markdown("
Help & Documentation
", unsafe_allow_html=True) - - help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"]) - - with help_tabs[0]: - st.markdown(""" - ### Getting Started - - 1. **Enter a URL** on the Search & Download tab - 2. Select a **Search Method**: - - **Deep Search**: Thorough but slower - - **Quick Search**: Fast but may miss some files - - **Exam Site Mode**: Optimized for educational resource sites - 3. Click **Start Search** to find downloadable files - 4. Select files you want to download - 5. Click **Download Selected Files** - - #### Using Different Modes - - Select a mode from the sidebar to optimize the tool for different use cases: - - - **Standard Mode**: Balanced for general use - - **Education Mode**: Optimized for finding academic materials - - **Research Mode**: Better for research papers and datasets - - **Media Mode**: Enhanced for finding images, videos, and audio - - For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials. - """) - - with help_tabs[1]: - st.markdown(""" - ### Advanced Features - - - **Local File Search**: Upload files and search through their content using the enhanced RAG search - - **Custom Extensions**: Specify additional file types to look for beyond the default set - - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers - - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity - - **Google Drive Integration**: Upload downloaded files directly to your Google Drive - - #### Search Tips - - - For educational sites, include specific terms like "exam", "test", "paper" in the URL - - When using Local File Search, try different variations of your query for better results - - Use filtering and sorting options to find the most relevant files quickly - - #### File Organization - - You can configure automatic file organization in the Advanced Configuration tab: - - - **Organize by Domain**: Creates folders based on the source website - - **Organize by File Type**: Separates files into folders by their extension - - **Auto-Rename**: Prevents overwriting existing files with same names - """) - - with help_tabs[2]: - st.markdown(""" - ### Troubleshooting - - #### Common Issues - - - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions - - **Downloads failing**: Check if the site requires authentication or uses captchas - - **Slow performance**: Reduce search depth or disable stealth mode for faster results - - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings - - #### Captcha Issues - - Some websites use captchas to prevent automated access. If you encounter captchas: - - 1. Try using a different proxy - 2. Enable "Handle Captchas Automatically" for simple captchas - 3. For complex captchas, you may need to manually access the site first - - #### Proxy Problems - - If you're having issues with proxies: - - 1. Verify your proxy is working with an external tool - 2. Check that you've entered the correct format (http://host:port) - 3. Some websites may block known proxy IPs - - #### Memory Usage - - If the application is using too much memory: - - 1. Reduce the "Memory Limit" in System settings - 2. Process fewer files at once - 3. 
Use lower search depth values - """) - - with help_tabs[3]: - st.markdown(""" - ### About This Tool - - **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources. - - #### Key Features - - - **Smart Discovery**: Finds downloadable files even when they're not directly linked - - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques - - **Educational Focus**: Specialized detection for exam papers and academic resources - - **Stealth Capabilities**: Avoids detection by anti-scraping measures - - #### Technical Details - - This tool uses: - - - **Playwright**: For browser automation and stealth capabilities - - **Sentence Transformers**: For AI-powered semantic search - - **Streamlit**: For the user interface - - **Google Drive API**: For cloud integration - - #### Credits - - Created with Python, Streamlit, Playwright, and various AI libraries. - - For issues or suggestions, please contact the developer. - - Version 2.0 - March 2025 - """) - - # Handle search and download actions - if search_button and url: - # Reset files and downloaded paths - st.session_state.files = [] - st.session_state.downloaded_paths = [] - st.session_state.download_complete = False - - # Clear the preset URL if it was used - if 'preset_url' in st.session_state: - st.session_state.preset_url = '' - - # Prepare custom extensions - custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()] - - # Configure proxy from session state - proxy_string = st.session_state.proxy_string if st.session_state.use_proxy else None - - # Set up proxy rotation if enabled - if 'use_proxy_rotation' in locals() and use_proxy_rotation and proxy_list: - PROXY_ROTATION_CONFIG["enabled"] = True - PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval - PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()] - - # Configure search parameters based on method - sublink_limit = 5000 if search_method == "Deep Search" else 1000 - search_depth = depth if search_method == "Deep Search" else 1 - is_exam_site = search_method == "Exam Site Mode" - - # Execute the search asynchronously - async def run_search(): - async with DownloadManager( - use_proxy=st.session_state.use_proxy, - proxy=proxy_string, - use_stealth=st.session_state.stealth_mode, - proxy_rotation=PROXY_ROTATION_CONFIG["enabled"] - ) as manager: - # For exam sites, use specialized approach - if is_exam_site: - st.session_state.keep_progress = True - edu_links = await manager.get_edu_exam_links(url) - all_files = [] - - progress_text = st.empty() - progress_bar = st.progress(0) - - # Process each exam link - for i, link in enumerate(edu_links): - progress = (i+1) / max(1, len(edu_links)) - progress_text.text(f"Processing exam link {i+1}/{len(edu_links)}: {link}") - progress_bar.progress(progress) - - files = await manager.extract_downloadable_files(link, custom_ext_list) - all_files.extend(files) - - st.session_state.files = all_files - progress_text.empty() - progress_bar.empty() - st.session_state.keep_progress = False - - else: - # Use general search method - files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout) - st.session_state.files = files - - # Run the search - asyncio.run(run_search()) - st.rerun() - - # Handle download button - if 'download_button' in locals() and download_button and selected_files: - # Create download directory - 
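    # The download handler below mirrors the search handler above: it creates the target
    # directory, resets the download state in st.session_state, maps the selected checkbox
    # indices back to st.session_state.files, and runs an async loop that calls
    # manager.download_file for each selection while updating a progress bar.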
os.makedirs(download_dir, exist_ok=True) - - # Reset download state - st.session_state.downloaded_paths = [] - st.session_state.download_complete = False - - # Get selected files - files_to_download = [st.session_state.files[i] for i in selected_files] - - # Execute the download asynchronously - async def run_download(): - async with DownloadManager( - use_proxy=st.session_state.use_proxy, - proxy=st.session_state.proxy_string, - use_stealth=st.session_state.stealth_mode - ) as manager: - download_progress = st.progress(0) - status_text = st.empty() - - for i, file_info in enumerate(files_to_download): - progress = (i) / len(files_to_download) - status_text.text(f"Downloading {i+1}/{len(files_to_download)}: {file_info['filename']}") - download_progress.progress(progress) - - downloaded_path = await manager.download_file( - file_info, - download_dir, - get_domain(file_info['url']) - ) - - if downloaded_path: - st.session_state.downloaded_paths.append(downloaded_path) - - download_progress.progress(1.0) - status_text.text(f"Downloaded {len(st.session_state.downloaded_paths)}/{len(files_to_download)} files successfully!") - st.session_state.download_complete = True - - # Run the download - asyncio.run(run_download()) - st.rerun() - - # Handle Google Drive upload - if 'google_drive_button' in locals() and google_drive_button and st.session_state.google_credentials and st.session_state.downloaded_paths: - with st.spinner("Uploading to Google Drive..."): - drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_credentials) - - # Create folder if it doesn't exist - folder_id = None - folder_name = st.session_state.drive_folder if 'drive_folder' in st.session_state else "File Downloader" - - # Check if folder exists - query = f"name='{folder_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false" - results = drive_service.files().list(q=query, spaces='drive', fields='files(id)').execute() - items = results.get('files', []) - - if not items: - # Create folder - folder_id = create_drive_folder(drive_service, folder_name) - else: - folder_id = items[0]['id'] - - # Upload each file - upload_progress = st.progress(0) - status_text = st.empty() - uploaded_count = 0 - - for i, path in enumerate(st.session_state.downloaded_paths): - progress = i / len(st.session_state.downloaded_paths) - status_text.text(f"Uploading {i+1}/{len(st.session_state.downloaded_paths)}: {os.path.basename(path)}") - upload_progress.progress(progress) - - result = google_drive_upload(path, st.session_state.google_credentials, folder_id) - if isinstance(result, str) and not result.startswith("Error"): - uploaded_count += 1 - - upload_progress.progress(1.0) - status_text.text(f"Uploaded {uploaded_count}/{len(st.session_state.downloaded_paths)} files to Google Drive folder '{folder_name}'") - - st.success(f"โœ… Files uploaded to Google Drive successfully!") - - # Handle clear button - if clear_button: - st.session_state.files = [] - st.session_state.downloaded_paths = [] - st.session_state.download_complete = False - if 'preset_url' in st.session_state: - st.session_state.preset_url = '' - st.rerun() - -if __name__ == "__main__": - main() \ No newline at end of file
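Note on the file-list sorting in the Search & Download tab above: the "Size (Largest)" and "Size (Smallest)" branches define parse_size twice, differing only in the fallback returned for unknown sizes. A minimal consolidation sketch (assuming size strings of the form "1.5 MB", the format the two inline helpers already expect):

def parse_size(size_str, unknown_value=0.0):
    """Convert a human-readable size string such as '1.5 MB' to bytes."""
    if not size_str or 'Unknown' in size_str:
        return unknown_value
    try:
        value_str, unit = size_str.split(' ')[:2]
        multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
        return float(value_str) * multipliers.get(unit, 0)
    except (ValueError, IndexError):
        return unknown_value

# Usage matching the two sort options:
#   sorted_files.sort(key=lambda x: parse_size(x['size']), reverse=True)       # Size (Largest)
#   sorted_files.sort(key=lambda x: parse_size(x['size'], float('inf')))       # Size (Smallest)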