diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -1,20 +1,79 @@
+import streamlit as st
+# This MUST be the first Streamlit command
+st.set_page_config(page_title="Advanced File Downloader", layout="wide", page_icon="📁")
+
+# Core imports
import os
-import json
+import subprocess
+from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import asyncio
-import streamlit as st
import logging
+from urllib.parse import urlparse, urljoin, unquote, parse_qs, quote
+import re
+from pathlib import Path
+from io import BytesIO
+import random
+from bs4 import BeautifulSoup
+from PyPDF2 import PdfReader
+import zipfile
+import tempfile
+import mimetypes
+import requests
+import datetime
+import traceback
+import base64
+import shutil
+import json
+import time
+from PIL import Image
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+import google_auth_oauthlib.flow
+import googleapiclient.discovery
+import google.auth.transport.requests
+import googleapiclient.http
-# Configure logging
+# Enhanced RAG search imports
+import nltk
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import docx2txt
+
+# Try to import sentence-transformers for better embeddings
+try:
+ from sentence_transformers import SentenceTransformer
+ HAVE_TRANSFORMERS = True
+except ImportError:
+ HAVE_TRANSFORMERS = False
+
+# Try to download NLTK data if not already present
+try:
+ nltk.data.find('tokenizers/punkt')
+except LookupError:
+ try:
+ nltk.download('punkt', quiet=True)
+    except Exception:
+ pass
+
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    try:
+        nltk.download('stopwords', quiet=True)
+    except Exception:
+        pass
+
+# Build the stopword set whether or not the corpus was already present, falling back
+# to a small built-in list if the NLTK data is unavailable.
+try:
+    from nltk.corpus import stopwords
+    STOPWORDS = set(stopwords.words('english'))
+except Exception:
+    STOPWORDS = set(['the', 'and', 'a', 'in', 'to', 'of', 'is', 'it', 'that', 'for', 'with', 'as', 'on', 'by'])
+
+# -------------------- Logging Setup --------------------
logging.basicConfig(
level=logging.INFO,
- format='%(asctime)s - %(levelname)s - %(message)s',
- handlers=[
- logging.FileHandler('app.log'),
- logging.StreamHandler()
- ]
+ format='%(asctime)s - %(levelname)s - %(message)s'
)
+logger = logging.getLogger(__name__)
-# Load Google OAuth config from environment variables
+# -------------------- Google OAuth Config --------------------
GOOGLE_OAUTH_CONFIG = {
"web": {
"client_id": os.environ.get("GOOGLE_CLIENT_ID"),
@@ -27,23 +86,3785 @@ GOOGLE_OAUTH_CONFIG = {
}
}
-# Setup the UI
-st.set_page_config(page_title="Advanced File Downloader", layout="wide", page_icon="๐")
+# -------------------- Stealth and UA Settings --------------------
+# Extended user agent list for better variety
+USER_AGENTS = [
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
+ 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
+ 'Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0'
+]
-# Import the core components (still keeping modular organization)
-from utils import USER_AGENTS, STEALTH_SETTINGS, PROXY_ROTATION_CONFIG
-from utils import (
- get_random_user_agent, sizeof_fmt, create_zip_file, humanize_file_size, get_domain,
- is_download_link, normalize_download_url, detect_captcha, show_user_friendly_error
-)
-from google_drive import (
- get_google_auth_url, exchange_code_for_credentials, google_drive_upload, create_drive_folder
-)
-from download_manager import DownloadManager
-from rag_search import EnhancedRAGSearch
+# Stealth browser settings
+STEALTH_SETTINGS = {
+ # Hardware features to modify/disable
+ "hardware_concurrency": 4,
+ "device_memory": 8,
+ # Browser features to enable/disable
+ "webgl_vendor": "Google Inc. (Intel)",
+ "webgl_renderer": "Intel Iris OpenGL Engine",
+ "languages": ["en-US", "en"],
+ "disable_webrtc": True,
+ # Additional timing randomization
+ "navigator_platform": "Win32",
+ "touch_support": False
+}
+
+# Proxy rotation configuration (if using multiple proxies)
+PROXY_ROTATION_CONFIG = {
+ "enabled": False, # Set to True to enable rotation
+ "rotation_interval": 10, # Rotate every 10 requests
+ "proxies": [] # Will be populated from the UI if needed
+}
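+# Illustrative sketch of how this config might be populated at runtime (the proxy
+# addresses are placeholders only):
+#   PROXY_ROTATION_CONFIG["enabled"] = True
+#   PROXY_ROTATION_CONFIG["proxies"] = ["http://proxy1.example.com:8080", "http://proxy2.example.com:3128"]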
+
+# -------------------- Enhanced RAG Search with Small LLM --------------------
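+# Illustrative end-to-end usage of the class below (a sketch; the file bytes and the
+# metadata dict come from the download pipeline, which uses the same 'url'/'filename' keys):
+#   rag = EnhancedRAGSearch()
+#   rag.add_file(pdf_bytes, {'filename': 'past_exam.pdf', 'url': 'https://example.com/past_exam.pdf'})
+#   if rag.build_index():
+#       for hit in rag.search("past exam papers", top_k=5):
+#           print(hit['rank'], hit['score'], hit['file_info']['filename'])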
+class EnhancedRAGSearch:
+ def __init__(self):
+ self.file_texts = []
+ self.chunks = [] # Document chunks for more targeted search
+ self.chunk_metadata = [] # Metadata for each chunk
+ self.file_metadata = []
+ self.languages = []
+ self.model = None
+
+ # Try to load the sentence transformer model if available
+ if HAVE_TRANSFORMERS:
+ try:
+ # Use a small, efficient model
+ self.model = SentenceTransformer('all-MiniLM-L6-v2')
+ self.use_transformer = True
+ logger.info("Using sentence-transformers for RAG")
+ except Exception as e:
+ logger.warning(f"Error loading sentence-transformer: {e}")
+ self.use_transformer = False
+ else:
+ self.use_transformer = False
+
+ # Fallback to TF-IDF if transformers not available
+ if not self.use_transformer:
+ self.vectorizer = TfidfVectorizer(
+ stop_words='english',
+ ngram_range=(1, 2), # Use bigrams for better context
+ max_features=15000, # Use more features for better representation
+ min_df=1 # Include rare terms
+ )
+
+ self.vectors = None
+ self.chunk_vectors = None
+
+ def add_file(self, file_data, file_info):
+ """Add a file to the search index with improved processing"""
+ file_ext = os.path.splitext(file_info['filename'])[1].lower()
+ text = self.extract_text(file_data, file_ext)
+
+ if text:
+ # Store the whole document text
+ self.file_texts.append(text)
+ self.file_metadata.append(file_info)
+
+ # Try to detect language
+ try:
+ # Simple language detection based on stopwords
+ words = re.findall(r'\b\w+\b', text.lower())
+ english_stopwords_ratio = len([w for w in words[:100] if w in STOPWORDS]) / max(1, len(words[:100]))
+ lang = 'en' if english_stopwords_ratio > 0.2 else 'unknown'
+ self.languages.append(lang)
+            except Exception:
+ self.languages.append('en') # Default to English
+
+ # Create chunks for more granular search
+ chunks = self.create_chunks(text)
+ for chunk in chunks:
+ self.chunks.append(chunk)
+ self.chunk_metadata.append({
+ 'file_info': file_info,
+ 'chunk_size': len(chunk),
+ 'file_index': len(self.file_texts) - 1
+ })
+
+ return True
+ return False
+
+ def create_chunks(self, text, chunk_size=1000, overlap=200):
+ """Split text into overlapping chunks for better search precision"""
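+        # With the defaults (chunk_size=1000, overlap=200) the fallback splitter below
+        # advances 800 characters per step, so a 2,500-character document would yield
+        # chunks starting at offsets 0, 800, 1600 and 2400 (illustrative figures).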
+ # Try to use NLTK for sentence-aware chunking
+ try:
+ sentences = nltk.sent_tokenize(text)
+ chunks = []
+ current_chunk = ""
+
+ for sentence in sentences:
+ if len(current_chunk) + len(sentence) <= chunk_size:
+ current_chunk += sentence + " "
+ else:
+ # Add current chunk if it has content
+ if current_chunk:
+ chunks.append(current_chunk.strip())
+
+ # Start new chunk with overlap from previous chunk
+ if len(current_chunk) > overlap:
+ # Find the last space within the overlap region
+ overlap_text = current_chunk[-overlap:]
+ last_space = overlap_text.rfind(' ')
+ if last_space != -1:
+ current_chunk = current_chunk[-(overlap-last_space):] + sentence + " "
+ else:
+ current_chunk = sentence + " "
+ else:
+ current_chunk = sentence + " "
+
+ # Add the last chunk if it has content
+ if current_chunk:
+ chunks.append(current_chunk.strip())
+
+ return chunks
+        except Exception:
+ # Fallback to simpler chunking approach
+ chunks = []
+ for i in range(0, len(text), chunk_size - overlap):
+ chunk = text[i:i + chunk_size]
+ if chunk:
+ chunks.append(chunk)
+ return chunks
+
+ def extract_text(self, file_data, file_ext):
+ """Extract text from different file types with enhanced support"""
+ try:
+ if file_ext.lower() == '.pdf':
+                reader = PdfReader(BytesIO(file_data))
+ text = ""
+ for page in reader.pages:
+ extracted = page.extract_text()
+ if extracted:
+ text += extracted + "\n"
+                # Note: scanned PDFs with no extractable text would need OCR (extra libraries, e.g. pytesseract)
+ return text
+ elif file_ext.lower() in ['.docx', '.doc']:
+ return docx2txt.process(BytesIO(file_data))
+ elif file_ext.lower() in ['.txt', '.csv', '.json', '.html', '.htm']:
+ # Handle both UTF-8 and other common encodings
+                try:
+                    return file_data.decode('utf-8')
+                except UnicodeDecodeError:
+                    for enc in ('latin-1', 'iso-8859-1', 'windows-1252'):
+                        try:
+                            return file_data.decode(enc)
+                        except UnicodeDecodeError:
+                            pass
+                    # Last resort: ignore undecodable bytes
+                    return file_data.decode('utf-8', errors='ignore')
+ elif file_ext.lower() in ['.pptx', '.ppt', '.xlsx', '.xls']:
+ # For these types, we would need additional libraries
+ # For now, return a placeholder message
+ return f"[Content of {file_ext} file - install additional libraries for full text extraction]"
+ else:
+ return ""
+ except Exception as e:
+ logger.error(f"Error extracting text: {e}")
+ return ""
+
+ def build_index(self):
+ """Build both document and chunk search indices"""
+ if not self.file_texts:
+ return False
+
+ try:
+ if self.use_transformer:
+ # Use sentence transformer models for embeddings
+ logger.info("Building document and chunk embeddings with transformer model...")
+ self.vectors = self.model.encode(self.file_texts, show_progress_bar=False)
+
+ # Build chunk-level index if we have chunks
+ if self.chunks:
+ # Process in batches to avoid memory issues
+ batch_size = 32
+ chunk_vectors = []
+ for i in range(0, len(self.chunks), batch_size):
+ batch = self.chunks[i:i+batch_size]
+ batch_vectors = self.model.encode(batch, show_progress_bar=False)
+ chunk_vectors.append(batch_vectors)
+ self.chunk_vectors = np.vstack(chunk_vectors)
+ else:
+ # Build document-level index
+ self.vectors = self.vectorizer.fit_transform(self.file_texts)
+
+ # Build chunk-level index if we have chunks
+ if self.chunks:
+ self.chunk_vectors = self.vectorizer.transform(self.chunks)
+
+ return True
+ except Exception as e:
+ logger.error(f"Error building search index: {e}")
+ return False
+
+ def expand_query(self, query):
+        """Add related terms to the query for better recall (heuristic, dictionary-based stand-in for a small LLM)"""
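+        # Example (illustrative): expand_query("exam") returns the original query plus
+        # related terms from the dictionary below such as "test", "assessment", "quiz"
+        # and "past paper" (order varies because the terms are collected in a set).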
+ # Dictionary of related terms for common keywords
+ expansions = {
+ "exam": ["test", "assessment", "quiz", "paper", "exam paper", "past paper", "past exam"],
+ "test": ["exam", "quiz", "assessment", "paper"],
+ "document": ["file", "paper", "report", "doc", "documentation"],
+ "manual": ["guide", "instruction", "documentation", "handbook"],
+ "tutorial": ["guide", "instructions", "how-to", "lesson"],
+ "article": ["paper", "publication", "journal", "research"],
+ "research": ["study", "investigation", "paper", "analysis"],
+ "book": ["textbook", "publication", "volume", "edition"],
+ "thesis": ["dissertation", "paper", "research", "study"],
+ "report": ["document", "paper", "analysis", "summary"],
+ "assignment": ["homework", "task", "project", "work"],
+ "lecture": ["class", "presentation", "talk", "lesson"],
+ "notes": ["annotations", "summary", "outline", "study material"],
+ "syllabus": ["curriculum", "course outline", "program", "plan"],
+ "paper": ["document", "article", "publication", "exam", "test"],
+ "question": ["problem", "query", "exercise", "inquiry"],
+ "solution": ["answer", "resolution", "explanation", "result"],
+ "reference": ["source", "citation", "bibliography", "resource"],
+ "analysis": ["examination", "study", "evaluation", "assessment"],
+ "guide": ["manual", "instruction", "handbook", "tutorial"],
+ "worksheet": ["exercise", "activity", "handout", "practice"],
+ "review": ["evaluation", "assessment", "critique", "feedback"],
+ "material": ["resource", "content", "document", "information"],
+ "data": ["information", "statistics", "figures", "numbers"]
+ }
+
+        # Heuristic query expansion based on the dictionary above (stands in for a mini-LLM)
+ query_words = re.findall(r'\b\w+\b', query.lower())
+ expanded_terms = set()
+
+ # Directly add expansions from our dictionary
+ for word in query_words:
+ if word in expansions:
+ expanded_terms.update(expansions[word])
+
+ # Add common academic file formats if not already included
+ if any(term in query.lower() for term in ["file", "document", "download", "paper"]):
+ if not any(ext in query.lower() for ext in ["pdf", "docx", "ppt", "excel"]):
+ expanded_terms.update(["pdf", "docx", "pptx", "xlsx"])
+
+ # Add special academic terms when the query seems related to education
+ if any(term in query.lower() for term in ["course", "university", "college", "school", "class"]):
+ expanded_terms.update(["syllabus", "lecture", "notes", "textbook"])
+
+ # Return original query plus expanded terms
+ if expanded_terms:
+ expanded_query = f"{query} {' '.join(expanded_terms)}"
+ logger.info(f"Expanded query: '{query}' -> '{expanded_query}'")
+ return expanded_query
+ return query
+
+ def search(self, query, top_k=5, search_chunks=True):
+ """Enhanced search with both document and chunk-level search"""
+ if self.vectors is None:
+ return []
+
+        # Expand the query with related terms (dictionary-based stand-in for a small LLM)
+ expanded_query = self.expand_query(query)
+
+ try:
+ results = []
+
+ if self.use_transformer:
+ # Transform the query to embedding
+ query_vector = self.model.encode([expanded_query])[0]
+
+ # First search at document level for higher-level matches
+ if self.vectors is not None:
+ # Compute similarities between query and documents
+ doc_similarities = cosine_similarity(
+ query_vector.reshape(1, -1),
+ self.vectors
+ ).flatten()
+
+ top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
+
+ for i, idx in enumerate(top_doc_indices):
+ if doc_similarities[idx] > 0.2: # Threshold to exclude irrelevant results
+ results.append({
+ 'file_info': self.file_metadata[idx],
+ 'score': float(doc_similarities[idx]),
+ 'rank': i+1,
+ 'match_type': 'document',
+ 'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
+ })
+
+ # Then search at chunk level for more specific matches if enabled
+ if search_chunks and self.chunk_vectors is not None:
+ # Compute similarities between query and chunks
+ chunk_similarities = cosine_similarity(
+ query_vector.reshape(1, -1),
+ self.chunk_vectors
+ ).flatten()
+
+ top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results
+
+ # Use a set to avoid duplicate file results
+ seen_files = set(r['file_info']['url'] for r in results)
+
+ for i, idx in enumerate(top_chunk_indices):
+ if chunk_similarities[idx] > 0.25: # Higher threshold for chunks
+ file_index = self.chunk_metadata[idx]['file_index']
+ file_info = self.file_metadata[file_index]
+
+ # Only add if we haven't already included this file
+ if file_info['url'] not in seen_files:
+ seen_files.add(file_info['url'])
+ results.append({
+ 'file_info': file_info,
+ 'score': float(chunk_similarities[idx]),
+ 'rank': len(results) + 1,
+ 'match_type': 'chunk',
+ 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
+ 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
+ })
+
+ # Stop after we've found enough results
+ if len(results) >= top_k*1.5:
+ break
+ else:
+ # Fallback to TF-IDF if transformers not available
+ query_vector = self.vectorizer.transform([expanded_query])
+
+ # First search at document level
+ if self.vectors is not None:
+ doc_similarities = cosine_similarity(query_vector, self.vectors).flatten()
+ top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
+
+ for i, idx in enumerate(top_doc_indices):
+ if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results
+ results.append({
+ 'file_info': self.file_metadata[idx],
+ 'score': float(doc_similarities[idx]),
+ 'rank': i+1,
+ 'match_type': 'document',
+ 'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
+ })
+
+ # Then search at chunk level if enabled
+ if search_chunks and self.chunk_vectors is not None:
+ chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten()
+ top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1]
+
+ # Avoid duplicates
+ seen_files = set(r['file_info']['url'] for r in results)
+
+ for i, idx in enumerate(top_chunk_indices):
+ if chunk_similarities[idx] > 0.15:
+ file_index = self.chunk_metadata[idx]['file_index']
+ file_info = self.file_metadata[file_index]
+
+ if file_info['url'] not in seen_files:
+ seen_files.add(file_info['url'])
+ results.append({
+ 'file_info': file_info,
+ 'score': float(chunk_similarities[idx]),
+ 'rank': len(results) + 1,
+ 'match_type': 'chunk',
+ 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
+ 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
+ })
+
+ if len(results) >= top_k*1.5:
+ break
+
+ # Sort combined results by score
+ results.sort(key=lambda x: x['score'], reverse=True)
+
+ # Re-rank and truncate
+ for i, result in enumerate(results[:top_k]):
+ result['rank'] = i+1
+
+ return results[:top_k]
+ except Exception as e:
+ logger.error(f"Error during search: {e}")
+ return []
+
+# -------------------- Utility Functions --------------------
+def get_random_user_agent():
+ return random.choice(USER_AGENTS)
+
+def sizeof_fmt(num, suffix='B'):
+ for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+ if abs(num) < 1024.0:
+ return f"{num:3.1f}{unit}{suffix}"
+ num /= 1024.0
+ return f"{num:.1f}Y{suffix}"
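+# Sample values for reference (illustrative): sizeof_fmt(1536) -> "1.5KB",
+# sizeof_fmt(10_485_760) -> "10.0MB".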
+
+def create_zip_file(file_paths, output_dir):
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+ zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
+ with zipfile.ZipFile(zip_path, 'w') as zipf:
+ for file_path in file_paths:
+ zipf.write(file_path, os.path.basename(file_path))
+ return zip_path
+
+def get_file_extension(url, default='.pdf'):
+ """Extract file extension from URL or filename"""
+ path = urlparse(url).path
+ ext = os.path.splitext(path)[1].lower()
+ if not ext:
+ return default
+ return ext
+
+def humanize_file_size(size_bytes):
+ """Format file size in human-readable format"""
+ if size_bytes < 1024:
+ return f"{size_bytes} bytes"
+ for unit in ['KB', 'MB', 'GB', 'TB']:
+ size_bytes /= 1024.0
+ if size_bytes < 1024.0:
+ return f"{size_bytes:.1f} {unit}"
+ return f"{size_bytes:.1f} PB"
+
+def get_domain(url):
+ """Extract domain from URL"""
+ parsed = urlparse(url)
+ return parsed.netloc
+
+def is_valid_file_url(url, extensions):
+ """Check if URL is a valid file URL based on extension"""
+ return any(url.lower().endswith(ext) for ext in extensions)
+
+def detect_captcha(html_content):
+ """Detect common captcha patterns in HTML content"""
+ captcha_patterns = [
+ 'captcha', 'recaptcha', 'g-recaptcha', 'hcaptcha', 'cf-turnstile',
+ 'challenge', 'solve the following', 'verify you are human'
+ ]
+ html_lower = html_content.lower()
+ return any(pattern in html_lower for pattern in captcha_patterns)
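+# e.g. detect_captcha('<div class="g-recaptcha" data-sitekey="x"></div>') returns True,
+# while pages without any of these markers return False (illustrative).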
+
+def is_download_link(url):
+ """Enhanced function to detect if a URL is likely a download link"""
+ # Check for obvious download indicators in URL
+ url_lower = url.lower()
+
+ # Check for common download-related terms in the URL
+ download_terms = [
+ 'download', 'dl', 'get', 'file', 'attachment', 'export', 'view',
+ 'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document'
+ ]
+ if any(term in url_lower for term in download_terms):
+ return True
+
+ # Check for common download script patterns
+ script_patterns = [
+ 'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php',
+ 'download.aspx', 'getfile.aspx', 'file.aspx',
+ 'downloadhandler', 'filehandler', 'filedownload',
+ 'download.jsp', 'download.cgi', 'download.do',
+ 'download-file', 'get-file',
+ 'downloadfile', 'getfile', 'viewfile',
+ 'Action=downloadfile', 'action=download', 'action=view',
+ 'download?', 'file?', 'get?', 'view?'
+ ]
+ if any(pattern in url_lower for pattern in script_patterns):
+ return True
+
+ # Check for common file extensions in the URL path or parameters
+ path = urlparse(url).path
+ common_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
+ '.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg',
+ '.png', '.gif', '.mp3', '.mp4', '.avi', '.mov']
+
+ if any(ext in path.lower() for ext in common_extensions):
+ return True
+
+ # Check for file ID or file parameters in URL
+ params = parse_qs(urlparse(url).query)
+ param_keys = params.keys()
+ file_param_indicators = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid']
+ if any(key.lower() in file_param_indicators for key in param_keys):
+ return True
+
+    # Check for download-handler query patterns such as Action=downloadfile or fname=
+ if 'Action=downloadfile' in url or 'fname=' in url:
+ return True
+
+ return False
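+# Illustrative examples of the heuristics above:
+#   is_download_link("https://example.com/download.php?file=42")  -> True
+#   is_download_link("https://example.com/docs/report.pdf")       -> True
+#   is_download_link("https://example.com/about")                 -> False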
+
+def normalize_download_url(url):
+ """Normalize download URLs to handle various formats and encodings"""
+ try:
+ # Handle common URL shorteners and redirections
+ parsed = urlparse(url)
+
+ # Handle phpMyAdmin-style encoded URLs
+ if 'Action=downloadfile' in url and 'file=' in url:
+ # Extract the encoded file parameter
+ params = parse_qs(parsed.query)
+ if 'file' in params:
+ # This is just a placeholder - in a real implementation,
+ # you would need to handle the specific encoding used
+ encoded_file = params['file'][0]
+ # Keep the URL as is for now, since we'll handle it during download
+ return url
+
+ # Handle URLs with fname parameter (like in the example)
+        # Handle URLs with an fname parameter
+ # Keep as is - we'll handle this specially during download
+ return url
+
+ # For other URLs, make sure they are properly quoted
+ path = parsed.path
+ # Only quote the path portion if needed
+ if '%' not in path and ' ' in path:
+ path = quote(path)
+
+ # Reconstruct the URL
+ normalized = parsed._replace(path=path).geturl()
+ return normalized
+ except Exception as e:
+ logger.error(f"Error normalizing URL {url}: {e}")
+ return url
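+# e.g. normalize_download_url("https://example.com/files/my report.pdf")
+#   -> "https://example.com/files/my%20report.pdf" (unescaped spaces are percent-encoded;
+#      already-encoded and parameterised URLs are passed through unchanged).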
+
+# -------------------- Google Drive Functions --------------------
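+# Rough flow (sketch): the user opens get_google_auth_url(), authorises access and
+# supplies the resulting code, which is exchanged and used for uploads:
+#   creds, message = exchange_code_for_credentials(auth_code)
+#   if creds:
+#       file_id = google_drive_upload(zip_path, creds)  # zip_path: local file to upload (placeholder)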
+def get_google_auth_url():
+ client_config = GOOGLE_OAUTH_CONFIG["web"]
+ flow = google_auth_oauthlib.flow.Flow.from_client_config(
+ {"web": client_config},
+ scopes=["https://www.googleapis.com/auth/drive.file"]
+ )
+ flow.redirect_uri = client_config["redirect_uris"][0]
+ authorization_url, _ = flow.authorization_url(
+ access_type="offline",
+ include_granted_scopes="true",
+ prompt="consent"
+ )
+ return authorization_url
+
+def exchange_code_for_credentials(auth_code):
+ if not auth_code.strip():
+ return None, "No code provided."
+ try:
+ client_config = GOOGLE_OAUTH_CONFIG["web"]
+ flow = google_auth_oauthlib.flow.Flow.from_client_config(
+ {"web": client_config},
+ scopes=["https://www.googleapis.com/auth/drive.file"]
+ )
+ flow.redirect_uri = client_config["redirect_uris"][0]
+ flow.fetch_token(code=auth_code.strip())
+ creds = flow.credentials
+ if not creds or not creds.valid:
+ return None, "Could not validate credentials. Check code and try again."
+ return creds, "Google Sign-In successful!"
+ except Exception as e:
+ return None, f"Error during token exchange: {e}"
+
+def google_drive_upload(file_path, credentials, folder_id=None):
+ try:
+ drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
+ file_metadata = {'name': os.path.basename(file_path)}
+ if folder_id:
+ file_metadata['parents'] = [folder_id]
+ media = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
+ created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
+ return created.get("id", "")
+ except Exception as e:
+ return f"Error uploading to Drive: {str(e)}"
+
+def create_drive_folder(drive_service, name):
+ folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
+ folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
+ return folder.get('id')
+
+# -------------------- Playwright Setup --------------------
+def install_playwright_dependencies():
+ try:
+ # Set environment variable for Playwright browsers path
+ os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
+
+ # Install system dependencies
+ subprocess.run(['apt-get', 'update', '-y'], check=True)
+ packages = [
+ 'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
+ 'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
+ 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
+ ]
+ subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
+
+ # Install Playwright and dependencies
+ subprocess.run(['pip', 'install', 'playwright'], check=True)
+ subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
+
+ st.success("Playwright dependencies installed successfully!")
+ except Exception as e:
+ st.error(f"Error installing Playwright dependencies: {e}")
+ st.info("You may need to manually install dependencies. Check console for details.")
+ logger.error(f"Playwright setup error: {e}")
+ traceback.print_exc()
+
+# -------------------- Download Manager Class --------------------
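+# Illustrative usage (a sketch; must run inside an asyncio event loop):
+#   async with DownloadManager(query="past exam papers", num_results=5) as dm:
+#       for result_url in await dm.search_bing():
+#           files = await dm.extract_downloadable_files(result_url, custom_ext_list=[])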
+class DownloadManager:
+ def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5, use_stealth=True, proxy_rotation=False):
+ self.use_proxy = use_proxy
+ self.proxy = proxy
+ self.query = query
+ self.num_results = num_results
+ self.playwright = None
+ self.browser = None
+ self.context = None
+ self.page = None
+ self.use_stealth = use_stealth
+ self.proxy_rotation = proxy_rotation
+ self.request_count = 0
+ self.captcha_detected = False
+ self.download_timeout = 300 # 5 minutes timeout for downloads
+ # Track visited URLs to avoid revisiting the same URL multiple times
+ self.visited_urls = set()
+ # Track successfully downloaded files to avoid redownloading
+ self.downloaded_files = set()
+
+ async def __aenter__(self):
+ self.playwright = await async_playwright().start()
+
+ # Prepare browser args with stealth settings
+ browser_args = [
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-gpu',
+ '--no-zygote',
+ '--single-process',
+ '--disable-web-security',
+ '--disable-features=IsolateOrigins',
+ '--disable-site-isolation-trials'
+ ]
+
+ # Add stealth-specific args
+ if self.use_stealth:
+ browser_args.extend([
+ '--disable-blink-features=AutomationControlled',
+ '--disable-features=IsolateOrigins,site-per-process',
+ '--disable-webgl',
+ '--disable-webrtc'
+ ])
+
+ # Setup browser options
+ opts = {
+ "headless": True,
+ "args": browser_args
+ }
+
+ # Configure proxy if specified
+ if self.use_proxy and self.proxy:
+ opts["proxy"] = {"server": self.proxy}
+
+ # Launch browser with options
+ self.browser = await self.playwright.chromium.launch(**opts)
+
+ # Setup browser context with enhanced settings
+ context_opts = {
+ "user_agent": get_random_user_agent(),
+ "viewport": {"width": 1920, "height": 1080},
+ "device_scale_factor": 1,
+ "has_touch": False,
+ "is_mobile": False,
+ "ignore_https_errors": True,
+ "accept_downloads": True
+ }
+
+ # Apply stealth-specific settings to the context
+ if self.use_stealth:
+ # Apply JS-injection for enhanced stealth
+ context_opts["bypass_csp"] = True
+ self.context = await self.browser.new_context(**context_opts)
+
+ # Execute stealth JS to avoid detection
+ await self.context.add_init_script("""
+ () => {
+ Object.defineProperty(navigator, 'webdriver', {
+ get: () => false,
+ });
+
+ // Change navigator properties
+ const newProto = navigator.__proto__;
+ delete newProto.webdriver;
+
+ // Overwrite the plugins
+ Object.defineProperty(navigator, 'plugins', {
+ get: () => [1, 2, 3, 4, 5].map(() => ({
+ lengthComputable: true,
+ loaded: 100,
+ total: 100
+ }))
+ });
+
+ // Handle languages more naturally
+ Object.defineProperty(navigator, 'languages', {
+ get: () => ['en-US', 'en', 'es']
+ });
+
+ // Modify hardware concurrency
+ Object.defineProperty(navigator, 'hardwareConcurrency', {
+ get: () => 4
+ });
+
+ // Modify deviceMemory
+ Object.defineProperty(navigator, 'deviceMemory', {
+ get: () => 8
+ });
+
+ // WebGL modifications
+ const getParameter = WebGLRenderingContext.prototype.getParameter;
+ WebGLRenderingContext.prototype.getParameter = function(parameter) {
+ if (parameter === 37445) {
+ return 'Intel Inc.';
+ }
+ if (parameter === 37446) {
+ return 'Intel Iris OpenGL Engine';
+ }
+ return getParameter.apply(this, arguments);
+ };
+ }
+ """)
+ else:
+ # Regular context without stealth
+ self.context = await self.browser.new_context(**context_opts)
+
+ # Create page with enhanced headers
+ self.page = await self.context.new_page()
+ await self.page.set_extra_http_headers({
+ 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+ 'Cache-Control': 'max-age=0',
+ 'DNT': '1', # Do Not Track
+ 'Referer': 'https://www.google.com/',
+ 'Sec-Fetch-Dest': 'document',
+ 'Sec-Fetch-Mode': 'navigate',
+ 'Sec-Fetch-Site': 'cross-site',
+ 'Sec-Fetch-User': '?1',
+ 'Upgrade-Insecure-Requests': '1'
+ })
+
+ # Add delay for mouse movements to simulate human behavior
+ if self.use_stealth:
+ await self.page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 500))
+ await self.page.wait_for_timeout(random.randint(200, 500))
+
+ return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ if self.browser:
+ await self.browser.close()
+ if self.playwright:
+ await self.playwright.stop()
+
+ async def rotate_proxy_if_needed(self):
+ """Rotate proxy if proxy rotation is enabled and threshold is reached"""
+ if self.proxy_rotation and PROXY_ROTATION_CONFIG["enabled"]:
+ self.request_count += 1
+ if self.request_count >= PROXY_ROTATION_CONFIG["rotation_interval"] and PROXY_ROTATION_CONFIG["proxies"]:
+ # Get next proxy from the pool
+ next_proxy = PROXY_ROTATION_CONFIG["proxies"].pop(0)
+ PROXY_ROTATION_CONFIG["proxies"].append(next_proxy) # Move to end of list
+
+ # Close existing context and create new one with the new proxy
+ if self.context:
+ await self.context.close()
+
+ # Create new context with the new proxy
+ context_opts = {
+ "user_agent": get_random_user_agent(),
+ "proxy": {"server": next_proxy},
+ "accept_downloads": True
+ }
+ self.context = await self.browser.new_context(**context_opts)
+ self.page = await self.context.new_page()
+
+ # Reset counter
+ self.request_count = 0
+ logger.info(f"Rotated to new proxy: {next_proxy}")
-# Initialize session state variables
-def initialize_session_state():
+ async def handle_captcha(self, page):
+ """Detect and handle captchas if possible"""
+ # Check for common captcha patterns
+ content = await page.content()
+ if detect_captcha(content):
+ self.captcha_detected = True
+ logger.warning("Captcha detected on page")
+
+ # Strategies for handling captchas:
+ # 1. For simple captchas, try to extract the image and solve it
+ captcha_img = await page.query_selector('img[alt*="captcha" i], img[src*="captcha" i]')
+ if captcha_img:
+ logger.info("Found captcha image, attempting to capture")
+
+ # Take screenshot of the captcha
+ captcha_path = os.path.join(tempfile.gettempdir(), "captcha.png")
+ await captcha_img.screenshot(path=captcha_path)
+
+ # In a real implementation, you would send this to a captcha solving service
+ # For now, just log the detection
+ logger.info(f"Captcha image saved to {captcha_path}")
+
+                # Without an external captcha-solving service, report failure so the caller can proceed accordingly
+ return False
+
+ # 2. For reCAPTCHA, special handling would be required
+ recaptcha = await page.query_selector('iframe[src*="recaptcha"]')
+ if recaptcha:
+ logger.warning("reCAPTCHA detected, would require external solving service")
+ return False
+
+ # 3. Try to perform human-like actions that might bypass simple bot checks
+ await self.perform_human_actions(page)
+
+ # Check if captcha is still present
+ content = await page.content()
+ if detect_captcha(content):
+ logger.warning("Captcha still present after human-like actions")
+ return False
+ else:
+ logger.info("Captcha appears to be resolved")
+ return True
+
+ return True # No captcha detected
+
+ async def perform_human_actions(self, page):
+ """Perform human-like actions on the page to possibly bypass simple bot checks"""
+ try:
+ # 1. Slowly scroll down the page
+ for i in range(3):
+ await page.evaluate(f"window.scrollTo(0, {i * 300})")
+ await page.wait_for_timeout(random.randint(300, 700))
+
+ # 2. Random mouse movements
+ for _ in range(3):
+ x = random.randint(100, 800)
+ y = random.randint(100, 600)
+ await page.mouse.move(x=x, y=y)
+ await page.wait_for_timeout(random.randint(200, 500))
+
+ # 3. Click on a non-essential part of the page
+ try:
+ await page.click("body", position={"x": 50, "y": 50})
+            except Exception:
+ pass
+
+ # 4. Wait a bit before continuing
+ await page.wait_for_timeout(1000)
+
+ except Exception as e:
+ logger.warning(f"Error during human-like actions: {e}")
+
+ async def search_bing(self):
+ urls = []
+ try:
+ # Rotate proxy if needed
+ await self.rotate_proxy_if_needed()
+
+            search_url = f"https://www.bing.com/search?q={quote(self.query)}"
+ await self.page.goto(search_url, timeout=30000)
+ await self.page.wait_for_load_state('networkidle')
+
+ # Check for captchas
+ if not await self.handle_captcha(self.page):
+ logger.warning("Captcha detected during search, results may be limited")
+
+ # More natural scrolling behavior
+ for i in range(3):
+ await self.page.evaluate(f"window.scrollTo(0, {i * 400})")
+ await self.page.wait_for_timeout(random.randint(300, 800))
+
+ # Extract search results
+ links = await self.page.query_selector_all("li.b_algo h2 a")
+ for link in links[:self.num_results]:
+ href = await link.get_attribute('href')
+ if href:
+ urls.append(href)
+
+ # If we didn't find enough results, try an alternative selector
+ if len(urls) < self.num_results:
+ alt_links = await self.page.query_selector_all(".b_caption a")
+ for link in alt_links:
+ href = await link.get_attribute('href')
+ if href and href not in urls:
+ urls.append(href)
+ if len(urls) >= self.num_results:
+ break
+
+ return urls
+ except Exception as e:
+ logger.error(f"Error searching Bing: {e}")
+ return []
+
+ async def get_file_size(self, url):
+ try:
+ await self.rotate_proxy_if_needed()
+
+ # For complex download URLs, we need to be careful with HEAD requests
+ if '?' in url or 'Action=downloadfile' in url or 'fname=' in url:
+ # For these URLs, we'll try a more reliable approach using range headers
+ headers = {
+ 'User-Agent': get_random_user_agent(),
+ 'Range': 'bytes=0-0' # Just request the first byte to check headers
+ }
+
+ try:
+ with requests.get(url, headers=headers, stream=True, timeout=10) as r:
+ if 'Content-Range' in r.headers:
+ content_range = r.headers['Content-Range']
+ match = re.search(r'bytes 0-0/(\d+)', content_range)
+ if match:
+ size = int(match.group(1))
+ return sizeof_fmt(size)
+
+ if 'Content-Length' in r.headers:
+ size = int(r.headers['Content-Length'])
+ # If size is 1, it's likely just our single requested byte
+ if size > 1:
+ return sizeof_fmt(size)
+ except Exception as e:
+ logger.warning(f"Error getting file size with Range request: {e}")
+
+ # Fallback to browser approach
+ try:
+ async with self.context.new_page() as page:
+ response = await page.request.head(url, timeout=15000)
+ length = response.headers.get('Content-Length', None)
+ if length:
+ return sizeof_fmt(int(length))
+ except Exception as e:
+ logger.warning(f"Error getting file size with browser: {e}")
+
+ return "Unknown Size"
+ else:
+ # Standard approach for normal URLs
+ async with self.context.new_page() as page:
+ response = await page.request.head(url, timeout=15000)
+ length = response.headers.get('Content-Length', None)
+ if length:
+ return sizeof_fmt(int(length))
+ else:
+ return "Unknown Size"
+ except Exception as e:
+ logger.warning(f"Error getting file size: {e}")
+ return "Unknown Size"
+
+ async def get_pdf_metadata(self, url):
+ try:
+ await self.rotate_proxy_if_needed()
+
+ async with self.context.new_page() as page:
+ resp = await page.request.get(url, timeout=15000)
+ if resp.ok:
+ content = await resp.body()
+ pdf = BytesIO(content)
+ reader = PdfReader(pdf)
+ return {
+ 'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
+ 'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
+ 'Pages': len(reader.pages),
+ }
+ else:
+ return {}
+ except Exception as e:
+ logger.warning(f"Error reading PDF metadata: {e}")
+ return {}
+
+ async def extract_real_download_url(self, url):
+ """Enhanced method to extract real download URL, handling complex URLs"""
+ try:
+ # Check if this is a complex download URL that needs special handling
+ if 'Action=downloadfile' in url or 'fname=' in url:
+ logger.info(f"Complex download URL detected: {url}")
+
+ # For these special cases, we'll use the browser to navigate and intercept redirects
+ await self.rotate_proxy_if_needed()
+
+ async with self.context.new_page() as page:
+ # Set up request interception to capture redirects
+ await page.route('**', lambda route: route.continue_())
+
+ # Listen for all responses
+ responses = []
+ page.on('response', lambda response: responses.append(response))
+
+ try:
+ # Go to the URL
+ await page.goto(url, wait_until='networkidle', timeout=30000)
+
+ # Check all responses for potential downloads
+ for response in responses:
+ # Look for content-disposition headers indicating a download
+                            content_disposition = response.headers.get('content-disposition', '')  # Playwright lower-cases header names
+ if 'attachment' in content_disposition or 'filename=' in content_disposition:
+ return response.url
+
+ # Look for content-type headers indicating a file
+                            content_type = response.headers.get('content-type', '')
+ if content_type and content_type != 'text/html' and not content_type.startswith('text/'):
+ return response.url
+
+ # If no clear download was detected, return the final URL
+ return page.url
+ except Exception as e:
+ logger.warning(f"Error extracting real download URL: {e}")
+ return url
+ else:
+ # Standard approach for normal URLs
+ await self.rotate_proxy_if_needed()
+
+ async with self.context.new_page() as page:
+ response = await page.goto(url, wait_until='networkidle', timeout=30000)
+ if response and response.headers.get('location'):
+ return response.headers['location']
+ return page.url
+ except Exception as e:
+ logger.error(f"Error extracting real download URL: {e}")
+ return url
+
+    # Enhanced extraction of exam document links from educational sites
+ async def get_edu_exam_links(self, url):
+ """Specialized method for educational exam websites that follows a common pattern."""
+ try:
+ logger.info(f"Fetching exam links from {url}")
+ links = set()
+
+ # First try with direct requests for speed (but with proper headers)
+ headers = {
+ "User-Agent": get_random_user_agent(),
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.9",
+ "Referer": "https://www.google.com/",
+ "DNT": "1"
+ }
+
+ try:
+ response = requests.get(url, headers=headers, timeout=30)
+
+ if response.status_code == 200:
+ # Parse with BeautifulSoup first for efficiency
+ soup = BeautifulSoup(response.text, "html.parser")
+ parsed_base = urlparse(url)
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+ # Look for all links
+ for a in soup.find_all("a", href=True):
+ href = a["href"]
+ full_url = urljoin(url, href)
+
+ # Look for text clues
+ link_text = a.get_text().lower()
+
+ # Special patterns for exam sites (expanded list)
+ url_patterns = [
+ "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
+ "/test/", "/download/", "/files/", "/assignments/",
+ "paper_", "question_", "exam_", "test_", "past_",
+ "assignment_", "sample_", "study_material", "notes_",
+ "/resource/", "/subject/", "/course/", "/material/"
+ ]
+
+ text_patterns = [
+ "exam", "paper", "test", "question", "past", "download",
+ "assignment", "sample", "study", "material", "notes",
+ "subject", "course", "resource", "pdf", "document",
+ "view", "open", "get", "solution", "answer"
+ ]
+
+ # Check URL for patterns
+ if any(pattern in full_url.lower() for pattern in url_patterns):
+ links.add(full_url)
+ continue
+
+ # Check link text for patterns
+ if any(pattern in link_text for pattern in text_patterns):
+ links.add(full_url)
+ continue
+
+ # Check for common file extensions
+ if any(full_url.lower().endswith(ext) for ext in
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
+ links.add(full_url)
+
+ # Check for download script parameters
+ if "Action=downloadfile" in url or "fname=" in url:
+ links.add(url) # Add the URL itself as it's a download link
+ except Exception as e:
+ logger.warning(f"Request-based extraction failed: {e}")
+
+ # Browser-based approach for more thorough extraction or if initial approach was inadequate
+ try:
+ # Check if we need to proceed with browser-based extraction
+ if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url:
+ logger.info("Using browser for enhanced link extraction")
+
+ # Rotate proxy if needed
+ await self.rotate_proxy_if_needed()
+
+ # Navigate to the page with more natural timing
+ await self.page.goto(url, timeout=45000, wait_until='networkidle')
+ await self.page.wait_for_timeout(random.randint(1000, 2000))
+
+ # Handle captchas if present
+ if not await self.handle_captcha(self.page):
+ logger.warning("Captcha detected, extraction may be limited")
+
+ # Get base URL for resolving relative links
+ parsed_base = urlparse(url)
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+ # Perform natural scrolling to trigger lazy-loaded content
+ page_height = await self.page.evaluate("document.body.scrollHeight")
+ viewport_height = await self.page.evaluate("window.innerHeight")
+
+ for scroll_pos in range(0, page_height, viewport_height // 2):
+ await self.page.evaluate(f"window.scrollTo(0, {scroll_pos})")
+ await self.page.wait_for_timeout(random.randint(300, 800))
+
+ # Scroll back to top
+ await self.page.evaluate("window.scrollTo(0, 0)")
+ await self.page.wait_for_timeout(500)
+
+ # Extract all links with Playwright (better than just anchor tags)
+ all_links = await self.page.evaluate("""
+ () => {
+ const results = [];
+
+ // Get all anchor tags
+ const anchors = document.querySelectorAll('a[href]');
+ for (const a of anchors) {
+ if (a.href) {
+ results.push({
+ href: a.href,
+ text: a.innerText || a.textContent || '',
+ isButton: a.classList.contains('btn') || a.role === 'button'
+ });
+ }
+ }
+
+ // Get buttons that might contain links
+ const buttons = document.querySelectorAll('button');
+ for (const btn of buttons) {
+ const onclick = btn.getAttribute('onclick') || '';
+ if (onclick.includes('window.location') || onclick.includes('download')) {
+ results.push({
+ href: '#button',
+ text: btn.innerText || btn.textContent || '',
+ isButton: true,
+ onclick: onclick
+ });
+ }
+ }
+
+ return results;
+ }
+ """)
+
+ # Process the extracted links
+ for link_info in all_links:
+ href = link_info.get('href', '')
+ text = link_info.get('text', '').lower()
+
+ if href and href != '#button':
+ # Check URL patterns
+ url_patterns = [
+ "/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
+ "/test/", "/download/", "/files/", "/assignments/",
+ "paper_", "question_", "exam_", "test_", "past_",
+ "assignment_", "sample_", "study_material", "notes_"
+ ]
+
+ # Check text patterns
+ text_patterns = [
+ "exam", "paper", "test", "question", "past", "download",
+ "assignment", "sample", "study", "material", "notes",
+ "pdf", "document", "view", "open", "solution"
+ ]
+
+ if any(pattern in href.lower() for pattern in url_patterns) or \
+ any(pattern in text for pattern in text_patterns) or \
+ any(href.lower().endswith(ext) for ext in
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
+ links.add(href)
+
+ # Check for download links in the page
+ download_links = await self.page.evaluate("""
+ () => {
+ // Find all links that might be download links
+ const links = Array.from(document.querySelectorAll('a[href]'));
+ return links
+ .filter(a => {
+ const href = a.href.toLowerCase();
+ return href.includes('download') ||
+ href.includes('getfile') ||
+ href.includes('view.php') ||
+ href.includes('action=downloadfile') ||
+ href.includes('fname=');
+ })
+ .map(a => a.href);
+ }
+ """)
+
+ for dl_link in download_links:
+ links.add(dl_link)
+
+ # Check for ASP.NET specific elements that might contain exam links
+ grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
+ for grid in grid_elements:
+ grid_links = await grid.query_selector_all('a[href]')
+ for a in grid_links:
+ href = await a.get_attribute('href')
+ text = await a.text_content()
+
+ if href:
+ full_url = href if href.startswith('http') else urljoin(url, href)
+ links.add(full_url)
+
+ # Try clicking pagination controls to reveal more content
+ pagination_buttons = await self.page.query_selector_all('a[href*="page"], .pagination a, .pager a')
+ for i, button in enumerate(pagination_buttons[:5]): # Limit to first 5 pagination buttons
+ try:
+ # Check if this is a numeric pagination button (more likely to be useful)
+ button_text = await button.text_content()
+ if button_text and button_text.strip().isdigit():
+ logger.info(f"Clicking pagination button: {button_text}")
+ await button.click()
+ await self.page.wait_for_timeout(2000)
+ await self.page.wait_for_load_state('networkidle', timeout=10000)
+
+ # Extract links from this page
+ new_page_links = await self.page.evaluate("""
+ () => {
+ return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
+ }
+ """)
+
+ for href in new_page_links:
+ if href and not href.startswith('javascript:'):
+ if any(pattern in href.lower() for pattern in url_patterns) or \
+ any(href.lower().endswith(ext) for ext in
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
+ links.add(href)
+ except Exception as e:
+ logger.warning(f"Error clicking pagination button: {e}")
+
+ # Try clicking any controls that might reveal more exam links (more focused approach)
+ show_buttons = await self.page.query_selector_all('input[type="button"], button, a.btn')
+ for button in show_buttons:
+ button_text = (await button.text_content() or "").lower()
+ button_value = (await button.get_attribute("value") or "").lower()
+ button_id = (await button.get_attribute("id") or "").lower()
+
+ # Look for buttons that seem likely to reveal file lists
+ promising_terms = ["show", "view", "display", "list", "exam", "paper", "test",
+ "download", "resource", "material", "browse", "file"]
+
+ if any(term in button_text or term in button_value or term in button_id
+ for term in promising_terms):
+ try:
+ logger.info(f"Clicking button: {button_text or button_value}")
+ await button.click()
+ await self.page.wait_for_timeout(2000)
+ await self.page.wait_for_load_state('networkidle', timeout=10000)
+
+ # Get any new links that appeared
+ new_links = await self.page.query_selector_all('a[href]')
+ for a in new_links:
+ href = await a.get_attribute('href')
+ if href:
+ full_url = href if href.startswith('http') else urljoin(url, href)
+
+ # Focus on file extensions and patterns
+ if any(full_url.lower().endswith(ext) for ext in
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']) or \
+ any(pattern in full_url.lower() for pattern in url_patterns):
+ links.add(full_url)
+ except Exception as e:
+ logger.warning(f"Error clicking button: {e}")
+
+ # Special handling for ASP.NET PostBack links
+ try:
+ # Find and interact with ASP.NET __doPostBack elements
+ postback_elements = await self.page.query_selector_all('[onclick*="__doPostBack"]')
+ for i, element in enumerate(postback_elements[:10]): # Limit to avoid too many clicks
+ try:
+ onclick = await element.get_attribute('onclick')
+ if onclick and '__doPostBack' in onclick:
+ element_text = await element.text_content()
+
+ # Only interact with elements that seem likely to contain exam links
+ promising_terms = ["show", "view", "list", "exam", "paper", "test",
+ "download", "resource", "material"]
+
+ if any(term in element_text.lower() for term in promising_terms):
+ logger.info(f"Clicking ASP.NET postback element: {element_text}")
+
+ # Click the element
+ await element.click()
+ await self.page.wait_for_timeout(2000)
+ await self.page.wait_for_load_state('networkidle', timeout=10000)
+
+ # Extract any new links
+ new_links = await self.page.query_selector_all('a[href]')
+ for a in new_links:
+ href = await a.get_attribute('href')
+ if href:
+ full_url = href if href.startswith('http') else urljoin(url, href)
+ if any(full_url.lower().endswith(ext) for ext in
+ ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
+ links.add(full_url)
+ except Exception as e:
+ logger.warning(f"Error interacting with postback element: {e}")
+ except Exception as e:
+ logger.warning(f"Error during postback handling: {e}")
+
+ except Exception as e:
+ logger.error(f"Browser-based extraction failed: {e}")
+
+ # Filter links to likely contain exam documents
+ filtered_links = []
+ for link in links:
+ # Common file extensions for exam documents
+ if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
+ filtered_links.append(link)
+ continue
+
+ # Common paths for exam documents
+ if any(pattern in link.lower() for pattern in [
+ "/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
+ "/pastpapers/", "/questionpapers/", "/tests/", "/assignments/",
+ "/resource/", "/material/", "/notes/", "/subjectmaterial/"
+ ]):
+ filtered_links.append(link)
+ continue
+
+ # Check for download links (these may not have obvious extensions)
+ if is_download_link(link):
+ filtered_links.append(link)
+
+ logger.info(f"Found {len(filtered_links)} potential exam document links")
+ return filtered_links
+
+ except Exception as e:
+ logger.error(f"Error getting exam links: {e}")
+ return []
+
+ async def discover_hidden_links(self, page):
+ """Discover hidden links that might be in JavaScript, iframes, or dynamic content"""
+ hidden_links = set()
+
+ # Execute JavaScript to find links in script tags and data attributes
+ js_links = await page.evaluate("""
+ () => {
+ const links = new Set();
+
+ // Extract URLs from script tags
+ const scripts = document.querySelectorAll('script');
+ for (const script of scripts) {
+ const content = script.textContent || '';
+ const urlMatches = content.match(/["'](https?:\/\/[^"']+)["']/g) || [];
+ for (let match of urlMatches) {
+ links.add(match.replace(/["']/g, ''));
+ }
+ }
+
+ // Look for download-related variables in scripts
+ for (const script of scripts) {
+ const content = script.textContent || '';
+ // Look for common patterns for file URLs in JavaScript
+ if (content.includes('downloadURL') || content.includes('fileURL') ||
+ content.includes('pdfURL') || content.includes('documentURL')) {
+
+ // Extract potential URLs
+ const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || [];
+ for (let match of potentialUrls) {
+ const url = match.replace(/["']/g, '');
+ // Try to resolve relative URLs
+ if (url.startsWith('/') || !url.includes('://')) {
+ if (url.startsWith('/')) {
+ links.add(window.location.origin + url);
+ } else {
+ // Handle relative paths more carefully
+ const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
+ links.add(base + url);
+ }
+ } else if (url.startsWith('http')) {
+ links.add(url);
+ }
+ }
+ }
+ }
+
+ // Check for links in data attributes
+ const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]');
+ for (const el of elements) {
+ for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) {
+ const val = el.getAttribute(attr);
+ if (val) {
+ // Try to resolve relative URLs
+ if (val.startsWith('/')) {
+ links.add(window.location.origin + val);
+ } else if (val.startsWith('http')) {
+ links.add(val);
+ } else if (!val.startsWith('javascript:') && !val.startsWith('#')) {
+ // Handle relative paths
+ const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
+ links.add(base + val);
+ }
+ }
+ }
+ }
+
+ // Look for URLs in inline event handlers
+ const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]');
+ for (const el of clickableElements) {
+ for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) {
+ const val = el.getAttribute(attr);
+ if (val) {
+ // Check for JavaScript URLs with window.location
+ if (val.includes('window.location') || val.includes('document.location')) {
+ const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/);
+ if (urlMatch && urlMatch[1]) {
+ const url = urlMatch[1];
+ if (url.startsWith('/')) {
+ links.add(window.location.origin + url);
+ } else if (url.startsWith('http')) {
+ links.add(url);
+ } else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
+ const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
+ links.add(base + url);
+ }
+ }
+ }
+
+ // Check for direct URLs in attributes
+ const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
+ for (let match of urlMatches) {
+ links.add(match.replace(/["']/g, ''));
+ }
+
+ // Check for download.php and similar patterns
+ if (val.includes('download.php') || val.includes('getfile.php') ||
+ val.includes('Action=downloadfile') || val.includes('viewfile.php')) {
+
+ // Handle both onclick handlers and direct hrefs
+ let url = '';
+ if (attr === 'href') {
+ url = val;
+ } else {
+ // Extract URL from JavaScript
+ const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i);
+ if (jsUrlMatch) {
+ url = jsUrlMatch[1];
+ }
+ }
+
+ // Resolve URL if needed
+ if (url) {
+ if (url.startsWith('/')) {
+ links.add(window.location.origin + url);
+ } else if (url.startsWith('http')) {
+ links.add(url);
+ } else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
+ const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
+ links.add(base + url);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Find PHP/ASP file download links
+ const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]');
+ for (const link of fileLinks) {
+ links.add(link.href);
+ }
+
+ return Array.from(links);
+ }
+ """)
+
+ for link in js_links:
+ hidden_links.add(link)
+
+ # Extract links from iframes
+ iframes = await page.query_selector_all('iframe')
+ for iframe in iframes:
+ try:
+ frame = await iframe.content_frame()
+ if frame:
+ iframe_links = await frame.evaluate("""
+ () => {
+ return Array.from(document.querySelectorAll('a[href]'))
+ .map(a => a.href)
+ .filter(href => href.startsWith('http'));
+ }
+ """)
+ for link in iframe_links:
+ hidden_links.add(link)
+ except Exception as e:
+ logger.warning(f"Could not extract links from iframe: {e}")
+
+ # Look for links in shadow DOM (used in modern web components)
+ shadow_links = await page.evaluate("""
+ () => {
+ const links = new Set();
+
+ // Helper function to recursively process shadow roots
+ function processShadowRoot(root) {
+ if (!root) return;
+
+ // Get links in this shadow root
+ const shadowLinks = root.querySelectorAll('a[href]');
+ for (const link of shadowLinks) {
+ if (link.href && link.href.startsWith('http')) {
+ links.add(link.href);
+ }
+ }
+
+ // Process nested shadow roots
+ const elements = root.querySelectorAll('*');
+ for (const el of elements) {
+ if (el.shadowRoot) {
+ processShadowRoot(el.shadowRoot);
+ }
+ }
+ }
+
+ // Find all shadow roots in the document
+ const elements = document.querySelectorAll('*');
+ for (const el of elements) {
+ if (el.shadowRoot) {
+ processShadowRoot(el.shadowRoot);
+ }
+ }
+
+ return Array.from(links);
+ }
+ """)
+
+ for link in shadow_links:
+ hidden_links.add(link)
+
+ # Look for download links in forms
+ form_links = await page.evaluate("""
+ () => {
+ const links = new Set();
+
+ // Check for form actions that might be download endpoints
+ const forms = document.querySelectorAll('form');
+ for (const form of forms) {
+ const action = form.action || '';
+ if (action && (
+ action.includes('download') ||
+ action.includes('getfile') ||
+ action.includes('viewfile') ||
+ action.includes('Action=downloadfile')
+ )) {
+ // Collect input values that might be needed for the download
+ const inputs = {};
+ const formInputs = form.querySelectorAll('input[name]');
+ for (const input of formInputs) {
+ inputs[input.name] = input.value;
+ }
+
+ // Record the form action as a candidate download endpoint
+ // (input values are collected above but not currently returned)
+ links.add(action);
+ }
+ }
+
+ return Array.from(links);
+ }
+ """)
+
+ for link in form_links:
+ hidden_links.add(link)
+
+ return hidden_links
+
+ async def extract_downloadable_files(self, url, custom_ext_list):
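+ """Scan a page for downloadable files and return a list of dicts with url, filename, size, metadata and download_url keys."""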
+ found_files = []
+ try:
+ # Normalize the URL to handle special cases
+ normalized_url = normalize_download_url(url)
+
+ # Skip if we've already visited this URL
+ if normalized_url in self.visited_urls:
+ logger.info(f"Skipping already visited URL: {normalized_url}")
+ return []
+
+ # Mark this URL as visited
+ self.visited_urls.add(normalized_url)
+
+ # Rotate proxy if needed
+ await self.rotate_proxy_if_needed()
+
+ # First check if this is a direct download link (Action=downloadfile or fname parameter)
+ if is_download_link(normalized_url):
+ logger.info(f"Processing potential direct download link: {normalized_url}")
+
+ # Try to extract the real download URL if needed
+ real_url = await self.extract_real_download_url(normalized_url)
+
+ # Determine filename - for complex URLs this can be tricky
+ filename = os.path.basename(urlparse(real_url).path)
+
+ # Handle URL-encoded filenames
+ if '%' in filename:
+ try:
+ filename = unquote(filename)
+ except Exception:
+ pass
+
+ # For URLs with download parameters, try to extract filename from query
+ if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
+ # Look for file parameter
+ params = parse_qs(urlparse(normalized_url).query)
+
+ # Check common filename parameters
+ for param in ['file', 'filename', 'name', 'fname', 'f']:
+ if param in params and params[param]:
+ potential_filename = params[param][0]
+ if potential_filename and '/' not in potential_filename and '\\' not in potential_filename:
+ filename = os.path.basename(potential_filename)
+ break
+
+ # If still no valid filename, use domain-based fallback
+ if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
+ domain = get_domain(real_url)
+ # Try to determine file type from content-type or extension hints in URL
+ ext = '.pdf' # Default
+ for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
+ if common_ext in normalized_url.lower():
+ ext = common_ext
+ break
+ filename = f"file_from_{domain}{ext}"
+
+ # Get file size
+ size_str = await self.get_file_size(real_url)
+
+ # Add to found files
+ found_files.append({
+ 'url': real_url,
+ 'filename': filename,
+ 'size': size_str,
+ 'metadata': {},
+ 'download_url': normalized_url # Keep original URL for downloading
+ })
+
+ # For direct download links, we can return early
+ if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)):
+ return found_files
+
+ # Special handling for educational exam sites
+ if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
+ ["exam", "test", "pastpaper", "eduexp"]):
+ logger.info("Using specialized handler for educational exam site")
+
+ # Get direct links to exam files
+ exam_links = await self.get_edu_exam_links(url)
+
+ for link in exam_links:
+ # Try to resolve any redirection
+ real_url = await self.extract_real_download_url(link)
+ filename = os.path.basename(urlparse(real_url).path)
+
+ # If filename is URL encoded (common with Chinese/international sites)
+ if '%' in filename:
+ try:
+ filename = unquote(filename)
+ except Exception:
+ pass
+
+ # If filename is empty or invalid, create a sensible one
+ if not filename or filename == '/':
+ domain = get_domain(real_url)
+ ext = '.pdf' # Default
+ for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
+ if common_ext in link.lower():
+ ext = common_ext
+ break
+ filename = f"file_from_{domain}{ext}"
+
+ # Get file size
+ size_str = await self.get_file_size(real_url)
+
+ # Get metadata for PDFs
+ meta = {}
+ if real_url.lower().endswith('.pdf'):
+ try:
+ meta = await self.get_pdf_metadata(real_url)
+ except Exception:
+ pass
+
+ found_files.append({
+ 'url': real_url,
+ 'filename': filename,
+ 'size': size_str,
+ 'metadata': meta,
+ 'download_url': link # Store original link for downloading
+ })
+
+ # If we found exam files with the specialized method, return them
+ if found_files:
+ return found_files
+
+ # Standard extraction method if specialized method didn't find files
+ response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
+ if not response:
+ return []
+
+ # Check for captchas
+ if not await self.handle_captcha(self.page):
+ logger.warning("Captcha detected, file extraction may be limited")
+
+ # Scroll through the page naturally to trigger lazy loading
+ await self.page.evaluate("""
+ (async () => {
+ const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
+ const height = document.body.scrollHeight;
+ const scrollStep = Math.floor(window.innerHeight / 2);
+
+ for (let i = 0; i < height; i += scrollStep) {
+ window.scrollTo(0, i);
+ await delay(100);
+ }
+
+ window.scrollTo(0, 0);
+ })()
+ """)
+ await self.page.wait_for_timeout(1000)
+
+ final_url = self.page.url
+ if '.php' in final_url or 'download' in final_url:
+ real_url = await self.extract_real_download_url(final_url)
+ if real_url != final_url:
+ # Try to detect the filename from headers or URL
+ response = await self.page.request.head(real_url, timeout=15000)
+ filename = None
+
+ # Try to get from Content-Disposition header
+ content_disposition = response.headers.get('Content-Disposition', '')
+ if 'filename=' in content_disposition:
+ filename_match = re.search(r'filename=["\']?([^"\';\r\n]+)["\']?', content_disposition)
+ if filename_match:
+ filename = filename_match.group(1)
+
+ # If not found in headers, use URL basename
+ if not filename:
+ filename = os.path.basename(urlparse(real_url).path)
+ if not filename or filename == '/':
+ # Generate a name based on domain
+ domain = get_domain(real_url)
+ ext = '.pdf' # Default
+ for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
+ if common_ext in real_url.lower():
+ ext = common_ext
+ break
+ filename = f"file_from_{domain}{ext}"
+
+ found_files.append({
+ 'url': real_url,
+ 'filename': filename,
+ 'size': await self.get_file_size(real_url),
+ 'metadata': {},
+ 'download_url': final_url # Keep original URL for downloading
+ })
+ return found_files
+
+ await self.page.wait_for_load_state('networkidle', timeout=30000)
+ content = await self.page.content()
+ soup = BeautifulSoup(content, 'html.parser')
+
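+ # Merge the default extension list with any user-supplied custom extensions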
+ default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
+ '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
+ '.pptx', '.odt', '.txt']
+ all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
+
+ parsed_base = urlparse(final_url)
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+ path_base = os.path.dirname(parsed_base.path)
+
+ # Process all anchor tags
+ for a in soup.find_all('a', href=True):
+ href = a['href'].strip()
+
+ if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower():
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+ real_url = await self.extract_real_download_url(full_url)
+ if real_url and real_url != full_url:
+ found_files.append({
+ 'url': real_url,
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+ 'size': await self.get_file_size(real_url),
+ 'metadata': {},
+ 'download_url': full_url # Original URL for download
+ })
+ continue
+
+ if any(href.lower().endswith(ext) for ext in all_exts):
+ file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+ size_str = await self.get_file_size(file_url)
+ meta = {}
+ if file_url.lower().endswith('.pdf'):
+ meta = await self.get_pdf_metadata(file_url)
+ found_files.append({
+ 'url': file_url,
+ 'filename': os.path.basename(file_url.split('?')[0]),
+ 'size': size_str,
+ 'metadata': meta,
+ 'download_url': file_url # Same as URL for direct links
+ })
+
+ # Handle Google Drive links
+ elif ("drive.google.com" in href) or ("docs.google.com" in href):
+ file_id = None
+ for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+ match = re.search(pattern, href)
+ if match:
+ file_id = match.group(1)
+ break
+ if file_id:
+ # Get file info to determine type and view-only status
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
+
+ # Create a more informative filename based on info
+ filename = f"gdrive_{file_id}"
+ if file_type:
+ filename = f"{filename}.{file_type}"
+
+ size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
+
+ found_files.append({
+ 'url': href, # Use original URL
+ 'filename': filename,
+ 'size': size_str,
+ 'metadata': {
+ 'view_only': is_view_only,
+ 'file_type': file_type,
+ 'file_id': file_id
+ },
+ 'download_url': href # Same as URL for Google Drive
+ })
+
+ # Also check for files in other elements (iframe, embed, object, etc.)
+ other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
+ for elem in other_elements:
+ src = elem.get('src') or elem.get('data')
+ if src and any(src.lower().endswith(ext) for ext in all_exts):
+ file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
+ size_str = await self.get_file_size(file_url)
+ meta = {}
+ if file_url.lower().endswith('.pdf'):
+ meta = await self.get_pdf_metadata(file_url)
+ found_files.append({
+ 'url': file_url,
+ 'filename': os.path.basename(file_url.split('?')[0]),
+ 'size': size_str,
+ 'metadata': meta,
+ 'download_url': file_url
+ })
+
+ # Check for file links in onclick attributes
+ onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
+ for elem in onclick_elements:
+ onclick = await elem.get_attribute('onclick') or ''
+ urls = re.findall(r'(https?://[^\'"]+)', onclick)
+ for url_match in urls:
+ if any(url_match.lower().endswith(ext) for ext in all_exts):
+ size_str = await self.get_file_size(url_match)
+ meta = {}
+ if url_match.lower().endswith('.pdf'):
+ meta = await self.get_pdf_metadata(url_match)
+ found_files.append({
+ 'url': url_match,
+ 'filename': os.path.basename(url_match.split('?')[0]),
+ 'size': size_str,
+ 'metadata': meta,
+ 'download_url': url_match
+ })
+
+ # Also check for data-src and data-url attributes (common in lazy-loaded sites)
+ data_elements = await self.page.query_selector_all('[data-src], [data-url], [data-href], [data-download]')
+ for elem in data_elements:
+ for attr in ['data-src', 'data-url', 'data-href', 'data-download']:
+ try:
+ value = await elem.get_attribute(attr)
+ if value and any(value.lower().endswith(ext) for ext in all_exts):
+ file_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
+ found_files.append({
+ 'url': file_url,
+ 'filename': os.path.basename(file_url.split('?')[0]),
+ 'size': await self.get_file_size(file_url),
+ 'metadata': {},
+ 'download_url': file_url
+ })
+ except:
+ pass
+
+ # Check script tags for JSON data that might contain file URLs
+ script_elements = soup.find_all('script', type='application/json')
+ for script in script_elements:
+ try:
+ json_data = json.loads(script.string)
+ # Look for URL patterns in the JSON data
+ def extract_urls_from_json(obj, urls_found=None):
+ if urls_found is None:
+ urls_found = []
+ if isinstance(obj, dict):
+ for k, v in obj.items():
+ # Check if any key contains url-like terms
+ url_keys = ['url', 'href', 'src', 'link', 'file', 'path', 'download']
+ if any(url_key in k.lower() for url_key in url_keys) and isinstance(v, str) and v.startswith('http'):
+ urls_found.append(v)
+ else:
+ extract_urls_from_json(v, urls_found)
+ elif isinstance(obj, list):
+ for item in obj:
+ extract_urls_from_json(item, urls_found)
+ return urls_found
+
+ json_urls = extract_urls_from_json(json_data)
+ for json_url in json_urls:
+ if any(json_url.lower().endswith(ext) for ext in all_exts):
+ found_files.append({
+ 'url': json_url,
+ 'filename': os.path.basename(json_url.split('?')[0]),
+ 'size': await self.get_file_size(json_url),
+ 'metadata': {},
+ 'download_url': json_url
+ })
+ except:
+ pass
+
+ # Check for hidden download buttons or forms
+ hidden_elements = await self.page.evaluate("""
+ () => {
+ const results = [];
+
+ // Check for hidden forms with download actions
+ const forms = document.querySelectorAll('form[action*="download"], form[action*="file"]');
+ for (const form of forms) {
+ const action = form.getAttribute('action') || '';
+ results.push({
+ type: 'form',
+ action: action,
+ inputs: Array.from(form.querySelectorAll('input[name]')).map(input => {
+ return {name: input.name, value: input.value};
+ })
+ });
+ }
+
+ // Check for hidden download links/buttons
+ const hiddenLinks = Array.from(document.querySelectorAll('a[href]')).filter(a => {
+ const style = window.getComputedStyle(a);
+ return (style.display === 'none' || style.visibility === 'hidden') &&
+ (a.href.includes('download') || a.href.includes('file'));
+ });
+
+ for (const link of hiddenLinks) {
+ results.push({
+ type: 'link',
+ href: link.href,
+ text: link.innerText || link.textContent
+ });
+ }
+
+ return results;
+ }
+ """)
+
+ # Process hidden elements
+ for elem in hidden_elements:
+ if elem['type'] == 'link' and 'href' in elem:
+ href = elem['href']
+ if any(href.lower().endswith(ext) for ext in all_exts):
+ found_files.append({
+ 'url': href,
+ 'filename': os.path.basename(href.split('?')[0]),
+ 'size': await self.get_file_size(href),
+ 'metadata': {},
+ 'download_url': href
+ })
+
+ # Check for hidden links that might be in JavaScript, iframes, or dynamic content
+ hidden_links = await self.discover_hidden_links(self.page)
+ for link in hidden_links:
+ if any(link.lower().endswith(ext) for ext in all_exts):
+ found_files.append({
+ 'url': link,
+ 'filename': os.path.basename(link.split('?')[0]),
+ 'size': await self.get_file_size(link),
+ 'metadata': {},
+ 'download_url': link
+ })
+
+ # Deduplicate files by URL
+ seen_urls = set()
+ unique_files = []
+ for f in found_files:
+ if f['url'] not in seen_urls:
+ seen_urls.add(f['url'])
+ unique_files.append(f)
+
+ return unique_files
+ except Exception as e:
+ logger.error(f"Error extracting files from {url}: {e}")
+ traceback.print_exc()
+ return []
+
+ async def download_file(self, file_info, save_dir, referer):
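+ """Download a single file described by file_info into save_dir, returning the saved path or None on failure."""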
+ file_url = file_info.get('download_url', file_info['url']) # Use download_url if available
+ fname = file_info['filename']
+ path = os.path.join(save_dir, fname)
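+ # Avoid overwriting an existing file by appending a numeric suffix to the name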
+ base, ext = os.path.splitext(fname)
+ counter = 1
+ while os.path.exists(path):
+ path = os.path.join(save_dir, f"{base}_{counter}{ext}")
+ counter += 1
+ os.makedirs(save_dir, exist_ok=True)
+
+ # Check if we've already downloaded this file
+ if file_url in self.downloaded_files:
+ logger.info(f"File already downloaded: {file_url}")
+ return None
+
+ try:
+ # Special handling for Google Drive files
+ if "drive.google.com" in file_url or "docs.google.com" in file_url:
+ # Check if it's marked as view-only in metadata
+ is_view_only = file_info.get('metadata', {}).get('view_only', False)
+
+ # For view-only files, try our most robust approach first
+ if is_view_only:
+ logger.info(f"Attempting to download view-only file: {file_url}")
+ result_path = await self.force_download_viewonly(file_info, path)
+ if result_path:
+ self.downloaded_files.add(file_url)
+ return result_path
+
+ # If that failed, try the regular download approach
+ logger.info("Primary method failed, trying fallback methods")
+
+ # Try regular download methods
+ success = await self.download_from_google_drive(file_url, path)
+ if success:
+ self.downloaded_files.add(file_url)
+ return path
+
+ # If all methods failed for Google Drive, try one last approach
+ logger.warning("All standard methods failed, attempting force download")
+ result_path = await self.force_download_viewonly(file_info, path)
+ if result_path:
+ self.downloaded_files.add(file_url)
+ return result_path
+
+ # Special handling for complex download URLs
+ if 'Action=downloadfile' in file_url or 'fname=' in file_url:
+ logger.info(f"Using browser download approach for complex URL: {file_url}")
+
+ # For these URLs, we'll need to navigate to the page and handle the download
+ await self.rotate_proxy_if_needed()
+
+ async with self.context.new_page() as page:
+ # Set up download event listener
+ download_promise = page.wait_for_event("download")
+
+ # Navigate to the URL
+ await page.goto(file_url, timeout=60000)
+
+ # Wait for the download to start
+ try:
+ download = await download_promise
+ await download.save_as(path)
+
+ if os.path.exists(path) and os.path.getsize(path) > 0:
+ self.downloaded_files.add(file_url)
+ return path
+ except Exception as e:
+ logger.error(f"Browser download failed: {e}")
+
+ # If download didn't start automatically, try to find and click download buttons
+ download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]')
+ for button in download_buttons:
+ try:
+ await button.click()
+ try:
+ # Wait for a fresh download event triggered by this click
+ download = await page.wait_for_event("download", timeout=10000)
+ await download.save_as(path)
+ if os.path.exists(path) and os.path.getsize(path) > 0:
+ self.downloaded_files.add(file_url)
+ return path
+ except:
+ pass
+ except:
+ continue
+
+ # If browser approach failed, try direct request as last resort
+ logger.info("Browser approach failed, trying direct request")
+
+ # Rotate proxy if needed
+ await self.rotate_proxy_if_needed()
+
+ # Try with direct requests first (faster)
+ try:
+ headers = {
+ 'User-Agent': get_random_user_agent(),
+ 'Accept': '*/*',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Referer': referer,
+ 'DNT': '1'
+ }
+
+ with requests.get(file_url, headers=headers, stream=True, timeout=30) as response:
+ if response.status_code == 200:
+ # Check content type to verify it's not HTML/error page
+ content_type = response.headers.get('Content-Type', '')
+ if 'text/html' in content_type and not file_url.endswith('.html'):
+ logger.warning(f"Received HTML instead of expected file: {file_url}")
+ else:
+ with open(path, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=8192):
+ if chunk:
+ f.write(chunk)
+
+ # Verify file was downloaded correctly
+ if os.path.exists(path) and os.path.getsize(path) > 0:
+ self.downloaded_files.add(file_url)
+ return path
+ except Exception as e:
+ logger.warning(f"Direct download failed: {e}, trying browser approach")
+
+ # Original code for non-Google Drive downloads using Playwright
+ async with self.context.new_page() as page:
+ headers = {
+ 'Accept': '*/*',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Referer': referer
+ }
+
+ # Try to download with timeout protection
+ try:
+ response = await page.request.get(file_url, headers=headers, timeout=self.download_timeout * 1000)
+ if response.status == 200:
+ content = await response.body()
+ with open(path, 'wb') as f:
+ f.write(content)
+ if os.path.exists(path) and os.path.getsize(path) > 0:
+ self.downloaded_files.add(file_url)
+ return path
+ else:
+ logger.error(f"Download failed with status {response.status}: {file_url}")
+
+ # Try to extract error information
+ error_info = await response.text()
+ logger.debug(f"Error response: {error_info[:200]}...")
+
+ # Check if this might be a captcha or login issue
+ if detect_captcha(error_info):
+ logger.warning("Captcha detected during download")
+ # For HF Spaces, we can't implement browser-based captcha solving here
+ # Just log the issue for now
+ except PlaywrightTimeoutError:
+ logger.error(f"Download timed out after {self.download_timeout} seconds: {file_url}")
+
+ # Try an alternative approach - using the browser's download manager
+ try:
+ logger.info("Trying browser download manager approach")
+ download_promise = page.wait_for_event("download")
+ await page.goto(file_url, timeout=60000)
+
+ # Wait for download to start (with timeout)
+ download = await download_promise
+ await download.save_as(path)
+
+ if os.path.exists(path) and os.path.getsize(path) > 0:
+ self.downloaded_files.add(file_url)
+ return path
+ except Exception as e:
+ logger.error(f"Browser download manager approach failed: {e}")
+
+ return None
+ except Exception as e:
+ logger.error(f"Error downloading {file_url}: {e}")
+ return None
+
+ async def force_download_viewonly(self, file_info, save_path):
+ """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
+ try:
+ # Extract file ID
+ file_id = file_info.get('metadata', {}).get('file_id')
+ if not file_id:
+ url = file_info['url']
+ for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+ match = re.search(pattern, url)
+ if match:
+ file_id = match.group(1)
+ break
+
+ if not file_id:
+ logger.error("Could not extract file ID")
+ return None
+
+ file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
+ base, ext = os.path.splitext(save_path)
+ if not ext:
+ save_path = f"{base}.{file_type}"
+
+ logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
+
+ # Create a dedicated browser instance with better resolution and stealth
+ browser_args = [
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-web-security',
+ '--disable-features=IsolateOrigins,site-per-process',
+ '--disable-site-isolation-trials',
+ '--disable-blink-features=AutomationControlled' # Anti-detection
+ ]
+
+ browser = await self.playwright.chromium.launch(
+ headless=True,
+ args=browser_args
+ )
+
+ # Use higher resolution for better quality
+ context = await browser.new_context(
+ viewport={'width': 1600, 'height': 1200},
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ device_scale_factor=2.0,
+ accept_downloads=True # Critical for the download workflow
+ )
+
+ # Add anti-detection script
+ await context.add_init_script("""
+ () => {
+ Object.defineProperty(navigator, 'webdriver', {
+ get: () => false,
+ });
+
+ // Change plugins
+ Object.defineProperty(navigator, 'plugins', {
+ get: () => [1, 2, 3, 4, 5].map(() => ({
+ lengthComputable: true,
+ loaded: 100,
+ total: 100
+ }))
+ });
+
+ // Handle languages
+ Object.defineProperty(navigator, 'languages', {
+ get: () => ['en-US', 'en', 'es']
+ });
+
+ // Modify hardware concurrency
+ Object.defineProperty(navigator, 'hardwareConcurrency', {
+ get: () => 4
+ });
+ }
+ """)
+
+ page = await context.new_page()
+
+ try:
+ # Go to the file view page
+ logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
+ await page.wait_for_load_state('networkidle')
+
+ # Check for any barriers or permissions issues
+ content = await page.content()
+ if "the owner has not granted you permission to" in content:
+ logger.warning("Permission denied error detected")
+
+ # Randomized wait to appear more human-like
+ await page.wait_for_timeout(random.randint(3000, 7000))
+
+ # Create temp directory
+ temp_dir = tempfile.mkdtemp()
+
+ # Special handling for PDFs
+ if file_type.lower() == 'pdf':
+ # Use the improved scrolling and detection approach
+
+ # Perform some natural mouse movements and scrolling
+ await page.mouse.move(x=random.randint(200, 400), y=random.randint(200, 400))
+ await page.wait_for_timeout(random.randint(500, 1000))
+
+ # Estimate number of pages
+ estimated_pages = await page.evaluate("""
+ () => {
+ // Method 1: Check page counter text
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
+ const text = el.textContent || '';
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
+ });
+
+ if (pageCounters.length > 0) {
+ const text = pageCounters[0].textContent || '';
+ const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
+ if (match && match[2]) return parseInt(match[2]);
+ }
+
+ // Method 2: Check actual page elements
+ const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
+ if (pageElements.length > 0) return pageElements.length;
+
+ // Method 3: Look for page thumbnails
+ const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
+ if (thumbnails.length > 0) return thumbnails.length;
+
+ // Fallback: conservative guess
+ return 50;
+ }
+ """)
+
+ logger.info(f"Estimated {estimated_pages} pages in PDF")
+
+ # Initial scroll to trigger lazy loading
+ logger.info("Initial scroll to bottom to trigger lazy loading...")
+ await page.keyboard.press("End")
+ await page.wait_for_timeout(3000)
+
+ # Scroll page by page to ensure all pages are loaded
+ logger.info("Scrolling page by page...")
+ max_attempts = min(estimated_pages * 3, 300)
+ attempt = 0
+ prev_blob_count = 0
+
+ while attempt < max_attempts:
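+ # Count the page images rendered so far (Drive serves them as blob: URLs)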
+ blob_count = await page.evaluate("""
+ Array.from(document.getElementsByTagName('img'))
+ .filter(img => img.src.startsWith('blob:') && img.width > 100)
+ .length
+ """)
+
+ logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
+
+ if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10):
+ logger.info("All pages appear to be loaded.")
+ break
+
+ # Alternate between PageDown and End keys for more natural scrolling
+ if attempt % 3 == 0:
+ await page.keyboard.press("End")
+ else:
+ await page.keyboard.press("PageDown")
+
+ # Randomized wait times
+ await page.wait_for_timeout(random.randint(1500, 3000))
+
+ # Move mouse randomly to appear more human-like
+ if attempt % 4 == 0:
+ await page.mouse.move(x=random.randint(200, 800), y=random.randint(200, 800))
+
+ prev_blob_count = blob_count
+ attempt += 1
+
+ # Extra wait to ensure everything is loaded
+ await page.wait_for_timeout(5000)
+
+ # Set up download event listener for the PDF
+ download_promise = page.wait_for_event("download")
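+ # jsPDF's save() call in the injected script triggers a client-side download,
+ # which surfaces here as a "download" event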
+
+ # Use jsPDF to generate PDF from loaded pages
+ logger.info("Generating PDF from loaded pages...")
+ result = await page.evaluate(r'''
+ (function() {
+ return new Promise((resolve, reject) => {
+ let script = document.createElement("script");
+ script.onload = function () {
+ try {
+ let pdf = new jsPDF();
+ let imgs = Array.from(document.getElementsByTagName("img"))
+ .filter(img => img.src.startsWith('blob:') && img.width > 100)
+ .sort((a, b) => {
+ const rectA = a.getBoundingClientRect();
+ const rectB = b.getBoundingClientRect();
+ return rectA.top - rectB.top;
+ });
+
+ console.log(`Found ${imgs.length} valid page images to add to PDF`);
+
+ let added = 0;
+ for (let i = 0; i < imgs.length; i++) {
+ let img = imgs[i];
+ let canvas = document.createElement("canvas");
+ let ctx = canvas.getContext("2d");
+ canvas.width = img.width;
+ canvas.height = img.height;
+ ctx.drawImage(img, 0, 0, img.width, img.height);
+ let imgData = canvas.toDataURL("image/jpeg", 1.0);
+
+ if (added > 0) {
+ pdf.addPage();
+ }
+
+ pdf.addImage(imgData, 'JPEG', 0, 0);
+ added++;
+ }
+
+ pdf.save("download.pdf");
+ resolve({success: true, pageCount: added});
+ } catch (error) {
+ reject({success: false, error: error.toString()});
+ }
+ };
+
+ script.onerror = function() {
+ reject({success: false, error: "Failed to load jsPDF library"});
+ };
+
+ script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
+ document.body.appendChild(script);
+ });
+ })();
+ ''')
+
+ if not result.get('success', False):
+ logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
+
+ # Try fallback approach - screenshot method
+ logger.info("Trying fallback screenshot method...")
+
+ # Navigate back to the first page
+ await page.evaluate("""
+ () => {
+ // Find and click the "first page" button if available
+ const buttons = Array.from(document.querySelectorAll('button'));
+ const firstPageBtn = buttons.find(b => b.getAttribute('aria-label')?.includes('First page'));
+ if (firstPageBtn) firstPageBtn.click();
+ }
+ """)
+ await page.wait_for_timeout(1000)
+
+ # Create a PDF by taking screenshots of each page
+ screenshots = []
+ current_page = 1
+ max_pages = estimated_pages
+
+ # Create a PDF using the reportlab package
+ while current_page <= max_pages:
+ screenshot_path = os.path.join(temp_dir, f"page_{current_page}.png")
+
+ # Try to find the current page element
+ page_elem = await page.query_selector('.drive-viewer-paginated-page')
+ if page_elem:
+ await page_elem.screenshot(path=screenshot_path)
+ else:
+ # Fallback to full page screenshot
+ await page.screenshot(path=screenshot_path)
+
+ screenshots.append(screenshot_path)
+
+ # Try to navigate to next page
+ next_btn = await page.query_selector('button[aria-label="Next page"]')
+ if next_btn:
+ is_disabled = await next_btn.get_attribute('disabled')
+ if is_disabled:
+ logger.info(f"Reached end of document at page {current_page}")
+ break
+
+ await next_btn.click()
+ await page.wait_for_timeout(1000)
+ current_page += 1
+ else:
+ break
+
+ # Create PDF from screenshots
+ if screenshots:
+ # Build the PDF page by page so each screenshot keeps its own dimensions
+ c = canvas.Canvas(save_path)
+ for screenshot in screenshots:
+ img = Image.open(screenshot)
+ width, height = img.size
+ c.setPageSize((width, height))
+ c.drawImage(screenshot, 0, 0, width, height)
+ c.showPage()
+ c.save()
+
+ # Clean up screenshots and close the dedicated browser before returning
+ for screenshot in screenshots:
+ os.remove(screenshot)
+
+ await browser.close()
+ return save_path
+
+ await browser.close()
+ return None
+
+ logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
+
+ # Wait for the download and save it
+ download = await download_promise
+ await download.save_as(save_path)
+
+ # Clean up temp directory
+ try:
+ os.rmdir(temp_dir)
+ except:
+ pass
+
+ else:
+ # Non-PDF file handling
+ screenshot_path = os.path.join(temp_dir, "file.png")
+ await page.screenshot(path=screenshot_path)
+
+ if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
+ # For document types, try to export directly
+ await self.export_google_doc(file_id, file_type, save_path)
+ else:
+ # For other types, save the screenshot with appropriate extension
+ shutil.copy(screenshot_path, save_path)
+
+ os.remove(screenshot_path)
+
+ # Close browser
+ await browser.close()
+
+ # Verify file exists and has content
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
+ logger.info(f"Successfully downloaded file to {save_path}")
+ return save_path
+ else:
+ logger.error(f"Generated file is too small or missing: {save_path}")
+ return None
+
+ except Exception as e:
+ logger.error(f"Error during force download: {e}")
+ if browser:
+ await browser.close()
+ return None
+
+ except Exception as e:
+ logger.error(f"Force download preparation failed: {e}")
+ return None
+
+ async def download_from_google_drive(self, url, save_path):
+ """Enhanced method to download from Google Drive with multiple fallback approaches"""
+ # Extract the file ID from different URL formats
+ file_id = None
+ url_patterns = [
+ r'drive\.google\.com/file/d/([^/]+)',
+ r'drive\.google\.com/open\?id=([^&]+)',
+ r'docs\.google\.com/\w+/d/([^/]+)',
+ r'id=([^&]+)',
+ r'drive\.google\.com/uc\?id=([^&]+)',
+ ]
+
+ for pattern in url_patterns:
+ match = re.search(pattern, url)
+ if match:
+ file_id = match.group(1)
+ break
+
+ if not file_id:
+ logger.error(f"Could not extract file ID from URL: {url}")
+ return False
+
+ # Determine file type first (important for handling different file types)
+ file_type, is_view_only = await self.get_google_drive_file_info(file_id)
+ logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
+
+ base, ext = os.path.splitext(save_path)
+ if not ext and file_type:
+ # Add the correct extension if missing
+ save_path = f"{base}.{file_type}"
+
+ # For view-only files, use specialized approaches
+ if is_view_only:
+ # Approach 1: For PDFs, use the JS method
+ if file_type == 'pdf':
+ success = await self.download_viewonly_pdf_with_js(file_id, save_path)
+ if success:
+ return True
+
+ # Approach 2: For Google Docs, Sheets, etc., use export API
+ if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
+ success = await self.export_google_doc(file_id, file_type, save_path)
+ if success:
+ return True
+
+ # Approach 3: Try the direct screenshot method for any view-only file
+ success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
+ if success:
+ return True
+
+ # Try standard approaches for non-view-only files
+ try:
+ # Try direct download link first (fastest)
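+ # confirm=t pre-acknowledges Drive's "can't scan for viruses" warning for larger files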
+ direct_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
+
+ # Add anti-bot headers
+ headers = {
+ 'User-Agent': get_random_user_agent(),
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+ 'Accept-Language': 'en-US,en;q=0.9',
+ 'Referer': 'https://drive.google.com/',
+ 'DNT': '1'
+ }
+
+ # Try with streaming to handle larger files
+ with requests.get(direct_url, headers=headers, stream=True, timeout=60) as r:
+ if r.status_code == 200:
+ # Check if we got HTML instead of the file
+ content_type = r.headers.get('Content-Type', '')
+ if 'text/html' in content_type and not save_path.endswith('.html'):
+ logger.warning("Received HTML instead of file, trying with session cookies")
+ else:
+ # Looks like we got the actual file
+ with open(save_path, 'wb') as f:
+ for chunk in r.iter_content(chunk_size=8192):
+ if chunk:
+ f.write(chunk)
+
+ # Verify file exists and has content
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+ logger.info("Direct download successful")
+ return True
+
+ # Try with requests and session cookies
+ session = requests.Session()
+ session.headers.update({'User-Agent': get_random_user_agent()})
+
+ # Visit the page first to get cookies
+ session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
+
+ # Try download
+ url = f"https://drive.google.com/uc?id={file_id}&export=download"
+ response = session.get(url, stream=True, timeout=30)
+
+ # Check for confirmation token
+ confirmation_token = None
+ for k, v in response.cookies.items():
+ if k.startswith('download_warning'):
+ confirmation_token = v
+ break
+
+ # Use confirmation token if found
+ if confirmation_token:
+ url = f"{url}&confirm={confirmation_token}"
+ response = session.get(url, stream=True, timeout=60)
+
+ # Check if we're getting HTML instead of the file
+ content_type = response.headers.get('Content-Type', '')
+ if 'text/html' in content_type:
+ logger.warning("Received HTML instead of file - likely download restriction")
+ else:
+ with open(save_path, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=1024*1024):
+ if chunk:
+ f.write(chunk)
+
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+ with open(save_path, 'rb') as f:
+ content = f.read(100)
+ # Reject the download if what we saved is actually an HTML error/interstitial page
+ if b'<html' not in content.lower():
+ logger.info("Successfully downloaded with requests session")
+ return True
+ except Exception as e:
+ logger.warning(f"Requests session download failed: {e}")
+
+ # Try browser-based approach as last resort
+ try:
+ async with self.context.new_page() as page:
+ # Visit the file view page first to get cookies
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
+ await page.wait_for_timeout(3000)
+
+ # Set up download event listener
+ download_promise = page.wait_for_event("download")
+
+ # Try to trigger the download button click
+ download_button = await page.query_selector('button[aria-label*="Download"], [data-tooltip*="Download"]')
+ if download_button:
+ await download_button.click()
+
+ # Wait for download to start
+ try:
+ download = await download_promise
+ await download.save_as(save_path)
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+ except Exception as e:
+ logger.error(f"Error during browser download: {e}")
+ return False
+ else:
+ # Try the export download URL
+ await page.goto(f"https://drive.google.com/uc?id={file_id}&export=download", timeout=30000)
+
+ # Look for and click any download buttons or links
+ download_elements = await page.query_selector_all('a[href*="download"], a[href*="export"], form[action*="download"], button:has-text("Download")')
+ for elem in download_elements:
+ try:
+ await elem.click()
+ # Wait a bit to see if download starts
+ try:
+ download = await download_promise
+ await download.save_as(save_path)
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+ except:
+ pass
+ except:
+ continue
+ except Exception as e:
+ logger.error(f"Browser-based download attempt failed: {e}")
+
+ logger.warning("All standard download methods failed")
+ return False
+
+ async def download_viewonly_pdf_with_js(self, file_id, save_path):
+ """Download view-only PDF using the enhanced blob image caching technique"""
+ try:
+ # Create a dedicated browser instance with stealth capabilities
+ browser_args = [
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-web-security',
+ '--disable-blink-features=AutomationControlled' # Anti-detection
+ ]
+
+ browser = await self.playwright.chromium.launch(
+ headless=True,
+ args=browser_args
+ )
+
+ # Setup stealth context
+ context = await browser.new_context(
+ viewport={'width': 1600, 'height': 1200},
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ accept_downloads=True, # Critical for handling the download event
+ ignore_https_errors=True
+ )
+
+ # Add stealth script
+ await context.add_init_script("""
+ () => {
+ Object.defineProperty(navigator, 'webdriver', {
+ get: () => false,
+ });
+
+ // Change plugins and languages to appear more human
+ Object.defineProperty(navigator, 'plugins', {
+ get: () => [1, 2, 3, 4, 5].map(() => ({
+ lengthComputable: true,
+ loaded: 100,
+ total: 100
+ }))
+ });
+
+ Object.defineProperty(navigator, 'languages', {
+ get: () => ['en-US', 'en', 'es']
+ });
+ }
+ """)
+
+ page = await context.new_page()
+
+ try:
+ # Step 1: Navigate to the file with human-like behavior
+ logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
+ await page.wait_for_load_state('networkidle')
+
+ # Perform human-like interactions
+ await page.mouse.move(x=random.randint(100, 500), y=random.randint(100, 300))
+ await page.wait_for_timeout(random.randint(2000, 5000))
+
+ # Step 2: Estimate the number of pages
+ estimated_pages = await page.evaluate("""
+ () => {
+ // Look for page counter in the interface
+ const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
+ const text = el.textContent || '';
+ return /\\d+\\s*\\/\\s*\\d+/.test(text);
+ });
+
+ if (pageCounters.length > 0) {
+ const text = pageCounters[0].textContent || '';
+ const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
+ if (match && match[2]) return parseInt(match[2]);
+ }
+
+ // If we can't find a counter, check actual pages
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+ if (pages.length > 0) return pages.length;
+
+ // Default to a reasonable number if we can't determine
+ return 50;
+ }
+ """)
+
+ logger.info(f"Estimated number of pages: {estimated_pages}")
+
+ # Step 3: Initial scroll to trigger loading
+ logger.info("Initial scroll to bottom to trigger lazy loading...")
+ await page.keyboard.press("End")
+ await page.wait_for_timeout(3000)
+
+ # Step 4: Wait for all pages to load with better feedback and randomization
+ logger.info("Scrolling through document to load all pages...")
+ max_attempts = min(estimated_pages * 3, 300)
+ attempt = 0
+ prev_blob_count = 0
+ consecutive_same_count = 0
+
+ while attempt < max_attempts:
+ # Count blob images (which are the PDF pages)
+ blob_count = await page.evaluate("""
+ Array.from(document.getElementsByTagName('img'))
+ .filter(img => img.src.startsWith('blob:') && img.width > 100)
+ .length
+ """)
+
+ logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
+
+ # Check if we've loaded all pages or if we're stuck
+ if blob_count >= estimated_pages:
+ logger.info(f"All {estimated_pages} pages appear to be loaded.")
+ break
+
+ if blob_count == prev_blob_count:
+ consecutive_same_count += 1
+ if consecutive_same_count >= 5 and blob_count > 0:
+ logger.info(f"No new pages loaded after {consecutive_same_count} attempts. Assuming all available pages ({blob_count}) are loaded.")
+ break
+ else:
+ consecutive_same_count = 0
+
+ # Mix up the scrolling approach for more human-like behavior
+ scroll_action = random.choice(["PageDown", "End", "ArrowDown", "mouse"])
+
+ if scroll_action == "PageDown":
+ await page.keyboard.press("PageDown")
+ elif scroll_action == "End":
+ await page.keyboard.press("End")
+ elif scroll_action == "ArrowDown":
+ # Press arrow down multiple times
+ for _ in range(random.randint(5, 15)):
+ await page.keyboard.press("ArrowDown")
+ await page.wait_for_timeout(random.randint(50, 150))
+ else: # mouse
+ # Scroll using mouse wheel
+ current_y = random.randint(300, 700)
+ await page.mouse.move(x=random.randint(300, 800), y=current_y)
+ await page.mouse.wheel(0, random.randint(300, 800))
+
+ # Random wait between scrolls
+ await page.wait_for_timeout(random.randint(1000, 3000))
+
+ prev_blob_count = blob_count
+ attempt += 1
+
+ # Extra wait to ensure everything is fully loaded
+ await page.wait_for_timeout(5000)
+
+ # Step 5: Set up a download event listener
+ download_promise = page.wait_for_event("download")
+
+ # Step 6: Inject the jsPDF script to generate PDF
+ logger.info("Generating PDF from loaded pages...")
+ result = await page.evaluate(r'''
+ (function() {
+ return new Promise((resolve, reject) => {
+ let script = document.createElement("script");
+ script.onload = function () {
+ try {
+ let pdf = new jsPDF();
+ let imgs = document.getElementsByTagName("img");
+ let validImages = [];
+
+ // First collect all valid blob images
+ for (let i = 0; i < imgs.length; i++) {
+ let img = imgs[i];
+ if (!/^blob:/.test(img.src)) continue;
+ if (img.width < 100 || img.height < 100) continue;
+ validImages.push(img);
+ }
+
+ // Sort by position in the document
+ validImages.sort((a, b) => {
+ const rectA = a.getBoundingClientRect();
+ const rectB = b.getBoundingClientRect();
+ return rectA.top - rectB.top;
+ });
+
+ console.log(`Found ${validImages.length} valid page images to add to PDF`);
+
+ let added = 0;
+ // Process each image as a page
+ for (let i = 0; i < validImages.length; i++) {
+ let img = validImages[i];
+ let canvas = document.createElement("canvas");
+ let ctx = canvas.getContext("2d");
+ canvas.width = img.width;
+ canvas.height = img.height;
+ ctx.drawImage(img, 0, 0, img.width, img.height);
+ let imgData = canvas.toDataURL("image/jpeg", 1.0);
+
+ if (added > 0) {
+ pdf.addPage();
+ }
+
+ pdf.addImage(imgData, 'JPEG', 0, 0);
+ added++;
+ }
+
+ pdf.save("download.pdf");
+ resolve({success: true, pageCount: added});
+ } catch (error) {
+ reject({success: false, error: error.toString()});
+ }
+ };
+
+ script.onerror = function() {
+ reject({success: false, error: "Failed to load jsPDF library"});
+ };
+
+ // Use a reliable CDN
+ script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
+ document.body.appendChild(script);
+ });
+ })();
+ ''')
+
+ if not result.get('success'):
+ logger.error(f"Error in PDF generation: {result.get('error')}")
+ return False
+
+ logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
+
+ # Step 7: Wait for the download to complete and save the file
+ download = await download_promise
+
+ # Step 8: Save the downloaded file to the specified path
+ await download.save_as(save_path)
+ logger.info(f"Successfully saved PDF to {save_path}")
+
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 1000
+
+ finally:
+ await browser.close()
+
+ except Exception as e:
+ logger.error(f"Error in viewonly PDF download process: {e}")
+ return False
+
+ async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
+ """Download any view-only file by taking screenshots"""
+ try:
+ async with self.context.new_page() as page:
+ # Set high-resolution viewport
+ await page.set_viewport_size({"width": 1600, "height": 1200})
+
+ # Navigate to the file
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
+
+ # Make sure the file is loaded
+ await page.wait_for_load_state('networkidle')
+ await page.wait_for_timeout(3000) # Extra time for rendering
+
+ # Create directory for screenshots if multiple pages
+ base_dir = os.path.dirname(save_path)
+ base_name = os.path.splitext(os.path.basename(save_path))[0]
+ screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
+ os.makedirs(screenshots_dir, exist_ok=True)
+
+ # Check if it's a multi-page document
+ is_multi_page = await page.evaluate("""
+ () => {
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+ return pages.length > 1;
+ }
+ """)
+
+ if is_multi_page and file_type == 'pdf':
+ # For multi-page PDFs, take screenshots of each page
+ page_count = await page.evaluate("""
+ async () => {
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+ const container = document.querySelector('.drive-viewer-paginated-scrollable');
+
+ if (!container || pages.length === 0) return 0;
+
+ // Scroll through to make sure all pages are loaded
+ const scrollHeight = container.scrollHeight;
+ const viewportHeight = container.clientHeight;
+ const scrollStep = viewportHeight;
+
+ for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
+ container.scrollTo(0, scrollPos);
+ await delay(300);
+ }
+
+ // Scroll back to top
+ container.scrollTo(0, 0);
+ await delay(300);
+
+ return pages.length;
+ }
+ """)
+
+ logger.info(f"Found {page_count} pages in document")
+
+ # Take screenshots of each page
+ screenshots = []
+ for i in range(page_count):
+ # Scroll to page
+ await page.evaluate(f"""
+ async () => {{
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+ const pages = document.querySelectorAll('.drive-viewer-paginated-page');
+ if (pages.length <= {i}) return false;
+
+ pages[{i}].scrollIntoView();
+ await delay(500);
+ return true;
+ }}
+ """)
+
+ # Take screenshot
+ screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
+ await page.screenshot(path=screenshot_path, clip={
+ 'x': 0,
+ 'y': 0,
+ 'width': 1600,
+ 'height': 1200
+ })
+ screenshots.append(screenshot_path)
+
+ # Combine screenshots into PDF
+ c = canvas.Canvas(save_path)
+ for screenshot in screenshots:
+ img = Image.open(screenshot)
+ width, height = img.size
+
+ # Add page to PDF
+ c.setPageSize((width, height))
+ c.drawImage(screenshot, 0, 0, width, height)
+ c.showPage()
+
+ c.save()
+
+ # Clean up screenshots
+ for screenshot in screenshots:
+ os.remove(screenshot)
+ os.rmdir(screenshots_dir)
+
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+ else:
+ # For single-page or non-PDF files, just take one screenshot
+ screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
+ await page.screenshot(path=screenshot_path, full_page=True)
+
+ # Convert to requested format if needed
+ if file_type == 'pdf':
+ # Create PDF from screenshot
+ img = Image.open(screenshot_path)
+ width, height = img.size
+
+ c = canvas.Canvas(save_path, pagesize=(width, height))
+ c.drawImage(screenshot_path, 0, 0, width, height)
+ c.save()
+ else:
+ # Just copy the screenshot to the destination with proper extension
+ shutil.copy(screenshot_path, save_path)
+
+ # Clean up
+ os.remove(screenshot_path)
+ os.rmdir(screenshots_dir)
+
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+
+ except Exception as e:
+ logger.error(f"Error taking screenshots: {e}")
+ return False
+
+ async def export_google_doc(self, file_id, file_type, save_path):
+ """Export Google Docs/Sheets/Slides to downloadable formats"""
+ try:
+ # Map file types to export formats
+ export_formats = {
+ 'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx
+ 'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ 'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx
+ 'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+ 'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx
+ 'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+ 'pdf': 'application/pdf',
+ }
+
+ export_format = export_formats.get(file_type, 'application/pdf')
+ # Google Docs only accepts "docx" (not "doc") as an export format parameter
+ doc_format = 'docx' if file_type in ('doc', 'docx') else file_type
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format={doc_format}"
+
+ if 'sheet' in file_type or 'xlsx' in file_type:
+ export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
+ elif 'ppt' in file_type or 'presentation' in file_type:
+ export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
+ elif file_type == 'pdf':
+ export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
+
+ async with self.context.new_page() as page:
+ # Get cookies from the main view page first
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
+
+ # Fetch the export through the page's request context so the Drive cookies are
+ # reused and the navigation is not aborted by the attachment download
+ response = await page.request.get(export_url)
+
+ if response.status == 200:
+ content = await response.body()
+ with open(save_path, 'wb') as f:
+ f.write(content)
+ return os.path.exists(save_path) and os.path.getsize(save_path) > 0
+ else:
+ logger.warning(f"Export failed with status {response.status}")
+ return False
+
+ except Exception as e:
+ logger.error(f"Error exporting Google Doc: {e}")
+ return False
+
+ async def get_google_drive_file_info(self, file_id):
+ """Get file type and view-only status from Google Drive"""
+ file_type = None
+ is_view_only = False
+
+ try:
+ async with self.context.new_page() as page:
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
+
+ # Check if view-only
+ view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
+ is_view_only = view_only_text is not None
+
+ # Check for Google Docs viewer
+ gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
+ gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
+ gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
+
+ if gdocs_viewer:
+ file_type = 'docx'
+ elif gsheets_viewer:
+ file_type = 'xlsx'
+ elif gslides_viewer:
+ file_type = 'pptx'
+ else:
+ # Check for PDF viewer
+ pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
+ if pdf_viewer:
+ file_type = 'pdf'
+ else:
+ # Check for image viewer
+ img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
+ if img_viewer:
+ # Get image type from src
+ img_src = await img_viewer.get_attribute('src')
+ if 'jpg' in img_src or 'jpeg' in img_src:
+ file_type = 'jpg'
+ elif 'png' in img_src:
+ file_type = 'png'
+ else:
+ file_type = 'jpg' # Default to jpg
+ else:
+ # Generic file type fallback
+ file_type = 'pdf' # Default to PDF
+
+ # If still no type, check filename
+ if not file_type:
+ title_element = await page.query_selector('div[role="heading"]')
+ if title_element:
+ title = await title_element.text_content()
+ if title:
+ ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
+ if ext_match:
+ file_type = ext_match.group(1).lower()
+
+ except Exception as e:
+ logger.error(f"Error getting Google Drive file info: {e}")
+ file_type = 'pdf' # Default to PDF if we can't determine
+
+ return file_type, is_view_only
+
+ # IMPROVED: Enhanced sublink extraction method
+ async def get_sublinks(self, url, limit=10000):
+ """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
+ links = set()
+ try:
+ logger.info(f"Fetching sublinks from: {url}")
+
+ # Check if this is a direct download link
+ if is_download_link(url):
+ logger.info(f"URL appears to be a direct download link: {url}")
+ links.add(url)
+ return list(links)[:limit]
+
+ # Skip if we've already visited this URL
+ normalized_url = normalize_download_url(url)
+ if normalized_url in self.visited_urls:
+ logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}")
+ return list(links)[:limit]
+
+ # Add to visited URLs
+ self.visited_urls.add(normalized_url)
+
+ # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
+ if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
+ ["exam", "test", "pastpaper", "eduexp"]):
+ logger.info("Using specialized exam site sublink extraction")
+ edu_links = await self.get_edu_exam_links(url)
+ for link in edu_links:
+ links.add(link)
+
+ # If we found a good number of links with the specialized method, return them
+ if len(links) > 5:
+ logger.info(f"Found {len(links)} sublinks with specialized method")
+ return list(links)[:limit]
+
+ # Rotate proxy if needed
+ await self.rotate_proxy_if_needed()
+
+ # Standard sublink extraction for all sites
+ try:
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
+ except Exception as e:
+ logger.warning(f"Error navigating to URL for sublink extraction: {e}")
+ # Continue with what we have, we'll try to extract links anyway
+
+ # Get base URL for resolving relative links
+ parsed_base = urlparse(url)
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+ path_base = os.path.dirname(parsed_base.path)
+
+ # Perform initial scrolling to load lazy content
+ await self.page.evaluate("""
+ async () => {
+ const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+ const height = document.body.scrollHeight;
+ const step = Math.floor(window.innerHeight / 2);
+
+ for (let i = 0; i < height; i += step) {
+ window.scrollTo(0, i);
+ await delay(150);
+ }
+
+ window.scrollTo(0, 0);
+ }
+ """)
+ await self.page.wait_for_timeout(1000)
+
+ # Check if page has ASP.NET elements which might need special handling
+ is_aspnet = await self.page.evaluate('''
+ () => {
+ return document.querySelector('form#aspnetForm') !== null ||
+ document.querySelector('input[name="__VIEWSTATE"]') !== null;
+ }
+ ''')
+
+ if is_aspnet:
+ logger.info("Detected ASP.NET page, using enhanced extraction method")
+
+ # Try to interact with ASP.NET controls that might reveal more links
+ # Look for dropdowns, buttons, and grid elements
+ dropdowns = await self.page.query_selector_all('select')
+ buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
+
+ # Try interacting with dropdowns first
+ for dropdown in dropdowns:
+ try:
+ # Get all options
+ options = await self.page.evaluate('''
+ (dropdown) => {
+ return Array.from(dropdown.options).map(o => o.value);
+ }
+ ''', dropdown)
+
+ # Try selecting each option
+ for option in options:
+ if option:
+ await dropdown.select_option(value=option)
+ await self.page.wait_for_timeout(1000)
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+ # Extract any new links that appeared
+ await self.extract_all_link_types(links, base_url, path_base)
+ except Exception as e:
+ logger.warning(f"Error interacting with dropdown: {e}")
+
+ # Try clicking buttons (but avoid dangerous ones like "delete")
+ safe_buttons = []
+ for button in buttons:
+ button_text = await button.text_content() or ""
+ button_value = await button.get_attribute("value") or ""
+ button_id = await button.get_attribute("id") or ""
+ combined_text = (button_text + button_value + button_id).lower()
+
+ # Skip potentially destructive buttons
+ if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
+ continue
+
+ # Prioritize buttons that might show more content
+ if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
+ safe_buttons.append(button)
+
+ # Click the safe buttons
+ for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks
+ try:
+ await button.click()
+ await self.page.wait_for_timeout(1000)
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+ # Extract any new links that appeared
+ await self.extract_all_link_types(links, base_url, path_base)
+ except Exception as e:
+ logger.warning(f"Error clicking button: {e}")
+
+ # Extract links from the initial page state
+ await self.extract_all_link_types(links, base_url, path_base)
+
+ # Look specifically for links inside grid/table views which are common in ASP.NET applications
+ grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
+ for cell in grid_cells:
+ try:
+ href = await cell.get_attribute('href')
+ if href:
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+ links.add(full_url)
+ except Exception as e:
+ logger.warning(f"Error extracting grid link: {e}")
+
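+ # __doPostBack(eventTarget, eventArgument) is the WebForms helper that submits the page's
+ # single form; controls wired through it carry no usable href, so we trigger the postback
+ # from JavaScript and re-scan the DOM for links afterwards.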
+ # Extract links from onclick attributes and javascript:__doPostBack calls
+ postback_links = await self.page.evaluate('''
+ () => {
+ const results = [];
+ // Find elements with onclick containing __doPostBack
+ const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
+ for (const el of elements) {
+ // Extract the postback target
+ const onclick = el.getAttribute('onclick') || '';
+ const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
+ if (match && match[1]) {
+ // Get the visible text to use as description
+ const text = el.innerText || el.textContent || 'Link';
+ results.push({
+ id: match[1],
+ text: text.trim()
+ });
+ }
+ }
+ return results;
+ }
+ ''')
+
+ # Try interacting with some of the postback links
+ for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions
+ try:
+ logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
+ await self.page.evaluate(f'''
+ () => {{
+ if (typeof __doPostBack === 'function') {{
+ __doPostBack('{postback["id"]}', '');
+ }}
+ }}
+ ''')
+ await self.page.wait_for_timeout(1500)
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+ # Extract any new links that appeared
+ await self.extract_all_link_types(links, base_url, path_base)
+ except Exception as e:
+ logger.warning(f"Error with postback: {e}")
+
+ # Look for pagination controls and try to navigate through them
+ pagination_elements = await self.page.query_selector_all(
+ 'a[href*="page"], .pagination a, .pager a, [onclick*="page"], [aria-label*="Next"]'
+ )
+
+ # Try clicking on pagination links (limit to max 5 pages to avoid infinite loops)
+ for i in range(min(5, len(pagination_elements))):
+ try:
+ # Focus on elements that look like "next page" buttons
+ el = pagination_elements[i]
+ el_text = await el.text_content() or ""
+
+ # Only click if this looks like a pagination control
+ if "next" in el_text.lower() or ">" == el_text.strip() or "โ" == el_text.strip():
+ logger.info(f"Clicking pagination control: {el_text}")
+ await el.click()
+ await self.page.wait_for_timeout(2000)
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+ # Get new links from this page
+ await self.extract_all_link_types(links, base_url, path_base)
+ except Exception as e:
+ logger.warning(f"Error clicking pagination: {e}")
+
+ # Check for hidden links that might be revealed by JavaScript
+ hidden_links = await self.page.evaluate("""
+ () => {
+ // Try to execute common JavaScript patterns that reveal hidden content
+ try {
+ // Common patterns used in websites to initially hide content
+ const hiddenContainers = document.querySelectorAll(
+ '.hidden, .hide, [style*="display: none"], [style*="visibility: hidden"]'
+ );
+
+ // Attempt to make them visible
+ hiddenContainers.forEach(el => {
+ el.style.display = 'block';
+ el.style.visibility = 'visible';
+ el.classList.remove('hidden', 'hide');
+ });
+
+ // Return any newly visible links
+ return Array.from(document.querySelectorAll('a[href]')).map(a => a.href);
+ } catch (e) {
+ return [];
+ }
+ }
+ """)
+
+ # Add any newly discovered links
+ for href in hidden_links:
+ if href and not href.startswith('javascript:'):
+ links.add(href)
+
+ # Find all download links
+ download_links = await self.page.evaluate("""
+ () => {
+ return Array.from(document.querySelectorAll('a[href]'))
+ .filter(a => {
+ const href = a.href.toLowerCase();
+ return href.includes('download') ||
+ href.includes('file') ||
+ href.includes('get') ||
+ href.includes('view.php') ||
+ href.includes('action=') ||
+ href.includes('fname=');
+ })
+ .map(a => a.href);
+ }
+ """)
+
+ for download_link in download_links:
+ links.add(download_link)
+
+ # Also check for hidden links in JavaScript, iframes, or dynamic content
+ js_links = await self.discover_hidden_links(self.page)
+ for link in js_links:
+ links.add(link)
+
+ logger.info(f"Found {len(links)} sublinks")
+
+ # Prioritize download links
+ prioritized_links = []
+ normal_links = []
+
+ for link in links:
+ if is_download_link(link):
+ prioritized_links.append(link)
+ else:
+ normal_links.append(link)
+
+ # Return prioritized links first, then normal links, up to the limit
+ result = prioritized_links + normal_links
+ return result[:limit]
+
+ except Exception as e:
+ logger.error(f"Error getting sublinks from {url}: {e}")
+ return list(links)[:limit] # Return what we have so far
+
+ async def extract_all_link_types(self, links_set, base_url, path_base):
+ """Extract all types of links from the current page"""
+ # Get all <a href> links
+ a_links = await self.page.query_selector_all('a[href]')
+ for a in a_links:
+ try:
+ href = await a.get_attribute('href')
+ if href and not href.startswith('javascript:') and not href.startswith('#'):
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+ links_set.add(full_url)
+ except Exception:
+ pass
+
+ # Get iframe sources
+ iframes = await self.page.query_selector_all('iframe[src]')
+ for iframe in iframes:
+ try:
+ src = await iframe.get_attribute('src')
+ if src and not src.startswith('javascript:') and not src.startswith('about:'):
+ full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
+ links_set.add(full_url)
+ except Exception:
+ pass
+
+ # Get links from onclick attributes that reference URLs
+ onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
+ for el in onclick_elements:
+ try:
+ onclick = await el.get_attribute('onclick')
+ urls = re.findall(r'(https?://[^\'"]+)', onclick)
+ for url in urls:
+ links_set.add(url)
+ except Exception:
+ pass
+
+ # Look for URLs in data-* attributes
+ data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
+ for el in data_elements:
+ for attr in ['data-url', 'data-href', 'data-src']:
+ try:
+ value = await el.get_attribute(attr)
+ if value and not value.startswith('javascript:'):
+ full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
+ links_set.add(full_url)
+ except Exception:
+ pass
+
+ # Look for special anchor links that might not have href attributes
+ special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
+ for anchor in special_anchors:
+ try:
+ href = await anchor.get_attribute('href')
+ if href and not href.startswith('javascript:') and not href.startswith('#'):
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+ links_set.add(full_url)
+ except Exception:
+ pass
+
+ # Extract links from JSON data embedded in the page
+ script_elements = await self.page.query_selector_all('script[type="application/json"], script[type="text/json"]')
+ for script in script_elements:
+ try:
+ script_content = await script.text_content()
+ if script_content:
+ # Look for URLs in the JSON content
+ urls = re.findall(r'(https?://[^\'"]+)', script_content)
+ for url in urls:
+ links_set.add(url)
+ except Exception:
+ pass
+
+ def resolve_relative_url(self, relative_url, base_url, path_base):
+ """Properly resolve relative URLs considering multiple formats"""
+ if relative_url.startswith('/'):
+ # Absolute path relative to domain
+ return f"{base_url}{relative_url}"
+ elif relative_url.startswith('./'):
+ # Explicit relative path
+ return f"{base_url}{path_base}/{relative_url[2:]}"
+ elif relative_url.startswith('../'):
+ # Parent directory
+ parent_path = '/'.join(path_base.split('/')[:-1])
+ return f"{base_url}{parent_path}/{relative_url[3:]}"
+ else:
+ # Regular relative path
+ return f"{base_url}{path_base}/{relative_url}"
+
+ async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
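+ """Crawl the URL and its sublinks, returning a deduplicated list of file-info dicts (url, filename, size, ...)."""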
+ if not custom_ext_list:
+ custom_ext_list = []
+ progress_text = st.empty()
+ progress_bar = st.progress(0)
+ file_count_text = st.empty()
+
+ try:
+ # Reset the visited URLs for a fresh deep search
+ self.visited_urls = set()
+
+ progress_text.text("Analyzing main page...")
+ # Special handling for ASP.NET pages
+ is_aspnet = False
+ try:
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
+ is_aspnet = await self.page.evaluate('''
+ () => {
+ return document.querySelector('form#aspnetForm') !== null ||
+ document.querySelector('input[name="__VIEWSTATE"]') !== null;
+ }
+ ''')
+ except Exception:
+ pass
+
+ # Check if this URL is a direct download
+ if is_download_link(url):
+ progress_text.text("URL appears to be a direct download. Analyzing...")
+
+ # Try to extract file directly
+ normalized_url = normalize_download_url(url)
+ file_info = {
+ 'url': normalized_url,
+ 'download_url': normalized_url,
+ 'filename': os.path.basename(urlparse(normalized_url).path) or 'download',
+ 'size': 'Unknown Size',
+ 'metadata': {}
+ }
+
+ # Add to visited URLs
+ self.visited_urls.add(normalized_url)
+ progress_bar.progress(1.0)
+ return [file_info]
+
+ # Extract files from main page
+ main_files = await self.extract_downloadable_files(url, custom_ext_list)
+ initial_count = len(main_files)
+ file_count_text.text(f"Found {initial_count} files on main page")
+
+ # Get sublinks with enhanced method
+ progress_text.text("Getting sublinks...")
+ sublinks = await self.get_sublinks(url, sublink_limit)
+ total_links = len(sublinks)
+ progress_text.text(f"Found {total_links} sublinks to process")
+
+ # Always include files from the main page, regardless of sublinks
+ all_files = main_files
+
+ if not sublinks:
+ progress_bar.progress(1.0)
+ return all_files
+
+ # Process each sublink
+ for i, sublink in enumerate(sublinks, 1):
+ progress = i / total_links
+ progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
+ progress_bar.progress(progress)
+
+ try:
+ # Check if this is a direct download link
+ if is_download_link(sublink):
+ # For download links, just add the link directly
+ normalized_url = normalize_download_url(sublink)
+
+ # Skip if already visited
+ if normalized_url in self.visited_urls:
+ continue
+
+ # Mark as visited
+ self.visited_urls.add(normalized_url)
+
+ # Get file size if possible
+ size_str = await self.get_file_size(normalized_url)
+
+ # Get filename, with fallback to domain-based name
+ filename = os.path.basename(urlparse(normalized_url).path)
+ if not filename or filename == '/' or '?' in filename:
+ domain = get_domain(normalized_url)
+ ext = '.pdf' # Default extension
+ for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']:
+ if common_ext in normalized_url.lower():
+ ext = common_ext
+ break
+ filename = f"file_from_{domain}{ext}"
+
+ # Add file to results
+ all_files.append({
+ 'url': normalized_url,
+ 'download_url': normalized_url,
+ 'filename': filename,
+ 'size': size_str,
+ 'metadata': {}
+ })
+ file_count_text.text(f"Found {len(all_files)} total files")
+ continue
+
+ # For regular links, use a longer timeout for ASP.NET pages which can be slower
+ sub_timeout = timeout * 2 if is_aspnet else timeout
+
+ # Skip already visited URLs
+ if sublink in self.visited_urls:
+ continue
+
+ # Extract files from sublink
+ sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+ all_files.extend(sub_files)
+ file_count_text.text(f"Found {len(all_files)} total files")
+ except Exception as e:
+ logger.warning(f"Error processing sublink {sublink}: {e}")
+
+ # Deduplicate files
+ seen_urls = set()
+ unique_files = []
+ for f in all_files:
+ if f['url'] not in seen_urls:
+ seen_urls.add(f['url'])
+ unique_files.append(f)
+
+ final_count = len(unique_files)
+ progress_text.text(f"Deep search complete!")
+ file_count_text.text(f"Found {final_count} unique files")
+ progress_bar.progress(1.0)
+ return unique_files
+
+ except Exception as e:
+ logger.error(f"Deep search error: {e}")
+ progress_text.text(f"Error during deep search: {str(e)}")
+ return []
+
+ finally:
+ await asyncio.sleep(2)
+ if not st.session_state.get('keep_progress', False):
+ progress_text.empty()
+ progress_bar.empty()
+
+# -------------------- Main App --------------------
+def main():
+
+ # Custom CSS for better appearance
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ # Initialize session state for storing files
if 'files' not in st.session_state:
st.session_state.files = []
if 'downloaded_paths' not in st.session_state:
@@ -66,15 +3887,976 @@ def initialize_session_state():
st.session_state.proxy_string = None
if 'stealth_mode' not in st.session_state:
st.session_state.stealth_mode = True
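+ # st.session_state persists across Streamlit reruns, so search results, downloads and
+ # settings survive the script re-execution triggered by every user interaction.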
+
+ # ============================
+ # SIDEBAR
+ # ============================
+ with st.sidebar:
+ st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50)
+ st.markdown(" {mode_descriptions[st.session_state.mode]}Advanced File Downloader
", unsafe_allow_html=True)
+ with col2:
+ st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)
+
+ mode_descriptions = {
+ "Standard": "A versatile tool for discovering and downloading files from any website.",
+ "Education Mode": "Optimized for educational resources, exams, and academic materials.",
+ "Research Mode": "Focused on research papers, datasets, and academic publications.",
+ "Media Mode": "Enhanced for finding and downloading images, videos, and audio files."
+ }
+
+ st.markdown(f"Find and Download Files
", unsafe_allow_html=True)
+
+ col1, col2 = st.columns([3, 1])
+ with col1:
+ url = st.text_input("Enter a URL to search for downloadable files:",
+ placeholder="e.g., https://example.com/resources",
+ value=st.session_state.get('preset_url', ''))
+ with col2:
+ # Initialize search_method with either session state or default value
+ initial_search_method = st.session_state.get('search_method', "Deep Search")
+ search_method = st.selectbox("Search Method",
+ ["Deep Search", "Quick Search", "Exam Site Mode"],
+ index=["Deep Search", "Quick Search", "Exam Site Mode"].index(initial_search_method))
+ # Update session state when changed
+ if search_method != st.session_state.get('search_method'):
+ st.session_state.search_method = search_method
+
+ # Advanced options in an expander
+ with st.expander("Search Options", expanded=False):
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ depth = st.slider("Search Depth", min_value=1, max_value=5, value=2,
+ help="Higher values will search more links but take longer")
+ prioritize_pdfs = st.checkbox("Prioritize PDFs",
+ value=st.session_state.get('prioritize_pdfs', True),
+ help="Focus on finding PDF files first")
+ with col2:
+ timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
+ follow_subdomains = st.checkbox("Follow Subdomains", value=True,
+ help="Include links from subdomains in the search")
+ with col3:
+ # Default extensions based on mode
+ default_extensions = {
+ "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
+ "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
+ "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
+ "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov"
+ }
+
+ custom_extensions = st.text_area(
+ "Custom File Extensions",
+ value=st.session_state.get('custom_extensions', default_extensions[st.session_state.mode]),
+ help="Comma-separated list of file extensions to look for"
+ )
+
+ # Update session state when extensions changed
+ if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions:
+ st.session_state.custom_extensions = custom_extensions
+
+ search_col1, search_col2 = st.columns([4, 1])
+ with search_col1:
+ search_button = st.button("🔍 Start Search", use_container_width=True)
+ with search_col2:
+ clear_button = st.button("🧹 Clear Results", use_container_width=True)
+
+ # File results section
+ if st.session_state.files:
+ st.markdown("Found Files
", unsafe_allow_html=True)
+
+ # File filtering options
+ filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1])
+ with filter_col1:
+ file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.")
+ with filter_col2:
+ sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"])
+ with filter_col3:
+ show_only_pdfs = st.checkbox("PDFs Only", value=False)
+
+ # Sort files based on selection
+ sorted_files = list(st.session_state.files)
+ if sort_option == "Name":
+ sorted_files.sort(key=lambda x: x['filename'])
+ elif sort_option == "Size (Largest)":
+ # Convert size strings to comparable values
+ def parse_size(size_str):
+ if 'Unknown' in size_str:
+ return 0
+ try:
+ value = float(size_str.split(' ')[0])
+ unit = size_str.split(' ')[1]
+ multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
+ return value * multipliers.get(unit, 0)
+ except:
+ return 0
+
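+ # 'Unknown Size' maps to 0 so files without a known size sort last in the largest-first ordering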
+ sorted_files.sort(key=lambda x: parse_size(x['size']), reverse=True)
+ elif sort_option == "Size (Smallest)":
+ def parse_size(size_str):
+ if 'Unknown' in size_str:
+ return float('inf')
+ try:
+ value = float(size_str.split(' ')[0])
+ unit = size_str.split(' ')[1]
+ multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
+ return value * multipliers.get(unit, 0)
+ except:
+ return float('inf')
+
+ sorted_files.sort(key=lambda x: parse_size(x['size']))
+
+ # File list with selection
+ file_container = st.container()
+ with file_container:
+ selected_files = []
+ displayed_files = []
+
+ for i, file in enumerate(sorted_files):
+ # Apply filters
+ if file_filter and file_filter.lower() not in file['filename'].lower():
+ continue
+ if show_only_pdfs and not file['filename'].lower().endswith('.pdf'):
+ continue
+
+ displayed_files.append(i)
+ with st.container():
+ col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1])
+ with col1:
+ selected = st.checkbox("", key=f"select_{i}", value=True)
+ if selected:
+ selected_files.append(i)
+ with col2:
+ file_icon = "๐"
+ if file['filename'].lower().endswith('.pdf'):
+ file_icon = "๐"
+ elif file['filename'].lower().endswith(('.doc', '.docx')):
+ file_icon = "๐"
+ elif file['filename'].lower().endswith(('.xls', '.xlsx')):
+ file_icon = "๐"
+ elif file['filename'].lower().endswith(('.ppt', '.pptx')):
+ file_icon = "๐ผ๏ธ"
+ elif file['filename'].lower().endswith(('.jpg', '.png', '.gif')):
+ file_icon = "๐ผ๏ธ"
+ elif file['filename'].lower().endswith(('.mp3', '.wav')):
+ file_icon = "๐"
+ elif file['filename'].lower().endswith(('.mp4', '.avi', '.mov')):
+ file_icon = "๐ฌ"
+
+ st.markdown(f"**{file_icon} {file['filename']}**")
+ st.markdown(f"{file['url'][:60]}...", unsafe_allow_html=True)
+ with col3:
+ st.markdown(f"**Size:** {file['size']}")
+ with col4:
+ st.button("Preview", key=f"preview_{i}")
+
+ st.divider()
+
+ if not displayed_files:
+ st.info("No files match your current filters. Try adjusting your search criteria.")
+
+ # Download options
+ if selected_files:
+ col1, col2 = st.columns(2)
+ with col1:
+ download_dir = st.text_input("Download Directory", value="downloads")
+ with col2:
+ download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)
+
+ download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
+ with download_col1:
+ download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
+ with download_col2:
+ google_drive_button = st.button("📤 Upload to Drive",
+ use_container_width=True,
+ disabled=not st.session_state.google_credentials)
+ with download_col3:
+ select_all = st.button("Select All Files", use_container_width=True)
+
+ # Handle select all button
+ if select_all:
+ for i in displayed_files:
+ st.session_state[f"select_{i}"] = True
+ st.rerun()
+
+ # Download progress/results
+ if st.session_state.download_complete:
+ st.success(f"โ
Downloaded {len(st.session_state.downloaded_paths)} files successfully!")
+ download_links = []
+ for path in st.session_state.downloaded_paths:
+ with open(path, "rb") as f:
+ file_content = f.read()
+ file_name = os.path.basename(path)
+ download_links.append((file_name, file_content))
+
+ if len(download_links) > 0:
+ if download_option == "ZIP Archive":
+ # Create ZIP archive for download
+ zip_path = create_zip_file(st.session_state.downloaded_paths, download_dir)
+ with open(zip_path, "rb") as f:
+ zip_content = f.read()
+ st.download_button("๐ฆ Download ZIP Archive",
+ zip_content,
+ file_name=os.path.basename(zip_path),
+ mime="application/zip")
+ else:
+ # Show individual file download links
+ st.markdown("Download Files
", unsafe_allow_html=True)
+
+ # Create a grid of download buttons
+ cols = st.columns(3)
+ for idx, (name, content) in enumerate(download_links):
+ mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream'
+ with cols[idx % 3]:
+ st.download_button(
+ f"๐ {name}",
+ content,
+ file_name=name,
+ mime=mime_type,
+ key=f"dl_{name}",
+ use_container_width=True
+ )
+
+ # Tab 2: Local File Search
+ with tabs[1]:
+ st.markdown("Search Downloaded Files
", unsafe_allow_html=True)
+ st.write("Upload files to search through their content with AI-powered semantic search.")
+
+ # File upload
+ uploaded_files = st.file_uploader("Upload documents for search",
+ accept_multiple_files=True,
+ type=['pdf', 'docx', 'txt', 'csv', 'json'])
+
+ if uploaded_files:
+ # Build search index on upload
+ col1, col2 = st.columns([4, 1])
+ with col1:
+ use_transformer = st.checkbox("Use AI Transformer Model", value=HAVE_TRANSFORMERS,
+ help="Uses advanced AI for more accurate semantic search (if available)")
+ with col2:
+ if st.button("Build Search Index", use_container_width=True):
+ with st.spinner("Processing files and building search index..."):
+ files_added = 0
+ for uploaded_file in uploaded_files:
+ file_info = {
+ 'filename': uploaded_file.name,
+ 'url': f'local://{uploaded_file.name}',
+ 'size': humanize_file_size(uploaded_file.size)
+ }
+ success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info)
+ if success:
+ files_added += 1
+
+ if files_added > 0:
+ index_built = st.session_state.rag_search.build_index()
+ if index_built:
+ st.success(f"โ
Successfully indexed {files_added} files!")
+ else:
+ st.error("Failed to build search index.")
+ else:
+ st.warning("No valid text could be extracted from the files.")
+
+ # Search interface
+ st.markdown("Search Files
", unsafe_allow_html=True)
+
+ col1, col2 = st.columns([4, 1])
+ with col1:
+ query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
+ with col2:
+ expand_query = st.checkbox("Auto-expand query", value=True,
+ help="Automatically add related terms to your search")
+
+ col1, col2 = st.columns([4, 1])
+ with col1:
+ if st.button("๐ Search Documents", use_container_width=True):
+ if not query:
+ st.warning("Please enter a search query")
+ else:
+ with st.spinner("Searching..."):
+ results = st.session_state.rag_search.search(query, top_k=5, search_chunks=True)
+
+ if results:
+ st.markdown(f"**Found {len(results)} relevant documents:**")
+ for i, result in enumerate(results):
+ with st.container():
+ st.markdown(f"Advanced Settings
", unsafe_allow_html=True)
+
+ config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])
+
+ # Browser Settings tab
+ with config_tabs[0]:
+ col1, col2 = st.columns(2)
+ with col1:
+ use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
+ help="Makes browser harder to detect as automated, but may be slower")
+
+ handle_captchas = st.checkbox("Handle Captchas Automatically", value=False,
+ help="Attempt to solve simple captchas automatically")
+
+ download_timeout = st.slider("Download Timeout (seconds)",
+ min_value=30, max_value=600, value=300,
+ help="Maximum time to wait for downloads to complete")
+ with col2:
+ user_agent = st.selectbox("User Agent", USER_AGENTS, index=0,
+ help="Browser identity to use when accessing websites")
+
+ save_screenshots = st.checkbox("Save Browser Screenshots", value=False,
+ help="Save screenshots when errors occur for debugging")
+
+ browser_lang = st.selectbox("Browser Language",
+ ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
+ index=0)
+
+ if st.button("Update Browser Settings"):
+ st.session_state.stealth_mode = use_stealth
+ st.success("Browser settings updated!")
+
+ # Dependency installation section
+ st.markdown("Dependencies
", unsafe_allow_html=True)
+ if st.button("Install Playwright Dependencies"):
+ with st.spinner("Installing dependencies..."):
+ install_playwright_dependencies()
+
+ # Proxy Configuration tab
+ with config_tabs[1]:
+ proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
+ help="Route requests through a proxy server for anonymity or bypassing restrictions")
+
+ if proxy_enabled:
+ proxy_col1, proxy_col2 = st.columns(2)
+ with proxy_col1:
+ proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
+ proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
+ with proxy_col2:
+ proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
+ proxy_auth = st.text_input("Proxy Authentication (optional)",
+ placeholder="username:password", type="password")
+
+ st.markdown("Proxy Rotation
", unsafe_allow_html=True)
+ use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
+ help="Automatically rotate between multiple proxies for better anonymity")
+
+ if use_proxy_rotation:
+ proxy_list = st.text_area("Proxy List (one per line)",
+ placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
+ rotation_interval = st.slider("Rotation Interval (requests)",
+ min_value=1, max_value=50, value=10,
+ help="How often to switch proxies")
+
+ if st.button("Save Proxy Configuration"):
+ # Construct the proxy string
+ proxy_string = None
+ if proxy_enabled and proxy_host and proxy_port:
+ proxy_prefix = f"{proxy_type.lower()}://"
+ proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
+ proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"
+
+ # Update session state
+ st.session_state.use_proxy = proxy_enabled
+ st.session_state.proxy_string = proxy_string
+
+ # Configure proxy rotation if enabled
+ if use_proxy_rotation and proxy_list:
+ PROXY_ROTATION_CONFIG["enabled"] = True
+ PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
+ PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]
+
+ st.success("Proxy configuration updated!")
+
+ # Download Options tab
+ with config_tabs[2]:
+ col1, col2 = st.columns(2)
+ with col1:
+ st.markdown("Download Behavior
", unsafe_allow_html=True)
+
+ skip_existing = st.checkbox("Skip Existing Files", value=True,
+ help="Don't download files that already exist locally")
+
+ auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
+ help="Automatically rename files instead of overwriting")
+
+ verify_downloads = st.checkbox("Verify Downloads", value=True,
+ help="Check file integrity after download")
+
+ max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
+ help="Number of times to retry failed downloads")
+
+ with col2:
+ st.markdown("File Organization
", unsafe_allow_html=True)
+
+ auto_organize = st.checkbox("Auto-Organize Files", value=True,
+ help="Automatically organize files by type")
+
+ default_dir = st.text_input("Default Download Directory", value="downloads",
+ help="Default location to save downloaded files")
+
+ org_by_domain = st.checkbox("Organize by Domain", value=False,
+ help="Create subdirectories based on source domains")
+
+ org_by_type = st.checkbox("Organize by File Type", value=False,
+ help="Create subdirectories based on file types")
+
+ if st.button("Save Download Settings"):
+ st.session_state.download_settings = {
+ "skip_existing": skip_existing,
+ "auto_rename": auto_rename,
+ "verify_downloads": verify_downloads,
+ "max_retries": max_retries,
+ "auto_organize": auto_organize,
+ "default_dir": default_dir,
+ "org_by_domain": org_by_domain,
+ "org_by_type": org_by_type
+ }
+ st.success("Download settings saved!")
+
+ # System tab
+ with config_tabs[3]:
+ col1, col2 = st.columns(2)
+ with col1:
+ st.markdown("Memory & Performance
", unsafe_allow_html=True)
+
+ max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
+ help="Maximum number of simultaneous downloads")
+
+ memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
+ help="Maximum memory to use for file processing")
+
+ processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
+ help="Number of threads to use for file processing")
+
+ with col2:
+ st.markdown("Logs & Diagnostics
", unsafe_allow_html=True)
+
+ log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
+ help="Detail level for application logs")
+
+ save_debug_info = st.checkbox("Save Debug Information", value=False,
+ help="Save detailed information about program execution")
+
+ log_dir = st.text_input("Log Directory", value="logs",
+ help="Directory to save log files")
+
+ if st.button("Apply System Settings"):
+ st.session_state.system_settings = {
+ "max_concurrent": max_concurrent,
+ "memory_limit": memory_limit,
+ "processing_threads": processing_threads,
+ "log_level": log_level,
+ "save_debug_info": save_debug_info,
+ "log_dir": log_dir
+ }
+ # Update logging configuration
+ log_level_num = getattr(logging, log_level)
+ logging.getLogger().setLevel(log_level_num)
+ st.success("System settings applied!")
+
+ # Reset application button
+ st.markdown("Application Control
", unsafe_allow_html=True)
+ reset_col1, reset_col2 = st.columns([1, 3])
+ with reset_col1:
+ if st.button("Reset Application", use_container_width=True):
+ for key in list(st.session_state.keys()):
+ if key != 'google_credentials': # Preserve Google auth
+ del st.session_state[key]
+ st.success("Application has been reset!")
+ st.rerun()
+ with reset_col2:
+ st.info("This will clear all search results, downloaded files, and reset settings to defaults.")
+
+ # Advanced cleanup options
+ st.markdown("Advanced Options
", unsafe_allow_html=True)
+
+ adv_col1, adv_col2 = st.columns(2)
+ with adv_col1:
+ clear_cache = st.button("Clear Cache", use_container_width=True)
+ if clear_cache:
+ # Clear cached files and temporary data
+ temp_dir = tempfile.gettempdir()
+ try:
+ for f in os.listdir(temp_dir):
+ if f.startswith("playwright") or f.startswith("download"):
+ try:
+ os.remove(os.path.join(temp_dir, f))
+ except:
+ pass
+ st.success("Cache cleared successfully!")
+ except Exception as e:
+ st.error(f"Error clearing cache: {e}")
+
+ with adv_col2:
+ export_settings = st.button("Export Settings", use_container_width=True)
+ if export_settings:
+ # Export current settings to JSON
+ settings = {
+ "mode": st.session_state.mode,
+ "stealth_mode": st.session_state.stealth_mode,
+ "use_proxy": st.session_state.use_proxy,
+ "proxy_string": st.session_state.proxy_string,
+ "custom_extensions": st.session_state.get("custom_extensions", ""),
+ "prioritize_pdfs": st.session_state.get("prioritize_pdfs", True),
+ "system_settings": st.session_state.get("system_settings", {}),
+ "download_settings": st.session_state.get("download_settings", {})
+ }
+
+ settings_json = json.dumps(settings, indent=2)
+ b64 = base64.b64encode(settings_json.encode()).decode()
+ href = f'data:application/json;base64,{b64}'
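+ # Embedding the JSON as a base64 data: URI lets the browser download it from a plain
+ # link without writing a temporary file on the server.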
+ st.markdown(f'<a href="{href}" download="settings.json">Download Settings File</a>', unsafe_allow_html=True)
+
+ # Mode Selection
+ st.markdown(" ", unsafe_allow_html=True)
+
+ # Quick Settings
+ st.markdown(" ", unsafe_allow_html=True)
+
+ # Google Drive Integration
+ st.markdown(" ", unsafe_allow_html=True)
+
+ # Preset buttons for common EDU sites
+ if st.session_state.mode == "Education Mode":
+ st.markdown(" ", unsafe_allow_html=True)
+
+ # Tool status
+ st.markdown(" ", unsafe_allow_html=True)
+
+ # App info
+ st.markdown(" ", unsafe_allow_html=True)
-# Import the UI code while keeping the modular structure
-from ui import setup_ui, create_sidebar, display_file_results, handle_downloads, handle_google_drive_upload
-from main import main as app_main
-
-# Set up and run the application
-def main():
- initialize_session_state()
- app_main()
+ # ============================
+ # MAIN CONTENT AREA
+ # ============================
+
+ # Header section
+ col1, col2 = st.columns([5, 1])
+ with col1:
+ st.markdown("<h1>Advanced File Downloader</h1>", unsafe_allow_html=True)
+ st.markdown(mode_descriptions[st.session_state.mode])
+ with col2:
+ st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)
+
+ # Tab 4: Help
+ with tabs[3]:
+ st.markdown("