"""Streamlit entry point for the Advanced File Downloader.

Renders four top-level tabs (Search & Download, Local File Search, Advanced
Configuration, Help) and runs the site crawl when the user starts a search.
"""

import asyncio
import logging
import os
from typing import NamedTuple

import streamlit as st

from app.ui import (
    setup_ui,
    create_sidebar,
    display_file_results,
    handle_downloads,
    handle_google_drive_upload,
)
from app.download_manager import DownloadManager
from app.rag_search import EnhancedRAGSearch
from app.utils import USER_AGENTS

# `humanize_file_size` was called below without ever being imported, which
# raised NameError as soon as "Build Search Index" was clicked. Use the
# project helper when app.utils exports one; otherwise fall back to a local
# formatter so the indexing path keeps working.
try:
    from app.utils import humanize_file_size
except ImportError:
    def humanize_file_size(size_bytes):
        """Format a byte count as a short human-readable string, e.g. ``1.5 MB``."""
        size = float(size_bytes or 0)
        for unit in ("B", "KB", "MB", "GB"):
            if size < 1024:
                return f"{size:.1f} {unit}"
            size /= 1024
        return f"{size:.1f} TB"


_SEARCH_METHODS = ["Deep Search", "Quick Search", "Exam Site Mode"]

_MODE_DESCRIPTIONS = {
    "Standard": "A versatile tool for discovering and downloading files from any website.",
    "Education Mode": "Optimized for educational resources, exams, and academic materials.",
    "Research Mode": "Focused on research papers, datasets, and academic publications.",
    "Media Mode": "Enhanced for finding and downloading images, videos, and audio files.",
}

# Default extension list offered in the "Custom File Extensions" box, per mode.
_DEFAULT_EXTENSIONS = {
    "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
    "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
    "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
    "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov",
}

_SEARCH_TIPS = """
### Effective Search Tips

- **Be specific** with your queries for more accurate results
- **Try different phrasings** if you don't get the results you expect
- Use **quotation marks** for exact phrase matching
- For **complex topics**, break down your search into multiple queries
- **Combine related terms** to improve recall

The search engine uses advanced algorithms to understand the semantic meaning of your query, not just keyword matching.
"""

_HELP_QUICK_START = """
### Getting Started

1. **Enter a URL** on the Search & Download tab
2. Select a **Search Method**:
   - **Deep Search**: Thorough but slower
   - **Quick Search**: Fast but may miss some files
   - **Exam Site Mode**: Optimized for educational resource sites
3. Click **Start Search** to find downloadable files
4. Select files you want to download
5. Click **Download Selected Files**

#### Using Different Modes

Select a mode from the sidebar to optimize the tool for different use cases:

- **Standard Mode**: Balanced for general use
- **Education Mode**: Optimized for finding academic materials
- **Research Mode**: Better for research papers and datasets
- **Media Mode**: Enhanced for finding images, videos, and audio

For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
"""

_HELP_ADVANCED = """
### Advanced Features

- **Local File Search**: Upload files and search through their content using the enhanced RAG search
- **Custom Extensions**: Specify additional file types to look for beyond the default set
- **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
- **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
- **Google Drive Integration**: Upload downloaded files directly to your Google Drive

#### Search Tips

- For educational sites, include specific terms like "exam", "test", "paper" in the URL
- When using Local File Search, try different variations of your query for better results
- Use filtering and sorting options to find the most relevant files quickly

#### File Organization

You can configure automatic file organization in the Advanced Configuration tab:

- **Organize by Domain**: Creates folders based on the source website
- **Organize by File Type**: Separates files into folders by their extension
- **Auto-Rename**: Prevents overwriting existing files with same names
"""

_HELP_TROUBLESHOOTING = """
### Troubleshooting

#### Common Issues

- **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
- **Downloads failing**: Check if the site requires authentication or uses captchas
- **Slow performance**: Reduce search depth or disable stealth mode for faster results
- **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings

#### Captcha Issues

Some websites use captchas to prevent automated access. If you encounter captchas:

1. Try using a different proxy
2. Enable "Handle Captchas Automatically" for simple captchas
3. For complex captchas, you may need to manually access the site first

#### Proxy Problems

If you're having issues with proxies:

1. Verify your proxy is working with an external tool
2. Check that you've entered the correct format (http://host:port)
3. Some websites may block known proxy IPs

#### Memory Usage

If the application is using too much memory:

1. Reduce the "Memory Limit" in System settings
2. Process fewer files at once
3. Use lower search depth values
"""

_HELP_ABOUT = """
### About This Tool

**Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.

#### Key Features

- **Smart Discovery**: Finds downloadable files even when they're not directly linked
- **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
- **Educational Focus**: Specialized detection for exam papers and academic resources
- **Stealth Capabilities**: Avoids detection by anti-scraping measures

#### Technical Details

This tool uses:

- **Playwright**: For browser automation and stealth capabilities
- **Sentence Transformers**: For AI-powered semantic search
- **Streamlit**: For the user interface
- **Google Drive API**: For cloud integration

#### Credits

Created with Python, Streamlit, Playwright, and various AI libraries.

For issues or suggestions, please contact the developer.

Version 2.0 - March 2025
"""


class SearchForm(NamedTuple):
    """Values collected from the Search & Download tab for the current run."""

    url: str
    search_method: str
    depth: int
    timeout: int
    custom_extensions: str
    search_clicked: bool
    clear_clicked: bool


def initialize_session_state():
    """Initialize session state variables.

    Only keys that are missing are written, so values already set earlier in
    the session are left untouched.
    """
    defaults = {
        'files': [],
        'downloaded_paths': [],
        'download_complete': False,
        'selected_tab': 0,
        'keep_progress': False,
        'google_credentials': None,
        'mode': "Standard",
        'use_proxy': False,
        'proxy_string': None,
        'stealth_mode': True,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

    # Kept out of the dict above so the search object is constructed only
    # when the key is actually missing, not on every script run.
    if 'rag_search' not in st.session_state:
        st.session_state.rag_search = EnhancedRAGSearch()


def _render_html(text):
    """Render *text* as its own markdown block with HTML allowed."""
    st.markdown(f"\n\n{text}\n\n", unsafe_allow_html=True)


def _render_header():
    """Draw the page title, the icon, and the description of the active mode."""
    col1, col2 = st.columns([5, 1])
    with col1:
        _render_html("Advanced File Downloader")
    with col2:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)

    # Fall back to the Standard text instead of raising KeyError on an
    # unrecognised mode value.
    _render_html(_MODE_DESCRIPTIONS.get(st.session_state.mode, _MODE_DESCRIPTIONS["Standard"]))


def _render_download_controls(selected_files, displayed_files):
    """Draw the download / Drive / select-all controls and act on clicks."""
    col1, col2 = st.columns(2)
    with col1:
        download_dir = st.text_input("Download Directory", value="downloads")
    with col2:
        download_option = st.radio(
            "Download as", ["Individual Files", "ZIP Archive"], horizontal=True
        )

    download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
    with download_col1:
        download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
    with download_col2:
        google_drive_button = st.button(
            "📤 Upload to Drive",
            use_container_width=True,
            disabled=not st.session_state.google_credentials,
        )
    with download_col3:
        select_all = st.button("Select All Files", use_container_width=True)

    if select_all:
        for i in displayed_files:
            st.session_state[f"select_{i}"] = True
        st.rerun()

    if download_button:
        os.makedirs(download_dir, exist_ok=True)
        handle_downloads(selected_files, download_dir, download_option, download_col1)

    if google_drive_button:
        handle_google_drive_upload(selected_files)


def _render_search_tab():
    """Draw the Search & Download tab and return the submitted form values.

    Returns:
        SearchForm: the URL, search options and button states for this run.
    """
    _render_html("Find and Download Files")

    col1, col2 = st.columns([3, 1])
    with col1:
        url = st.text_input(
            "Enter a URL to search for downloadable files:",
            placeholder="e.g., https://example.com/resources",
            value=st.session_state.get('preset_url', ''),
        )
    with col2:
        remembered = st.session_state.get('search_method', "Deep Search")
        # A stale or unknown remembered value falls back to the first entry
        # instead of raising ValueError from list.index().
        start_index = _SEARCH_METHODS.index(remembered) if remembered in _SEARCH_METHODS else 0
        search_method = st.selectbox("Search Method", _SEARCH_METHODS, index=start_index)
        st.session_state.search_method = search_method

    with st.expander("Search Options", expanded=False):
        col1, col2, col3 = st.columns(3)
        with col1:
            depth = st.slider(
                "Search Depth", min_value=1, max_value=5, value=2,
                help="Higher values will search more links but take longer",
            )
            # NOTE(review): this value and "Follow Subdomains" below are shown
            # to the user but not passed to the search in this file.
            st.checkbox(
                "Prioritize PDFs",
                value=st.session_state.get('prioritize_pdfs', True),
                help="Focus on finding PDF files first",
            )
        with col2:
            timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
            st.checkbox(
                "Follow Subdomains", value=True,
                help="Include links from subdomains in the search",
            )
        with col3:
            mode_default = _DEFAULT_EXTENSIONS.get(
                st.session_state.mode, _DEFAULT_EXTENSIONS["Standard"]
            )
            custom_extensions = st.text_area(
                "Custom File Extensions",
                value=st.session_state.get('custom_extensions', mode_default),
                help="Comma-separated list of file extensions to look for",
            )
            st.session_state.custom_extensions = custom_extensions

    search_col1, search_col2 = st.columns([4, 1])
    with search_col1:
        search_button = st.button("🔍 Start Search", use_container_width=True)
    with search_col2:
        clear_button = st.button("🧹 Clear Results", use_container_width=True)

    if st.session_state.files:
        selected_files, displayed_files = display_file_results(st.session_state.files)
        if selected_files:
            _render_download_controls(selected_files, displayed_files)

    return SearchForm(
        url=url,
        search_method=search_method,
        depth=depth,
        timeout=timeout,
        custom_extensions=custom_extensions,
        search_clicked=search_button,
        clear_clicked=clear_button,
    )


def _build_search_index(uploaded_files):
    """Add every uploaded file to the RAG search object and build its index."""
    with st.spinner("Processing files and building search index..."):
        files_added = 0
        for uploaded_file in uploaded_files:
            file_info = {
                'filename': uploaded_file.name,
                'url': f'local://{uploaded_file.name}',
                'size': humanize_file_size(uploaded_file.size),
            }
            if st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info):
                files_added += 1

        if files_added > 0:
            if st.session_state.rag_search.build_index():
                st.success(f"✅ Successfully indexed {files_added} files!")
            else:
                st.error("Failed to build search index.")
        else:
            st.warning("No valid text could be extracted from the files.")


def _render_search_results(results):
    """Show the ranked matches, or a hint when there are none."""
    if not results:
        st.info("No matching documents found. Try a different query.")
        return

    st.markdown(f"**Found {len(results)} relevant documents:**")
    for rank, result in enumerate(results, start=1):
        with st.container():
            # NOTE(review): the two blank markdown calls around each result
            # carry no markup in this file; restore the wrapper tags here if
            # a styled result card was intended.
            st.markdown("\n", unsafe_allow_html=True)
            st.markdown(
                f"**{rank}. {result['file_info']['filename']}** (Score: {result['score']:.2f})"
            )
            if result.get('chunk_preview'):
                st.markdown("**Matching content:**")
                st.text(result['chunk_preview'])
            st.markdown("\n", unsafe_allow_html=True)


def _render_local_search_tab():
    """Draw the Local File Search tab: upload, index building, and querying."""
    _render_html("Search Downloaded Files")
    st.write("Upload files to search through their content with AI-powered semantic search.")

    uploaded_files = st.file_uploader(
        "Upload documents for search",
        accept_multiple_files=True,
        type=['pdf', 'docx', 'txt', 'csv', 'json'],
    )

    if uploaded_files:
        col1, col2 = st.columns([4, 1])
        with col1:
            # NOTE(review): the checkbox value is not applied to rag_search here.
            st.checkbox(
                "Use AI Transformer Model",
                value=st.session_state.rag_search.use_transformer,
                help="Uses advanced AI for more accurate semantic search (if available)",
            )
        with col2:
            if st.button("Build Search Index", use_container_width=True):
                _build_search_index(uploaded_files)

    _render_html("Search Files")

    col1, col2 = st.columns([4, 1])
    with col1:
        query = st.text_input(
            "Enter search query:", placeholder="e.g., neural networks, climate change"
        )
    with col2:
        # NOTE(review): collected but not forwarded to rag_search.search().
        st.checkbox(
            "Auto-expand query", value=True,
            help="Automatically add related terms to your search",
        )

    col1, col2 = st.columns([4, 1])
    # The limit must be read before the search runs so it can be honoured
    # (it used to be ignored in favour of a hard-coded 5). A widget is placed
    # in the column it is created in, so creating it first does not move it.
    with col2:
        num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
    with col1:
        if st.button("🔍 Search Documents", use_container_width=True):
            if not query:
                st.warning("Please enter a search query")
            else:
                with st.spinner("Searching..."):
                    results = st.session_state.rag_search.search(
                        query, top_k=int(num_results), search_chunks=True
                    )
                _render_search_results(results)

    with st.expander("Search Tips", expanded=False):
        st.markdown(_SEARCH_TIPS)


def _render_browser_settings():
    """Draw the Browser Settings sub-tab and the dependency installer."""
    col1, col2 = st.columns(2)
    with col1:
        use_stealth = st.checkbox(
            "Use Stealth Mode", value=st.session_state.stealth_mode,
            help="Makes browser harder to detect as automated, but may be slower",
        )
        # NOTE(review): only the stealth flag is saved by the button below;
        # the other widgets on this sub-tab are displayed but not stored.
        st.checkbox(
            "Handle Captchas Automatically", value=False,
            help="Attempt to solve simple captchas automatically",
        )
        st.slider(
            "Download Timeout (seconds)", min_value=30, max_value=600, value=300,
            help="Maximum time to wait for downloads to complete",
        )
    with col2:
        st.selectbox(
            "User Agent", USER_AGENTS, index=0,
            help="Browser identity to use when accessing websites",
        )
        st.checkbox(
            "Save Browser Screenshots", value=False,
            help="Save screenshots when errors occur for debugging",
        )
        st.selectbox(
            "Browser Language",
            ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
            index=0,
        )

    if st.button("Update Browser Settings"):
        st.session_state.stealth_mode = use_stealth
        st.success("Browser settings updated!")

    _render_html("Dependencies")
    if st.button("Install Playwright Dependencies"):
        from app.ui import install_playwright_dependencies
        with st.spinner("Installing dependencies..."):
            install_playwright_dependencies()


def _render_proxy_settings():
    """Draw the Proxy Configuration sub-tab and persist it on save."""
    proxy_enabled = st.checkbox(
        "Enable Proxy", value=st.session_state.use_proxy,
        help="Route requests through a proxy server for anonymity or bypassing restrictions",
    )

    # Defaults keep the save handler safe when the optional inputs are hidden.
    proxy_type, proxy_host, proxy_port, proxy_auth = "HTTP", "", "", ""
    proxy_list, rotation_interval = "", 10

    if proxy_enabled:
        proxy_col1, proxy_col2 = st.columns(2)
        with proxy_col1:
            proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
            proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
        with proxy_col2:
            proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
            proxy_auth = st.text_input(
                "Proxy Authentication (optional)",
                placeholder="username:password",
                type="password",
            )

    _render_html("Proxy Rotation")
    use_proxy_rotation = st.checkbox(
        "Enable Proxy Rotation", value=False,
        help="Automatically rotate between multiple proxies for better anonymity",
    )
    if use_proxy_rotation:
        proxy_list = st.text_area(
            "Proxy List (one per line)",
            placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080",
        )
        rotation_interval = st.slider(
            "Rotation Interval (requests)", min_value=1, max_value=50, value=10,
            help="How often to switch proxies",
        )

    if st.button("Save Proxy Configuration"):
        proxy_string = None
        if proxy_enabled and proxy_host and proxy_port:
            credentials = f"{proxy_auth}@" if proxy_auth else ""
            proxy_string = f"{proxy_type.lower()}://{credentials}{proxy_host}:{proxy_port}"

        st.session_state.use_proxy = proxy_enabled
        st.session_state.proxy_string = proxy_string

        from app.utils import PROXY_ROTATION_CONFIG
        if use_proxy_rotation and proxy_list:
            PROXY_ROTATION_CONFIG["enabled"] = True
            PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
            PROXY_ROTATION_CONFIG["proxies"] = [
                p.strip() for p in proxy_list.splitlines() if p.strip()
            ]
        else:
            # Saving with rotation unchecked (or with no proxies listed) must
            # switch it off; previously the flag stayed on once enabled.
            PROXY_ROTATION_CONFIG["enabled"] = False

        st.success("Proxy configuration updated!")


def _render_download_settings():
    """Draw the Download Options sub-tab and store the choices on save."""
    col1, col2 = st.columns(2)
    with col1:
        _render_html("Download Behavior")
        skip_existing = st.checkbox(
            "Skip Existing Files", value=True,
            help="Don't download files that already exist locally",
        )
        auto_rename = st.checkbox(
            "Auto-Rename Duplicates", value=True,
            help="Automatically rename files instead of overwriting",
        )
        verify_downloads = st.checkbox(
            "Verify Downloads", value=True, help="Check file integrity after download"
        )
        max_retries = st.slider(
            "Max Retries", min_value=0, max_value=10, value=3,
            help="Number of times to retry failed downloads",
        )
    with col2:
        _render_html("File Organization")
        auto_organize = st.checkbox(
            "Auto-Organize Files", value=True, help="Automatically organize files by type"
        )
        default_dir = st.text_input(
            "Default Download Directory", value="downloads",
            help="Default location to save downloaded files",
        )
        org_by_domain = st.checkbox(
            "Organize by Domain", value=False,
            help="Create subdirectories based on source domains",
        )
        org_by_type = st.checkbox(
            "Organize by File Type", value=False,
            help="Create subdirectories based on file types",
        )

    if st.button("Save Download Settings"):
        st.session_state.download_settings = {
            "skip_existing": skip_existing,
            "auto_rename": auto_rename,
            "verify_downloads": verify_downloads,
            "max_retries": max_retries,
            "auto_organize": auto_organize,
            "default_dir": default_dir,
            "org_by_domain": org_by_domain,
            "org_by_type": org_by_type,
        }
        st.success("Download settings saved!")


def _render_system_settings():
    """Draw the System sub-tab: performance, logging, and application reset."""
    col1, col2 = st.columns(2)
    with col1:
        _render_html("Memory & Performance")
        max_concurrent = st.slider(
            "Max Concurrent Downloads", min_value=1, max_value=10, value=3,
            help="Maximum number of simultaneous downloads",
        )
        memory_limit = st.slider(
            "Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
            help="Maximum memory to use for file processing",
        )
        processing_threads = st.slider(
            "Processing Threads", min_value=1, max_value=8, value=2,
            help="Number of threads to use for file processing",
        )
    with col2:
        _render_html("Logs & Diagnostics")
        log_level = st.selectbox(
            "Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
            help="Detail level for application logs",
        )
        save_debug_info = st.checkbox(
            "Save Debug Information", value=False,
            help="Save detailed information about program execution",
        )
        log_dir = st.text_input("Log Directory", value="logs", help="Directory to save log files")

    if st.button("Apply System Settings"):
        st.session_state.system_settings = {
            "max_concurrent": max_concurrent,
            "memory_limit": memory_limit,
            "processing_threads": processing_threads,
            "log_level": log_level,
            "save_debug_info": save_debug_info,
            "log_dir": log_dir,
        }
        # The selectbox only offers names that exist as logging level constants.
        logging.getLogger().setLevel(getattr(logging, log_level))
        st.success("System settings applied!")

    _render_html("Application Control")
    reset_col1, reset_col2 = st.columns([1, 3])
    with reset_col1:
        if st.button("Reset Application", use_container_width=True):
            for key in list(st.session_state.keys()):
                if key != 'google_credentials':  # Preserve Google auth
                    del st.session_state[key]
            st.success("Application has been reset!")
            st.rerun()
    with reset_col2:
        st.info(
            "This will clear all search results, downloaded files, and reset settings to defaults."
        )


def _render_config_tab():
    """Draw the Advanced Configuration tab and its four sub-tabs."""
    _render_html("Advanced Settings")
    config_tabs = st.tabs(
        ["Browser Settings", "Proxy Configuration", "Download Options", "System"]
    )
    with config_tabs[0]:
        _render_browser_settings()
    with config_tabs[1]:
        _render_proxy_settings()
    with config_tabs[2]:
        _render_download_settings()
    with config_tabs[3]:
        _render_system_settings()


def _render_help_tab():
    """Draw the Help tab with its four documentation pages."""
    _render_html("Help & Documentation")
    help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])
    pages = (_HELP_QUICK_START, _HELP_ADVANCED, _HELP_TROUBLESHOOTING, _HELP_ABOUT)
    for tab, page in zip(help_tabs, pages):
        with tab:
            st.markdown(page)


def _clear_search_state():
    """Drop previous results and any preset URL from session state."""
    st.session_state.files = []
    st.session_state.downloaded_paths = []
    st.session_state.download_complete = False
    if 'preset_url' in st.session_state:
        st.session_state.preset_url = ''


async def _collect_exam_files(manager, url, extensions):
    """Visit each exam link found for *url* and gather its downloadable files.

    The progress widgets and the ``keep_progress`` flag are reset in
    ``finally`` blocks so a failing request cannot leave them stuck.
    """
    all_files = []
    st.session_state.keep_progress = True
    try:
        edu_links = await manager.get_edu_exam_links(url)
        total = len(edu_links)
        progress_text = st.empty()
        progress_bar = st.progress(0)
        try:
            for position, link in enumerate(edu_links, start=1):
                progress_text.text(f"Processing exam link {position}/{total}: {link}")
                progress_bar.progress(position / max(1, total))
                all_files.extend(await manager.extract_downloadable_files(link, extensions))
        finally:
            progress_text.empty()
            progress_bar.empty()
    finally:
        st.session_state.keep_progress = False
    return all_files


def _run_search(form):
    """Run the crawl described by *form*, store the results, and rerun the app."""
    _clear_search_state()

    extensions = [ext.strip() for ext in form.custom_extensions.split(",") if ext.strip()]
    sublink_limit = 5000 if form.search_method == "Deep Search" else 1000
    is_exam_site = form.search_method == "Exam Site Mode"
    # NOTE(review): form.depth (the "Search Depth" slider) is not passed to
    # deep_search below; the old code computed a search_depth local and never
    # used it. Confirm whether DownloadManager.deep_search accepts a depth.

    async def run_search():
        async with DownloadManager(
            use_proxy=st.session_state.use_proxy,
            proxy=st.session_state.proxy_string,
            use_stealth=st.session_state.stealth_mode,
        ) as manager:
            if is_exam_site:
                st.session_state.files = await _collect_exam_files(
                    manager, form.url, extensions
                )
            else:
                st.session_state.files = await manager.deep_search(
                    form.url, extensions, sublink_limit, form.timeout
                )

    asyncio.run(run_search())
    st.rerun()


def main():
    """Build the page, render every tab, then act on the search/clear buttons."""
    initialize_session_state()
    setup_ui()
    create_sidebar()
    _render_header()

    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])
    with tabs[0]:
        form = _render_search_tab()
    with tabs[1]:
        _render_local_search_tab()
    with tabs[2]:
        _render_config_tab()
    with tabs[3]:
        _render_help_tab()

    # Button actions run after all tabs are drawn so the page is complete
    # before a potentially long search starts.
    if form.search_clicked and form.url:
        _run_search(form)
    elif form.search_clicked:
        st.warning("Please enter a URL to search.")

    if form.clear_clicked:
        _clear_search_state()
        st.rerun()


# Entry point
if __name__ == "__main__":
    main()