# Spaces: euler314/craw_web (Running)
# File size: 32,273 Bytes
# 6158c43

import os
import asyncio
import streamlit as st
from app.ui import (
    setup_ui, create_sidebar, display_file_results,
    handle_downloads, handle_google_drive_upload
)
from app.download_manager import DownloadManager
from app.rag_search import EnhancedRAGSearch
from app.utils import USER_AGENTS

def initialize_session_state():
    """Seed ``st.session_state`` with a default for every key the app uses.

    Keys that are already present are left untouched, so calling this at the
    start of each script run never overwrites values set earlier.
    """
    # Factories instead of plain values: every missing key gets a fresh
    # object, and the RAG search engine is only constructed when its key is
    # actually absent.
    default_factories = {
        'files': lambda: [],
        'downloaded_paths': lambda: [],
        'download_complete': lambda: False,
        'selected_tab': lambda: 0,
        'rag_search': lambda: EnhancedRAGSearch(),
        'keep_progress': lambda: False,
        'google_credentials': lambda: None,
        'mode': lambda: "Standard",
        'use_proxy': lambda: False,
        'proxy_string': lambda: None,
        'stealth_mode': lambda: True,
    }

    for state_key, make_default in default_factories.items():
        if state_key in st.session_state:
            continue
        st.session_state[state_key] = make_default()

# Mode used whenever session state holds no (or an unrecognised) mode name.
_DEFAULT_MODE = "Standard"

# One-line description shown under the page header for each mode.
_MODE_DESCRIPTIONS = {
    "Standard": "A versatile tool for discovering and downloading files from any website.",
    "Education Mode": "Optimized for educational resources, exams, and academic materials.",
    "Research Mode": "Focused on research papers, datasets, and academic publications.",
    "Media Mode": "Enhanced for finding and downloading images, videos, and audio files.",
}

# Pre-filled value of the "Custom File Extensions" box for each mode.
_DEFAULT_EXTENSIONS = {
    "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
    "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
    "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
    "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov",
}

# Choices offered by the "Search Method" select box, in display order.
_SEARCH_METHODS = ["Deep Search", "Quick Search", "Exam Site Mode"]


def humanize_file_size(size_bytes):
    """Format a byte count as a short human-readable string.

    Defined here because the local-search tab calls it and no import in this
    file provides it (the call used to raise ``NameError``).

    Args:
        size_bytes: Size in bytes; ``None`` is treated as 0.

    Returns:
        A string such as ``"512 B"``, ``"1.5 KB"`` or ``"5.0 MB"``.
    """
    size = float(size_bytes or 0)
    if size < 1024:
        return f"{int(size)} B"
    for unit in ("KB", "MB", "GB"):
        size /= 1024
        if size < 1024:
            return f"{size:.1f} {unit}"
    return f"{size / 1024:.1f} TB"


def _current_mode():
    """Return the active mode, falling back to the default for unknown values."""
    mode = st.session_state.get('mode', _DEFAULT_MODE)
    return mode if mode in _MODE_DESCRIPTIONS else _DEFAULT_MODE


def _parse_extensions(raw):
    """Split a comma-separated extension string into a list of non-empty, stripped items."""
    return [ext.strip() for ext in raw.split(",") if ext.strip()]


def _build_proxy_string(proxy_type, host, port, auth=""):
    """Return ``scheme://[auth@]host:port``, or ``None`` when host or port is missing."""
    if not (host and port):
        return None
    credentials = f"{auth}@" if auth else ""
    return f"{proxy_type.lower()}://{credentials}{host}:{port}"


def _render_download_controls(selected_files, displayed_files):
    """Render the download / Drive-upload / select-all controls and act on clicks."""
    col1, col2 = st.columns(2)
    with col1:
        download_dir = st.text_input("Download Directory", value="downloads")
    with col2:
        download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)

    download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
    with download_col1:
        download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
    with download_col2:
        google_drive_button = st.button("📤 Upload to Drive",
                                        use_container_width=True,
                                        disabled=not st.session_state.google_credentials)
    with download_col3:
        select_all = st.button("Select All Files", use_container_width=True)

    if select_all:
        # NOTE(review): presumably display_file_results() keys its checkboxes
        # as "select_<i>" - confirm against app.ui.
        for i in displayed_files:
            st.session_state[f"select_{i}"] = True
        st.rerun()

    if download_button:
        os.makedirs(download_dir, exist_ok=True)
        handle_downloads(selected_files, download_dir, download_option, download_col1)

    if google_drive_button:
        handle_google_drive_upload(selected_files)


def _render_search_tab(tab):
    """Render the "Search & Download" tab.

    Returns:
        A dict with the inputs the search handler in ``main`` needs:
        ``url``, ``search_method``, ``depth``, ``timeout``,
        ``custom_extensions``, ``search_button`` and ``clear_button``.
    """
    with tab:
        st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)

        col1, col2 = st.columns([3, 1])
        with col1:
            url = st.text_input("Enter a URL to search for downloadable files:",
                                placeholder="e.g., https://example.com/resources",
                                value=st.session_state.get('preset_url', ''))
        with col2:
            initial_search_method = st.session_state.get('search_method', "Deep Search")
            # An unknown stored value would make list.index() raise ValueError.
            if initial_search_method not in _SEARCH_METHODS:
                initial_search_method = _SEARCH_METHODS[0]
            search_method = st.selectbox("Search Method",
                                         _SEARCH_METHODS,
                                         index=_SEARCH_METHODS.index(initial_search_method))
            if search_method != st.session_state.get('search_method'):
                st.session_state.search_method = search_method

        with st.expander("Search Options", expanded=False):
            col1, col2, col3 = st.columns(3)
            with col1:
                depth = st.slider("Search Depth", min_value=1, max_value=5, value=2,
                                  help="Higher values will search more links but take longer")
                # NOTE(review): this checkbox and "Follow Subdomains" below are
                # rendered but their values are not forwarded to the search.
                st.checkbox("Prioritize PDFs",
                            value=st.session_state.get('prioritize_pdfs', True),
                            help="Focus on finding PDF files first")
            with col2:
                timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
                st.checkbox("Follow Subdomains", value=True,
                            help="Include links from subdomains in the search")
            with col3:
                custom_extensions = st.text_area(
                    "Custom File Extensions",
                    value=st.session_state.get('custom_extensions', _DEFAULT_EXTENSIONS[_current_mode()]),
                    help="Comma-separated list of file extensions to look for"
                )
                if st.session_state.get('custom_extensions') != custom_extensions:
                    st.session_state.custom_extensions = custom_extensions

        search_col1, search_col2 = st.columns([4, 1])
        with search_col1:
            search_button = st.button("🔍 Start Search", use_container_width=True)
        with search_col2:
            clear_button = st.button("🧹 Clear Results", use_container_width=True)

        if st.session_state.files:
            selected_files, displayed_files = display_file_results(st.session_state.files)
            if selected_files:
                _render_download_controls(selected_files, displayed_files)

    return {
        "url": url,
        "search_method": search_method,
        "depth": depth,
        "timeout": timeout,
        "custom_extensions": custom_extensions,
        "search_button": search_button,
        "clear_button": clear_button,
    }


def _index_uploaded_files(uploaded_files):
    """Add each uploaded file to the RAG search engine, build the index, and report the outcome."""
    rag_search = st.session_state.rag_search
    files_added = 0
    for uploaded_file in uploaded_files:
        file_info = {
            'filename': uploaded_file.name,
            'url': f'local://{uploaded_file.name}',
            'size': humanize_file_size(uploaded_file.size)
        }
        if rag_search.add_file(uploaded_file.getvalue(), file_info):
            files_added += 1

    if files_added == 0:
        st.warning("No valid text could be extracted from the files.")
    elif rag_search.build_index():
        st.success(f"✅ Successfully indexed {files_added} files!")
    else:
        st.error("Failed to build search index.")


def _show_search_results(results):
    """Render one card per search hit, or an info message when there are none."""
    if not results:
        st.info("No matching documents found. Try a different query.")
        return

    st.markdown(f"**Found {len(results)} relevant documents:**")
    for rank, result in enumerate(results, start=1):
        with st.container():
            st.markdown("<div class='result-card'>", unsafe_allow_html=True)
            st.markdown(f"**{rank}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")

            if result.get('chunk_preview'):
                st.markdown("**Matching content:**")
                st.text(result['chunk_preview'])

            st.markdown("</div>", unsafe_allow_html=True)


def _render_local_search_tab(tab):
    """Render the "Local File Search" tab: upload, index building and querying."""
    with tab:
        st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
        st.write("Upload files to search through their content with AI-powered semantic search.")

        uploaded_files = st.file_uploader("Upload documents for search",
                                          accept_multiple_files=True,
                                          type=['pdf', 'docx', 'txt', 'csv', 'json'])

        if uploaded_files:
            col1, col2 = st.columns([4, 1])
            with col1:
                # NOTE(review): this checkbox value is displayed but not
                # written back to the search engine.
                st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
                            help="Uses advanced AI for more accurate semantic search (if available)")
            with col2:
                if st.button("Build Search Index", use_container_width=True):
                    with st.spinner("Processing files and building search index..."):
                        _index_uploaded_files(uploaded_files)

            st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)

            col1, col2 = st.columns([4, 1])
            with col1:
                query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
            with col2:
                # NOTE(review): not forwarded to search(); its signature is
                # not visible from this file.
                st.checkbox("Auto-expand query", value=True,
                            help="Automatically add related terms to your search")

            col1, col2 = st.columns([4, 1])
            # Read "Max results" before handling the button so the chosen
            # value is what the search uses; the columns keep the layout
            # identical regardless of the order they are filled in.
            with col2:
                num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
            with col1:
                if st.button("🔍 Search Documents", use_container_width=True):
                    if not query:
                        st.warning("Please enter a search query")
                    else:
                        with st.spinner("Searching..."):
                            results = st.session_state.rag_search.search(
                                query, top_k=num_results, search_chunks=True
                            )
                            _show_search_results(results)

            with st.expander("Search Tips", expanded=False):
                st.markdown("""
                ### Effective Search Tips

                - **Be specific** with your queries for more accurate results
                - **Try different phrasings** if you don't get the results you expect
                - Use **quotation marks** for exact phrase matching
                - For **complex topics**, break down your search into multiple queries
                - **Combine related terms** to improve recall

                The search engine uses advanced algorithms to understand the semantic meaning of your query, 
                not just keyword matching.
                """)


def _render_browser_settings():
    """Render the "Browser Settings" sub-tab and persist stealth mode on request."""
    col1, col2 = st.columns(2)
    with col1:
        use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
                                  help="Makes browser harder to detect as automated, but may be slower")
        # NOTE(review): the remaining widgets in this sub-tab are rendered
        # but their values are not stored anywhere.
        st.checkbox("Handle Captchas Automatically", value=False,
                    help="Attempt to solve simple captchas automatically")
        st.slider("Download Timeout (seconds)",
                  min_value=30, max_value=600, value=300,
                  help="Maximum time to wait for downloads to complete")
    with col2:
        st.selectbox("User Agent", USER_AGENTS, index=0,
                     help="Browser identity to use when accessing websites")
        st.checkbox("Save Browser Screenshots", value=False,
                    help="Save screenshots when errors occur for debugging")
        st.selectbox("Browser Language",
                     ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
                     index=0)

    if st.button("Update Browser Settings"):
        st.session_state.stealth_mode = use_stealth
        st.success("Browser settings updated!")

    st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
    if st.button("Install Playwright Dependencies"):
        from app.ui import install_playwright_dependencies
        with st.spinner("Installing dependencies..."):
            install_playwright_dependencies()


def _render_proxy_settings():
    """Render the "Proxy Configuration" sub-tab and save proxy / rotation settings."""
    proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
                                help="Route requests through a proxy server for anonymity or bypassing restrictions")

    # Defaults so the save handler can run when the inputs are hidden.
    proxy_type, proxy_host, proxy_port, proxy_auth = "HTTP", "", "", ""
    if proxy_enabled:
        proxy_col1, proxy_col2 = st.columns(2)
        with proxy_col1:
            proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
            proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
        with proxy_col2:
            proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
            proxy_auth = st.text_input("Proxy Authentication (optional)",
                                       placeholder="username:password", type="password")

    st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
    use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
                                     help="Automatically rotate between multiple proxies for better anonymity")

    proxy_list, rotation_interval = "", 10
    if use_proxy_rotation:
        proxy_list = st.text_area("Proxy List (one per line)",
                                  placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
        rotation_interval = st.slider("Rotation Interval (requests)",
                                      min_value=1, max_value=50, value=10,
                                      help="How often to switch proxies")

    if st.button("Save Proxy Configuration"):
        st.session_state.use_proxy = proxy_enabled
        st.session_state.proxy_string = (
            _build_proxy_string(proxy_type, proxy_host, proxy_port, proxy_auth)
            if proxy_enabled else None
        )

        from app.utils import PROXY_ROTATION_CONFIG
        rotation_proxies = (
            [p.strip() for p in proxy_list.splitlines() if p.strip()]
            if use_proxy_rotation else []
        )
        if rotation_proxies:
            PROXY_ROTATION_CONFIG["enabled"] = True
            PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
            PROXY_ROTATION_CONFIG["proxies"] = rotation_proxies
        else:
            # Without this, rotation enabled by an earlier save could never
            # be switched off again.
            PROXY_ROTATION_CONFIG["enabled"] = False

        st.success("Proxy configuration updated!")


def _render_download_settings():
    """Render the "Download Options" sub-tab and store the values in session state on save."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)
        skip_existing = st.checkbox("Skip Existing Files", value=True,
                                    help="Don't download files that already exist locally")
        auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
                                  help="Automatically rename files instead of overwriting")
        verify_downloads = st.checkbox("Verify Downloads", value=True,
                                       help="Check file integrity after download")
        max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
                                help="Number of times to retry failed downloads")
    with col2:
        st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)
        auto_organize = st.checkbox("Auto-Organize Files", value=True,
                                    help="Automatically organize files by type")
        default_dir = st.text_input("Default Download Directory", value="downloads",
                                    help="Default location to save downloaded files")
        org_by_domain = st.checkbox("Organize by Domain", value=False,
                                    help="Create subdirectories based on source domains")
        org_by_type = st.checkbox("Organize by File Type", value=False,
                                  help="Create subdirectories based on file types")

    if st.button("Save Download Settings"):
        st.session_state.download_settings = {
            "skip_existing": skip_existing,
            "auto_rename": auto_rename,
            "verify_downloads": verify_downloads,
            "max_retries": max_retries,
            "auto_organize": auto_organize,
            "default_dir": default_dir,
            "org_by_domain": org_by_domain,
            "org_by_type": org_by_type
        }
        st.success("Download settings saved!")


def _render_system_settings():
    """Render the "System" sub-tab: performance/logging settings and the reset button."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)
        max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
                                   help="Maximum number of simultaneous downloads")
        memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
                                 help="Maximum memory to use for file processing")
        processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
                                       help="Number of threads to use for file processing")
    with col2:
        st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)
        log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
                                 help="Detail level for application logs")
        save_debug_info = st.checkbox("Save Debug Information", value=False,
                                      help="Save detailed information about program execution")
        log_dir = st.text_input("Log Directory", value="logs",
                                help="Directory to save log files")

    if st.button("Apply System Settings"):
        import logging
        st.session_state.system_settings = {
            "max_concurrent": max_concurrent,
            "memory_limit": memory_limit,
            "processing_threads": processing_threads,
            "log_level": log_level,
            "save_debug_info": save_debug_info,
            "log_dir": log_dir
        }
        # The select box only offers names that exist on the logging module.
        logging.getLogger().setLevel(getattr(logging, log_level))
        st.success("System settings applied!")

    st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
    reset_col1, reset_col2 = st.columns([1, 3])
    with reset_col1:
        if st.button("Reset Application", use_container_width=True):
            # Copy the keys first: entries are deleted while iterating.
            for key in list(st.session_state.keys()):
                if key != 'google_credentials':  # Preserve Google auth
                    del st.session_state[key]
            st.success("Application has been reset!")
            st.rerun()
    with reset_col2:
        st.info("This will clear all search results, downloaded files, and reset settings to defaults.")


def _render_config_tab(tab):
    """Render the "Advanced Configuration" tab with its four sub-tabs."""
    with tab:
        st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)

        config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])
        with config_tabs[0]:
            _render_browser_settings()
        with config_tabs[1]:
            _render_proxy_settings()
        with config_tabs[2]:
            _render_download_settings()
        with config_tabs[3]:
            _render_system_settings()


def _render_help_tab(tab):
    """Render the static "Help" tab (quick start, features, troubleshooting, about)."""
    with tab:
        st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)

        help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])

        with help_tabs[0]:
            st.markdown("""
            ### Getting Started

            1. **Enter a URL** on the Search & Download tab
            2. Select a **Search Method**:
               - **Deep Search**: Thorough but slower
               - **Quick Search**: Fast but may miss some files
               - **Exam Site Mode**: Optimized for educational resource sites
            3. Click **Start Search** to find downloadable files
            4. Select files you want to download
            5. Click **Download Selected Files**

            #### Using Different Modes

            Select a mode from the sidebar to optimize the tool for different use cases:

            - **Standard Mode**: Balanced for general use
            - **Education Mode**: Optimized for finding academic materials
            - **Research Mode**: Better for research papers and datasets
            - **Media Mode**: Enhanced for finding images, videos, and audio

            For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
            """)

        with help_tabs[1]:
            st.markdown("""
            ### Advanced Features

            - **Local File Search**: Upload files and search through their content using the enhanced RAG search
            - **Custom Extensions**: Specify additional file types to look for beyond the default set
            - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
            - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
            - **Google Drive Integration**: Upload downloaded files directly to your Google Drive

            #### Search Tips

            - For educational sites, include specific terms like "exam", "test", "paper" in the URL
            - When using Local File Search, try different variations of your query for better results
            - Use filtering and sorting options to find the most relevant files quickly

            #### File Organization

            You can configure automatic file organization in the Advanced Configuration tab:

            - **Organize by Domain**: Creates folders based on the source website
            - **Organize by File Type**: Separates files into folders by their extension
            - **Auto-Rename**: Prevents overwriting existing files with same names
            """)

        with help_tabs[2]:
            st.markdown("""
            ### Troubleshooting

            #### Common Issues

            - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
            - **Downloads failing**: Check if the site requires authentication or uses captchas
            - **Slow performance**: Reduce search depth or disable stealth mode for faster results
            - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings

            #### Captcha Issues

            Some websites use captchas to prevent automated access. If you encounter captchas:

            1. Try using a different proxy
            2. Enable "Handle Captchas Automatically" for simple captchas
            3. For complex captchas, you may need to manually access the site first

            #### Proxy Problems

            If you're having issues with proxies:

            1. Verify your proxy is working with an external tool
            2. Check that you've entered the correct format (http://host:port)
            3. Some websites may block known proxy IPs

            #### Memory Usage

            If the application is using too much memory:

            1. Reduce the "Memory Limit" in System settings
            2. Process fewer files at once
            3. Use lower search depth values
            """)

        with help_tabs[3]:
            st.markdown("""
            ### About This Tool

            **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.

            #### Key Features

            - **Smart Discovery**: Finds downloadable files even when they're not directly linked
            - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
            - **Educational Focus**: Specialized detection for exam papers and academic resources
            - **Stealth Capabilities**: Avoids detection by anti-scraping measures

            #### Technical Details

            This tool uses:

            - **Playwright**: For browser automation and stealth capabilities
            - **Sentence Transformers**: For AI-powered semantic search
            - **Streamlit**: For the user interface
            - **Google Drive API**: For cloud integration

            #### Credits

            Created with Python, Streamlit, Playwright, and various AI libraries.

            For issues or suggestions, please contact the developer.

            Version 2.0 - March 2025
            """)


async def _collect_exam_files(manager, url, extensions):
    """Visit every exam link found for *url* and return all files discovered.

    ``keep_progress`` is raised for the duration of the crawl and always
    lowered again, and the progress widgets are always cleared, even when a
    page fails mid-way.
    """
    all_files = []
    st.session_state.keep_progress = True
    try:
        edu_links = await manager.get_edu_exam_links(url)

        progress_text = st.empty()
        progress_bar = st.progress(0)
        try:
            total = len(edu_links)
            for position, link in enumerate(edu_links, start=1):
                progress_text.text(f"Processing exam link {position}/{total}: {link}")
                progress_bar.progress(position / max(1, total))

                files = await manager.extract_downloadable_files(link, extensions)
                all_files.extend(files)
        finally:
            progress_text.empty()
            progress_bar.empty()
    finally:
        st.session_state.keep_progress = False
    return all_files


async def _run_search(url, extensions, search_method, timeout):
    """Run the selected search against *url* and store the hits in ``st.session_state.files``."""
    sublink_limit = 5000 if search_method == "Deep Search" else 1000
    # NOTE(review): the "Search Depth" slider is not forwarded here; the
    # previous code computed a depth value but never passed it on either.

    async with DownloadManager(
        use_proxy=st.session_state.use_proxy,
        proxy=st.session_state.proxy_string,
        use_stealth=st.session_state.stealth_mode
    ) as manager:
        if search_method == "Exam Site Mode":
            st.session_state.files = await _collect_exam_files(manager, url, extensions)
        else:
            st.session_state.files = await manager.deep_search(url, extensions, sublink_limit, timeout)


def _clear_results():
    """Drop search results and download bookkeeping, and blank any preset URL."""
    st.session_state.files = []
    st.session_state.downloaded_paths = []
    st.session_state.download_complete = False
    if 'preset_url' in st.session_state:
        st.session_state.preset_url = ''


def main():
    """Build the whole Streamlit page and react to the search / clear buttons.

    Order of work: initialise session state, apply styling, draw the sidebar
    and header, render the four main tabs, then handle "Start Search" and
    "Clear Results" once every widget value is known.
    """
    initialize_session_state()
    setup_ui()
    create_sidebar()

    # Header section
    col1, col2 = st.columns([5, 1])
    with col1:
        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
    with col2:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)

    st.markdown(f"<p class='info-text'>{_MODE_DESCRIPTIONS[_current_mode()]}</p>", unsafe_allow_html=True)

    # Main tabs
    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])
    search = _render_search_tab(tabs[0])
    _render_local_search_tab(tabs[1])
    _render_config_tab(tabs[2])
    _render_help_tab(tabs[3])

    # Handle search button (an empty URL is ignored)
    if search["search_button"] and search["url"]:
        _clear_results()
        asyncio.run(_run_search(
            search["url"],
            _parse_extensions(search["custom_extensions"]),
            search["search_method"],
            search["timeout"],
        ))
        # Rerun so the freshly stored results are rendered in the first tab.
        st.rerun()

    # Handle clear button
    if search["clear_button"]:
        _clear_results()
        st.rerun()

# Entry point: build the page only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()