Spaces:

euler314
/

craw_web

Sleeping

App Files Files Community

euler314 committed on Apr 8

Commit

1d44b06

verified ·

1 Parent(s): f0d7dcd

Delete main.py

Browse files

Files changed (1) hide show

main.py +0 -633

main.py DELETED Viewed

@@ -1,633 +0,0 @@
-import os
-import asyncio
-import streamlit as st
-from ui import (
-    setup_ui, create_sidebar, display_file_results,
-    handle_downloads, handle_google_drive_upload
-)
-from app.download_manager import DownloadManager
-from app.rag_search import EnhancedRAGSearch
-from app.utils import USER_AGENTS
-def initialize_session_state():
-    """Give every session-state key used by the app a default value.
-    Keys that are already present in st.session_state are left untouched.
-    """
-    # (key, zero-argument factory) pairs. A factory is only called when its key
-    # is missing, so EnhancedRAGSearch is built only if 'rag_search' is absent.
-    default_factories = (
-        ('files', list),
-        ('downloaded_paths', list),
-        ('download_complete', lambda: False),
-        ('selected_tab', lambda: 0),
-        ('rag_search', EnhancedRAGSearch),
-        ('keep_progress', lambda: False),
-        ('google_credentials', lambda: None),
-        ('mode', lambda: "Standard"),
-        ('use_proxy', lambda: False),
-        ('proxy_string', lambda: None),
-        ('stealth_mode', lambda: True),
-    )
-    for state_key, make_default in default_factories:
-        if state_key not in st.session_state:
-            st.session_state[state_key] = make_default()
-def _humanize_file_size(size_bytes):
-    """Format a byte count as a short human-readable string such as "1.5 MB".
-    Local helper for main(): the upload-indexing code previously called
-    humanize_file_size(), a name this file never imports or defines.
-    """
-    size = float(size_bytes or 0)
-    for unit in ("B", "KB", "MB", "GB"):
-        if size < 1024:
-            # Whole bytes need no decimals; larger units keep one decimal place
-            return f"{size:.0f} {unit}" if unit == "B" else f"{size:.1f} {unit}"
-        size /= 1024
-    return f"{size:.1f} TB"
-def main():
-    """Render the Streamlit page and react to its buttons.
-    Builds the header and four tabs (Search & Download, Local File Search,
-    Advanced Configuration, Help), then handles the "Start Search" and
-    "Clear Results" buttons. Search results are stored in
-    st.session_state.files and the script is rerun so they get displayed.
-    """
-    # Initialize session state
-    initialize_session_state()
-    # Set up UI styling
-    setup_ui()
-    # Create sidebar
-    create_sidebar()
-    # Header section
-    col1, col2 = st.columns([5, 1])
-    with col1:
-        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
-    with col2:
-        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)
-    mode_descriptions = {
-        "Standard": "A versatile tool for discovering and downloading files from any website.",
-        "Education Mode": "Optimized for educational resources, exams, and academic materials.",
-        "Research Mode": "Focused on research papers, datasets, and academic publications.",
-        "Media Mode": "Enhanced for finding and downloading images, videos, and audio files."
-    }
-    # Fall back to the Standard text if session state holds an unknown mode (avoids KeyError)
-    mode_description = mode_descriptions.get(st.session_state.mode, mode_descriptions["Standard"])
-    st.markdown(f"<p class='info-text'>{mode_description}</p>", unsafe_allow_html=True)
-    # Main tabs
-    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])
-    # Tab 1: Search & Download
-    with tabs[0]:
-        st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)
-        col1, col2 = st.columns([3, 1])
-        with col1:
-            url = st.text_input("Enter a URL to search for downloadable files:",
-                                placeholder="e.g., https://example.com/resources",
-                                value=st.session_state.get('preset_url', ''))
-        with col2:
-            search_methods = ["Deep Search", "Quick Search", "Exam Site Mode"]
-            # Initialize search_method with either session state or default value
-            initial_search_method = st.session_state.get('search_method', "Deep Search")
-            # An unknown stored value would make .index() raise ValueError; use the first option instead
-            if initial_search_method not in search_methods:
-                initial_search_method = search_methods[0]
-            search_method = st.selectbox("Search Method",
-                                         search_methods,
-                                         index=search_methods.index(initial_search_method))
-            # Update session state when changed
-            if search_method != st.session_state.get('search_method'):
-                st.session_state.search_method = search_method
-        # Advanced options in an expander
-        with st.expander("Search Options", expanded=False):
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                depth = st.slider("Search Depth", min_value=1, max_value=5, value=2,
-                                help="Higher values will search more links but take longer")
-                prioritize_pdfs = st.checkbox("Prioritize PDFs",
-                                            value=st.session_state.get('prioritize_pdfs', True),
-                                            help="Focus on finding PDF files first")
-            with col2:
-                timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
-                follow_subdomains = st.checkbox("Follow Subdomains", value=True,
-                                              help="Include links from subdomains in the search")
-            with col3:
-                # Default extensions based on mode
-                default_extensions = {
-                    "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
-                    "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
-                    "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
-                    "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov"
-                }
-                # Unknown modes use the Standard extension list instead of raising KeyError
-                mode_extensions = default_extensions.get(st.session_state.mode, default_extensions["Standard"])
-                custom_extensions = st.text_area(
-                    "Custom File Extensions",
-                    value=st.session_state.get('custom_extensions', mode_extensions),
-                    help="Comma-separated list of file extensions to look for"
-                )
-                # Update session state when extensions changed
-                if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions:
-                    st.session_state.custom_extensions = custom_extensions
-        search_col1, search_col2 = st.columns([4, 1])
-        with search_col1:
-            search_button = st.button("🔍 Start Search", use_container_width=True)
-        with search_col2:
-            clear_button = st.button("🧹 Clear Results", use_container_width=True)
-        # File results section
-        if st.session_state.files:
-            # Display file results
-            selected_files, displayed_files = display_file_results(st.session_state.files)
-            # Download options
-            if selected_files:
-                col1, col2 = st.columns(2)
-                with col1:
-                    download_dir = st.text_input("Download Directory", value="downloads")
-                with col2:
-                    download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)
-                download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
-                with download_col1:
-                    download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
-                with download_col2:
-                    # The upload button stays disabled until Google credentials are present
-                    google_drive_button = st.button("📤 Upload to Drive",
-                                                  use_container_width=True,
-                                                  disabled=not st.session_state.google_credentials)
-                with download_col3:
-                    select_all = st.button("Select All Files", use_container_width=True)
-                # Handle select all button
-                if select_all:
-                    for i in displayed_files:
-                        st.session_state[f"select_{i}"] = True
-                    st.rerun()
-                # Handle download button if clicked
-                if download_button:
-                    # Create download directory
-                    os.makedirs(download_dir, exist_ok=True)
-                    handle_downloads(selected_files, download_dir, download_option, download_col1)
-                # Handle Google Drive upload
-                if google_drive_button:
-                    handle_google_drive_upload(selected_files)
-    # Tab 2: Local File Search
-    with tabs[1]:
-        st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
-        st.write("Upload files to search through their content with AI-powered semantic search.")
-        # File upload
-        uploaded_files = st.file_uploader("Upload documents for search",
-                                         accept_multiple_files=True,
-                                         type=['pdf', 'docx', 'txt', 'csv', 'json'])
-        if uploaded_files:
-            # Build search index on upload
-            col1, col2 = st.columns([4, 1])
-            with col1:
-                use_transformer = st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
-                                           help="Uses advanced AI for more accurate semantic search (if available)")
-            with col2:
-                if st.button("Build Search Index", use_container_width=True):
-                    with st.spinner("Processing files and building search index..."):
-                        files_added = 0
-                        for uploaded_file in uploaded_files:
-                            file_info = {
-                                'filename': uploaded_file.name,
-                                'url': f'local://{uploaded_file.name}',
-                                # Uses the helper defined above; humanize_file_size was never in scope here
-                                'size': _humanize_file_size(uploaded_file.size)
-                            }
-                            success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info)
-                            if success:
-                                files_added += 1
-                        if files_added > 0:
-                            index_built = st.session_state.rag_search.build_index()
-                            if index_built:
-                                st.success(f"✅ Successfully indexed {files_added} files!")
-                            else:
-                                st.error("Failed to build search index.")
-                        else:
-                            st.warning("No valid text could be extracted from the files.")
-            # Search interface
-            st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)
-            col1, col2 = st.columns([4, 1])
-            with col1:
-                query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
-            with col2:
-                expand_query = st.checkbox("Auto-expand query", value=True,
-                                         help="Automatically add related terms to your search")
-            col1, col2 = st.columns([4, 1])
-            # Create "Max results" before the search runs so its value can be used as top_k;
-            # it is still written into col2, so the widget keeps its place in the right-hand column.
-            with col2:
-                num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
-            with col1:
-                if st.button("🔍 Search Documents", use_container_width=True):
-                    if not query:
-                        st.warning("Please enter a search query")
-                    else:
-                        with st.spinner("Searching..."):
-                            results = st.session_state.rag_search.search(query, top_k=int(num_results), search_chunks=True)
-                            if results:
-                                st.markdown(f"**Found {len(results)} relevant documents:**")
-                                for i, result in enumerate(results):
-                                    with st.container():
-                                        st.markdown("<div class='result-card'>", unsafe_allow_html=True)
-                                        st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")
-                                        if result.get('chunk_preview'):
-                                            st.markdown("**Matching content:**")
-                                            st.text(result['chunk_preview'])
-                                        st.markdown("</div>", unsafe_allow_html=True)
-                            else:
-                                st.info("No matching documents found. Try a different query.")
-            # Quick search tips
-            with st.expander("Search Tips", expanded=False):
-                st.markdown("""
-                ### Effective Search Tips
-                - **Be specific** with your queries for more accurate results
-                - **Try different phrasings** if you don't get the results you expect
-                - Use **quotation marks** for exact phrase matching
-                - For **complex topics**, break down your search into multiple queries
-                - **Combine related terms** to improve recall
-                The search engine uses advanced algorithms to understand the semantic meaning of your query,
-                not just keyword matching.
-                """)
-    # Tab 3: Advanced Configuration
-    with tabs[2]:
-        st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)
-        config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])
-        # Browser Settings tab
-        with config_tabs[0]:
-            col1, col2 = st.columns(2)
-            with col1:
-                use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
-                                        help="Makes browser harder to detect as automated, but may be slower")
-                handle_captchas = st.checkbox("Handle Captchas Automatically", value=False,
-                                           help="Attempt to solve simple captchas automatically")
-                download_timeout = st.slider("Download Timeout (seconds)",
-                                          min_value=30, max_value=600, value=300,
-                                          help="Maximum time to wait for downloads to complete")
-            with col2:
-                user_agent = st.selectbox("User Agent", USER_AGENTS, index=0,
-                                        help="Browser identity to use when accessing websites")
-                save_screenshots = st.checkbox("Save Browser Screenshots", value=False,
-                                           help="Save screenshots when errors occur for debugging")
-                browser_lang = st.selectbox("Browser Language",
-                                         ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
-                                         index=0)
-            # Only the stealth checkbox is written back to session state here
-            if st.button("Update Browser Settings"):
-                st.session_state.stealth_mode = use_stealth
-                st.success("Browser settings updated!")
-            # Dependency installation section
-            st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
-            if st.button("Install Playwright Dependencies"):
-                # NOTE(review): the top of this file imports from `ui`, this line from `app.ui` - confirm which path is correct
-                from app.ui import install_playwright_dependencies
-                with st.spinner("Installing dependencies..."):
-                    install_playwright_dependencies()
-        # Proxy Configuration tab
-        with config_tabs[1]:
-            proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
-                                     help="Route requests through a proxy server for anonymity or bypassing restrictions")
-            # The proxy_* inputs exist only while the box is checked; later reads are guarded by proxy_enabled
-            if proxy_enabled:
-                proxy_col1, proxy_col2 = st.columns(2)
-                with proxy_col1:
-                    proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
-                    proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
-                with proxy_col2:
-                    proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
-                    proxy_auth = st.text_input("Proxy Authentication (optional)",
-                                            placeholder="username:password", type="password")
-            st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
-            use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
-                                          help="Automatically rotate between multiple proxies for better anonymity")
-            if use_proxy_rotation:
-                proxy_list = st.text_area("Proxy List (one per line)",
-                                       placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
-                rotation_interval = st.slider("Rotation Interval (requests)",
-                                           min_value=1, max_value=50, value=10,
-                                           help="How often to switch proxies")
-            if st.button("Save Proxy Configuration"):
-                # Construct the proxy string
-                proxy_string = None
-                if proxy_enabled and proxy_host and proxy_port:
-                    proxy_prefix = f"{proxy_type.lower()}://"
-                    proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
-                    proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"
-                # Update session state
-                st.session_state.use_proxy = proxy_enabled
-                st.session_state.proxy_string = proxy_string
-                # Configure proxy rotation if enabled
-                from app.utils import PROXY_ROTATION_CONFIG
-                if use_proxy_rotation and proxy_list:
-                    PROXY_ROTATION_CONFIG["enabled"] = True
-                    PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
-                    PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]
-                else:
-                    # Without this, rotation stayed enabled after the box was unchecked and saved
-                    PROXY_ROTATION_CONFIG["enabled"] = False
-                st.success("Proxy configuration updated!")
-        # Download Options tab
-        with config_tabs[2]:
-            col1, col2 = st.columns(2)
-            with col1:
-                st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)
-                skip_existing = st.checkbox("Skip Existing Files", value=True,
-                                        help="Don't download files that already exist locally")
-                auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
-                                       help="Automatically rename files instead of overwriting")
-                verify_downloads = st.checkbox("Verify Downloads", value=True,
-                                           help="Check file integrity after download")
-                max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
-                                     help="Number of times to retry failed downloads")
-            with col2:
-                st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)
-                auto_organize = st.checkbox("Auto-Organize Files", value=True,
-                                         help="Automatically organize files by type")
-                default_dir = st.text_input("Default Download Directory", value="downloads",
-                                         help="Default location to save downloaded files")
-                org_by_domain = st.checkbox("Organize by Domain", value=False,
-                                        help="Create subdirectories based on source domains")
-                org_by_type = st.checkbox("Organize by File Type", value=False,
-                                       help="Create subdirectories based on file types")
-            if st.button("Save Download Settings"):
-                st.session_state.download_settings = {
-                    "skip_existing": skip_existing,
-                    "auto_rename": auto_rename,
-                    "verify_downloads": verify_downloads,
-                    "max_retries": max_retries,
-                    "auto_organize": auto_organize,
-                    "default_dir": default_dir,
-                    "org_by_domain": org_by_domain,
-                    "org_by_type": org_by_type
-                }
-                st.success("Download settings saved!")
-        # System tab
-        with config_tabs[3]:
-            col1, col2 = st.columns(2)
-            with col1:
-                st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)
-                max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
-                                        help="Maximum number of simultaneous downloads")
-                memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
-                                      help="Maximum memory to use for file processing")
-                processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
-                                           help="Number of threads to use for file processing")
-            with col2:
-                st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)
-                log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
-                                      help="Detail level for application logs")
-                save_debug_info = st.checkbox("Save Debug Information", value=False,
-                                           help="Save detailed information about program execution")
-                log_dir = st.text_input("Log Directory", value="logs",
-                                     help="Directory to save log files")
-            if st.button("Apply System Settings"):
-                import logging
-                st.session_state.system_settings = {
-                    "max_concurrent": max_concurrent,
-                    "memory_limit": memory_limit,
-                    "processing_threads": processing_threads,
-                    "log_level": log_level,
-                    "save_debug_info": save_debug_info,
-                    "log_dir": log_dir
-                }
-                # Update logging configuration
-                log_level_num = getattr(logging, log_level)
-                logging.getLogger().setLevel(log_level_num)
-                st.success("System settings applied!")
-            # Reset application button
-            st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
-            reset_col1, reset_col2 = st.columns([1, 3])
-            with reset_col1:
-                if st.button("Reset Application", use_container_width=True):
-                    # Iterate over a copy of the keys because entries are deleted inside the loop
-                    for key in list(st.session_state.keys()):
-                        if key != 'google_credentials':  # Preserve Google auth
-                            del st.session_state[key]
-                    st.success("Application has been reset!")
-                    st.rerun()
-            with reset_col2:
-                st.info("This will clear all search results, downloaded files, and reset settings to defaults.")
-    # Tab 4: Help
-    with tabs[3]:
-        st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)
-        help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])
-        with help_tabs[0]:
-            st.markdown("""
-            ### Getting Started
-            1. **Enter a URL** on the Search & Download tab
-            2. Select a **Search Method**:
-               - **Deep Search**: Thorough but slower
-               - **Quick Search**: Fast but may miss some files
-               - **Exam Site Mode**: Optimized for educational resource sites
-            3. Click **Start Search** to find downloadable files
-            4. Select files you want to download
-            5. Click **Download Selected Files**
-            #### Using Different Modes
-            Select a mode from the sidebar to optimize the tool for different use cases:
-            - **Standard Mode**: Balanced for general use
-            - **Education Mode**: Optimized for finding academic materials
-            - **Research Mode**: Better for research papers and datasets
-            - **Media Mode**: Enhanced for finding images, videos, and audio
-            For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
-            """)
-        with help_tabs[1]:
-            st.markdown("""
-            ### Advanced Features
-            - **Local File Search**: Upload files and search through their content using the enhanced RAG search
-            - **Custom Extensions**: Specify additional file types to look for beyond the default set
-            - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
-            - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
-            - **Google Drive Integration**: Upload downloaded files directly to your Google Drive
-            #### Search Tips
-            - For educational sites, include specific terms like "exam", "test", "paper" in the URL
-            - When using Local File Search, try different variations of your query for better results
-            - Use filtering and sorting options to find the most relevant files quickly
-            #### File Organization
-            You can configure automatic file organization in the Advanced Configuration tab:
-            - **Organize by Domain**: Creates folders based on the source website
-            - **Organize by File Type**: Separates files into folders by their extension
-            - **Auto-Rename**: Prevents overwriting existing files with same names
-            """)
-        with help_tabs[2]:
-            st.markdown("""
-            ### Troubleshooting
-            #### Common Issues
-            - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
-            - **Downloads failing**: Check if the site requires authentication or uses captchas
-            - **Slow performance**: Reduce search depth or disable stealth mode for faster results
-            - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings
-            #### Captcha Issues
-            Some websites use captchas to prevent automated access. If you encounter captchas:
-            1. Try using a different proxy
-            2. Enable "Handle Captchas Automatically" for simple captchas
-            3. For complex captchas, you may need to manually access the site first
-            #### Proxy Problems
-            If you're having issues with proxies:
-            1. Verify your proxy is working with an external tool
-            2. Check that you've entered the correct format (http://host:port)
-            3. Some websites may block known proxy IPs
-            #### Memory Usage
-            If the application is using too much memory:
-            1. Reduce the "Memory Limit" in System settings
-            2. Process fewer files at once
-            3. Use lower search depth values
-            """)
-        with help_tabs[3]:
-            st.markdown("""
-            ### About This Tool
-            **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.
-            #### Key Features
-            - **Smart Discovery**: Finds downloadable files even when they're not directly linked
-            - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
-            - **Educational Focus**: Specialized detection for exam papers and academic resources
-            - **Stealth Capabilities**: Avoids detection by anti-scraping measures
-            #### Technical Details
-            This tool uses:
-            - **Playwright**: For browser automation and stealth capabilities
-            - **Sentence Transformers**: For AI-powered semantic search
-            - **Streamlit**: For the user interface
-            - **Google Drive API**: For cloud integration
-            #### Credits
-            Created with Python, Streamlit, Playwright, and various AI libraries.
-            For issues or suggestions, please contact the developer.
-            Version 2.0 - March 2025
-            """)
-    # Handle search button
-    if search_button and url:
-        # Reset files and downloaded paths
-        st.session_state.files = []
-        st.session_state.downloaded_paths = []
-        st.session_state.download_complete = False
-        # Clear the preset URL if it was used
-        if 'preset_url' in st.session_state:
-            st.session_state.preset_url = ''
-        # Prepare custom extensions
-        custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()]
-        # Configure search parameters based on method
-        sublink_limit = 5000 if search_method == "Deep Search" else 1000
-        # NOTE(review): search_depth is computed but never passed to deep_search below, so the
-        # "Search Depth" slider currently has no effect - confirm the DownloadManager.deep_search signature
-        search_depth = depth if search_method == "Deep Search" else 1
-        is_exam_site = search_method == "Exam Site Mode"
-        # Execute the search asynchronously
-        async def run_search():
-            async with DownloadManager(
-                use_proxy=st.session_state.use_proxy,
-                proxy=st.session_state.proxy_string,
-                use_stealth=st.session_state.stealth_mode
-            ) as manager:
-                # For exam sites, use specialized approach
-                if is_exam_site:
-                    st.session_state.keep_progress = True
-                    try:
-                        edu_links = await manager.get_edu_exam_links(url)
-                        all_files = []
-                        progress_text = st.empty()
-                        progress_bar = st.progress(0)
-                        # Process each exam link
-                        for i, link in enumerate(edu_links):
-                            # max(1, ...) keeps the division safe for an empty link list
-                            progress = (i+1) / max(1, len(edu_links))
-                            progress_text.text(f"Processing exam link {i+1}/{len(edu_links)}: {link}")
-                            progress_bar.progress(progress)
-                            files = await manager.extract_downloadable_files(link, custom_ext_list)
-                            all_files.extend(files)
-                        st.session_state.files = all_files
-                        progress_text.empty()
-                        progress_bar.empty()
-                    finally:
-                        # Reset the flag even when a link fails, so it is not left stuck at True
-                        st.session_state.keep_progress = False
-                else:
-                    # Use general search method
-                    files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout)
-                    st.session_state.files = files
-        # Run the search
-        asyncio.run(run_search())
-        st.rerun()
-    # Handle clear button
-    if clear_button:
-        st.session_state.files = []
-        st.session_state.downloaded_paths = []
-        st.session_state.download_complete = False
-        if 'preset_url' in st.session_state:
-            st.session_state.preset_url = ''
-        st.rerun()
-# Entry point: call main() only when this file is run as a script, not when it is imported
-if __name__ == "__main__":
-    main()