Spaces:

euler314
/

craw_web

Running

File size: 19,771 Bytes

d4e3bdc

import streamlit as st
import os
import asyncio
import mimetypes
from app.utils import create_zip_file, humanize_file_size, show_user_friendly_error
from app.download_manager import DownloadManager
from app.rag_search import EnhancedRAGSearch
from app.google_drive import (
    get_google_auth_url, exchange_code_for_credentials, 
    google_drive_upload, create_drive_folder
)
import googleapiclient.discovery

def setup_ui():
    """Inject the application's global stylesheet into the Streamlit page.

    The stylesheet covers the tab bar, full-width uploader/regular buttons,
    the header/sub-header/info text classes, result cards and the sidebar
    section classes used by the rest of this module.
    """
    # Kept as one literal so the emitted markup is a single <style> element.
    page_css = """
    <style>
        .stTabs [data-baseweb="tab-list"] {
            gap: 10px;
        }
        .stTabs [data-baseweb="tab"] {
            height: 50px;
            white-space: pre-wrap;
            border-radius: 4px 4px 0px 0px;
            padding: 10px 16px;
            background-color: #f0f2f6;
        }
        .stTabs [aria-selected="true"] {
            background-color: #ffffff !important;
            border-bottom: 2px solid #4c78a8;
        }
        .stFileUploader > div > div > button {
            width: 100%;
        }
        .main-header {
            font-size: 2.5rem;
            font-weight: 700;
            margin-bottom: 10px;
        }
        .section-subheader {
            font-size: 1.3rem;
            font-weight: 600;
            margin-top: 20px;
            margin-bottom: 10px;
        }
        .info-text {
            color: #6c757d;
            font-size: 0.9rem;
        }
        .stButton>button {
            width: 100%;
        }
        .result-card {
            background-color: #f8f9fa;
            border-radius: 6px;
            padding: 16px;
            margin-bottom: 12px;
            border-left: 4px solid #4c78a8;
        }
        .sidebar-header {
            font-size: 1.2rem;
            font-weight: 600;
            margin-bottom: 10px;
        }
        .sidebar-section {
            margin-bottom: 20px;
        }
    </style>
    """
    st.markdown(page_css, unsafe_allow_html=True)

def create_sidebar():
    """Render the whole sidebar and mirror widget values into session state.

    Sections, in order: title, mode picker, quick settings (stealth mode and
    proxy), the Google Drive panel, education shortcuts (shown only in
    "Education Mode"), system status with the dependency installer button,
    and the version footer.

    Reads and writes ``st.session_state`` keys ``mode``, ``custom_extensions``,
    ``prioritize_pdfs``, ``stealth_mode``, ``use_proxy``, ``proxy_string``,
    ``preset_url`` and ``search_method``.
    """
    def _html(markup):
        # All sidebar chrome is raw HTML, so every call needs the unsafe flag.
        st.markdown(markup, unsafe_allow_html=True)

    state = st.session_state
    mode_choices = ["Standard", "Education Mode", "Research Mode", "Media Mode"]
    # mode -> (custom_extensions, prioritize_pdfs) applied when switching to it.
    # "Standard" has no entry: switching to it leaves those settings untouched.
    mode_presets = {
        "Education Mode": (".pdf,.doc,.docx,.ppt,.pptx", True),
        "Research Mode": (".pdf,.txt,.csv,.json,.xlsx", True),
        "Media Mode": (".jpg,.png,.mp3,.mp4,.avi,.mov", False),
    }
    # (button label, preset URL, search method) for the Education Mode shortcuts.
    education_shortcuts = (
        ("Past Exam Papers", "https://pastpapers.example.edu", "Exam Site Mode"),
        ("Open Course Materials", "https://opencourseware.example.edu", "Deep Search"),
        ("Research Papers", "https://papers.example.org", "Deep Search"),
    )
    # (caption, value) pairs shown side by side in the status section.
    status_cells = (("Search", "Active"), ("Browser", "Ready"))

    with st.sidebar:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50)
        _html("<p class='sidebar-header'>Advanced File Downloader</p>")

        # ---- Mode selection ----
        _html("<div class='sidebar-section'>")
        _html("<p class='sidebar-header'>Mode</p>")
        current_position = mode_choices.index(state.mode)
        chosen_mode = st.radio(
            "Select Mode",
            mode_choices,
            label_visibility="collapsed",
            index=current_position,
            horizontal=False,
        )

        if chosen_mode != state.mode:
            state.mode = chosen_mode
            preset = mode_presets.get(chosen_mode)
            if preset is not None:
                state.custom_extensions, state.prioritize_pdfs = preset

        _html(f"<div class='info-text'>Current: <b>{st.session_state.mode}</b></div>")
        _html("</div>")

        # ---- Quick settings ----
        _html("<div class='sidebar-section'>")
        _html("<p class='sidebar-header'>Quick Settings</p>")

        stealth_choice = st.checkbox("Stealth Mode", value=state.stealth_mode)
        if state.stealth_mode != stealth_choice:
            state.stealth_mode = stealth_choice

        proxy_enabled = st.checkbox("Use Proxy", value=state.use_proxy)
        if state.use_proxy != proxy_enabled:
            state.use_proxy = proxy_enabled

        if proxy_enabled:
            entered_proxy = st.text_input(
                "Proxy Address",
                placeholder="e.g., http://user:pass@host:port",
                value=state.proxy_string or "",
            )
            if state.proxy_string != entered_proxy:
                state.proxy_string = entered_proxy

        _html("</div>")

        # ---- Google Drive ----
        show_google_drive_integration()

        # ---- Shortcuts, only offered in Education Mode ----
        if state.mode == "Education Mode":
            _html("<div class='sidebar-section'>")
            _html("<p class='sidebar-header'>Quick Access</p>")
            _html("<div class='info-text'>Common Educational Sites</div>")

            for label, target_url, method in education_shortcuts:
                if st.button(label):
                    state.preset_url = target_url
                    state.search_method = method
                    st.rerun()

            _html("</div>")

        # ---- System status ----
        _html("<div class='sidebar-section'>")
        _html("<p class='sidebar-header'>System Status</p>")

        for column, (caption, value) in zip(st.columns(2), status_cells):
            with column:
                _html(f"<div class='info-text'>{caption}</div>")
                _html(f"<div style='color: green; font-weight: bold;'>{value}</div>")

        if st.button("Install Dependencies"):
            with st.spinner("Installing Playwright dependencies..."):
                install_playwright_dependencies()

        _html("</div>")

        # ---- Footer ----
        _html("<div class='sidebar-section' style='position: absolute; bottom: 20px; width: 90%;'>")
        _html("<div class='info-text' style='text-align: center;'>Version 2.0 • March 2025</div>")
        _html("</div>")

def show_google_drive_integration():
    """Display the Google Drive connection panel.

    When ``st.session_state.google_credentials`` is set, shows the connected
    state, a text input for the target Drive folder (stored in
    ``st.session_state.drive_folder``) and a disconnect button.

    Otherwise shows a connect button. Clicking it stores the authorization URL
    and a "pending" flag in session state so that the authorization link and
    the code input stay visible across reruns; once a code is entered it is
    exchanged via ``exchange_code_for_credentials`` and, on success, the
    credentials are stored and the app is rerun.
    """
    state = st.session_state
    st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True)
    st.markdown("<p class='sidebar-header'>Google Drive</p>", unsafe_allow_html=True)

    if state.get("google_credentials"):
        st.success("✅ Connected")

        drive_folder = st.text_input(
            "Drive Folder",
            value=state.get("drive_folder", "File Downloader"),
        )
        if state.get("drive_folder") != drive_folder:
            state.drive_folder = drive_folder

        if st.button("Disconnect Drive"):
            state.google_credentials = None
            st.rerun()
    else:
        st.warning("⚠️ Not Connected")

        # st.button is only True during the run triggered by the click. The
        # code input below causes another rerun when the user types, so the
        # "authorization in progress" state must live in session state or the
        # input would vanish before its value could be read.
        if st.button("Connect Google Drive"):
            state.drive_auth_url = get_google_auth_url()
            state.drive_auth_pending = True

        if state.get("drive_auth_pending"):
            st.markdown(f"[Click here to authorize]({state.get('drive_auth_url')})")
            auth_code = st.text_input("Enter authorization code:")

            if auth_code:
                with st.spinner("Connecting to Google Drive..."):
                    credentials, status_msg = exchange_code_for_credentials(auth_code)
                if credentials:
                    state.google_credentials = credentials
                    state.drive_auth_pending = False
                    st.success(status_msg)
                    st.rerun()
                else:
                    st.error(status_msg)

    st.markdown("</div>", unsafe_allow_html=True)

def install_playwright_dependencies():
    """Install the system packages, Playwright and its Chromium browser.

    Steps, each run as a subprocess with ``check=True``:

    1. ``apt-get update`` and ``apt-get install`` of the listed shared
       libraries.
    2. ``pip install playwright`` and ``playwright install chromium``, both
       invoked through the interpreter that is running this app so the package
       lands in the environment the app actually imports from.

    Also sets ``PLAYWRIGHT_BROWSERS_PATH`` to ``~/.cache/ms-playwright`` for
    this process. Reports success or failure through Streamlit messages; a
    failing command or a missing executable does not propagate to the caller.
    """
    import os
    import subprocess
    import sys

    system_packages = [
        'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
        'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
        'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
    ]

    try:
        # Set environment variable for Playwright browsers path
        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")

        # Install system dependencies
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        subprocess.run(
            ['apt-get', 'install', '-y', '--no-install-recommends'] + system_packages,
            check=True,
        )

        # Use this interpreter rather than whatever "pip"/"python3" resolve to
        # on PATH, which may belong to a different environment.
        subprocess.run([sys.executable, '-m', 'pip', 'install', 'playwright'], check=True)
        subprocess.run([sys.executable, '-m', 'playwright', 'install', 'chromium'], check=True)

        st.success("Playwright dependencies installed successfully!")
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: a command exited non-zero.
        # OSError: the executable is missing or not permitted to run.
        st.error(f"Error installing Playwright dependencies: {e}")
        st.info("You may need to manually install dependencies. Check console for details.")

# Multipliers for the unit token of a human-readable size such as "1.5 MB".
_SIZE_UNIT_MULTIPLIERS = {
    'B': 1, 'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4,
}


def _size_in_bytes(size_str, unknown):
    """Convert a "<number> <unit>" size string to bytes, or return ``unknown``.

    ``unknown`` is returned when the value is not a string, contains
    "Unknown", has no unit token, has a non-numeric value or an unrecognized
    unit.
    """
    if not isinstance(size_str, str) or 'Unknown' in size_str:
        return unknown
    tokens = size_str.split()
    if len(tokens) < 2:
        return unknown
    try:
        value = float(tokens[0])
    except ValueError:
        return unknown
    multiplier = _SIZE_UNIT_MULTIPLIERS.get(tokens[1])
    if multiplier is None:
        return unknown
    return value * multiplier


def display_file_results(files):
    """Display file results with filtering and sorting options.

    Args:
        files: Sequence of dicts; each is read for the keys ``'filename'``,
            ``'url'`` and ``'size'`` (a human-readable string such as
            ``"1.5 MB"``).

    Returns:
        ``None`` when ``files`` is empty. Otherwise a tuple
        ``(selected_files, displayed_files)`` of integer indices **into the
        ``files`` sequence as passed in** (not positions in the sorted view):
        ``displayed_files`` lists the rows that passed the filters and
        ``selected_files`` the displayed rows whose checkbox is ticked.
    """
    if not files:
        return

    import html

    st.markdown("<h3 class='section-subheader'>Found Files</h3>", unsafe_allow_html=True)

    # File filtering options
    filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1])
    with filter_col1:
        file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.")
    with filter_col2:
        sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"])
    with filter_col3:
        show_only_pdfs = st.checkbox("PDFs Only", value=False)

    # Sort (original_index, file) pairs so every row keeps its position in
    # `files`; the indices returned to the caller are used to look files up in
    # the unsorted list, so positions in the sorted view would be wrong.
    indexed_files = list(enumerate(files))
    if sort_option == "Name":
        indexed_files.sort(key=lambda pair: pair[1]['filename'])
    elif sort_option == "Size (Largest)":
        # Unparseable sizes count as 0 so they end up last.
        indexed_files.sort(key=lambda pair: _size_in_bytes(pair[1]['size'], 0), reverse=True)
    elif sort_option == "Size (Smallest)":
        # Unparseable sizes count as infinite so they end up last here too.
        indexed_files.sort(key=lambda pair: _size_in_bytes(pair[1]['size'], float('inf')))

    needle = file_filter.lower() if file_filter else ""

    # File list with selection
    selected_files = []
    displayed_files = []
    with st.container():
        for original_index, file in indexed_files:
            filename = file['filename']
            lowered_name = filename.lower()

            # Apply filters
            if needle and needle not in lowered_name:
                continue
            if show_only_pdfs and not lowered_name.endswith('.pdf'):
                continue

            displayed_files.append(original_index)
            with st.container():
                col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1])
                with col1:
                    # A non-empty (hidden) label avoids Streamlit's empty-label
                    # warning; the key follows the file, not its sorted position.
                    selected = st.checkbox(
                        f"Select {filename}",
                        key=f"select_{original_index}",
                        value=True,
                        label_visibility="collapsed",
                    )
                    if selected:
                        selected_files.append(original_index)
                with col2:
                    file_icon = get_file_icon(filename)
                    st.markdown(f"**{file_icon} {filename}**")
                    url = file['url']
                    shown_url = url if len(url) <= 60 else f"{url[:60]}..."
                    # The URL comes from crawled pages, so escape it before
                    # embedding it in raw HTML.
                    st.markdown(
                        f"<span class='info-text'>{html.escape(shown_url)}</span>",
                        unsafe_allow_html=True,
                    )
                with col3:
                    st.markdown(f"**Size:** {file['size']}")
                with col4:
                    st.button("Preview", key=f"preview_{original_index}")

                st.divider()

        if not displayed_files:
            st.info("No files match your current filters. Try adjusting your search criteria.")

    return selected_files, displayed_files

def get_file_icon(filename):
    """Pick the emoji shown next to a file, based on its extension.

    Matching is case-insensitive. Extensions not listed below fall back to
    the generic document icon.
    """
    lowered = filename.lower()
    # Checked in order; the first group containing a matching suffix wins.
    icon_table = (
        (('.pdf',), "📝"),
        (('.doc', '.docx'), "📋"),
        (('.xls', '.xlsx'), "📊"),
        (('.ppt', '.pptx'), "🖼️"),
        (('.jpg', '.png', '.gif'), "🖼️"),
        (('.mp3', '.wav'), "🔊"),
        (('.mp4', '.avi', '.mov'), "🎬"),
    )
    for suffixes, icon in icon_table:
        if lowered.endswith(suffixes):
            return icon
    return "📄"

def handle_downloads(selected_files, download_dir, download_option, download_col1):
    """Download the selected files and offer them to the user.

    Args:
        selected_files: Indices into ``st.session_state.files`` of the files
            to download. Nothing happens when this is empty.
        download_dir: Directory passed to ``DownloadManager.download_file`` and
            to ``create_zip_file``.
        download_option: ``"ZIP Archive"`` offers one ZIP download button; any
            other value offers one download button per file.
        download_col1: Streamlit container in which the status text and the
            progress bar are placed.

    Side effects:
        Resets and fills ``st.session_state.downloaded_paths`` and sets
        ``st.session_state.download_complete`` to ``True`` once every file has
        been attempted.
    """
    if not selected_files:
        return

    # Execute the download asynchronously
    with download_col1:
        download_status = st.empty()
        download_progress = st.progress(0)

    async def run_download():
        async with DownloadManager(
            use_proxy=st.session_state.use_proxy,
            proxy=st.session_state.proxy_string,
            use_stealth=st.session_state.stealth_mode
        ) as manager:
            files_to_download = [st.session_state.files[i] for i in selected_files]
            total = len(files_to_download)

            # Reset download paths
            st.session_state.downloaded_paths = []

            for position, file_info in enumerate(files_to_download):
                download_status.text(f"Downloading {position+1}/{total}: {file_info['filename']}")
                download_progress.progress(position / total)

                downloaded_path = await manager.download_file(
                    file_info,
                    download_dir,
                    get_domain(file_info['url'])
                )

                # A falsy result means this file was not downloaded; skip it.
                if downloaded_path:
                    st.session_state.downloaded_paths.append(downloaded_path)

            download_progress.progress(1.0)
            download_status.text(f"Downloaded {len(st.session_state.downloaded_paths)}/{total} files successfully!")
            st.session_state.download_complete = True

    # Run the download
    asyncio.run(run_download())

    # Show download results
    if not st.session_state.download_complete:
        return

    downloaded_paths = st.session_state.downloaded_paths
    st.success(f"✅ Downloaded {len(downloaded_paths)} files successfully!")
    if not downloaded_paths:
        return

    if download_option == "ZIP Archive":
        # Only the archive is read; the individual files are not loaded into
        # memory on this path.
        zip_path = create_zip_file(downloaded_paths, download_dir)
        with open(zip_path, "rb") as f:
            zip_content = f.read()
        st.download_button("📦 Download ZIP Archive",
                          zip_content,
                          file_name=os.path.basename(zip_path),
                          mime="application/zip")
        return

    # Show individual file download links
    st.markdown("<h4>Download Files</h4>", unsafe_allow_html=True)

    # Create a grid of download buttons
    cols = st.columns(3)
    for idx, path in enumerate(downloaded_paths):
        name = os.path.basename(path)
        with open(path, "rb") as f:
            content = f.read()
        mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream'
        with cols[idx % 3]:
            st.download_button(
                f"📄 {name}",
                content,
                file_name=name,
                mime=mime_type,
                # The index keeps keys unique when two downloads share a
                # basename, which would otherwise raise a duplicate-key error.
                key=f"dl_{idx}_{name}",
                use_container_width=True
            )

def handle_google_drive_upload(selected_files):
    """Upload every path in ``st.session_state.downloaded_paths`` to Google Drive.

    Does nothing unless both ``st.session_state.google_credentials`` and
    ``st.session_state.downloaded_paths`` are set and non-empty. Files go into
    the folder named by ``st.session_state.drive_folder`` (default
    ``"File Downloader"``); the folder is looked up by name and created via
    ``create_drive_folder`` when no match exists.

    Args:
        selected_files: Not used by this function; kept so existing callers
            continue to work.
    """
    state = st.session_state
    if not state.google_credentials or not state.downloaded_paths:
        return

    with st.spinner("Uploading to Google Drive..."):
        drive_service = googleapiclient.discovery.build("drive", "v3", credentials=state.google_credentials)

        # A blank folder name falls back to the default.
        folder_name = state.get('drive_folder') or "File Downloader"

        # Backslashes and single quotes must be escaped inside a quoted value
        # of a Drive search query, otherwise a name such as "Bob's files"
        # produces an invalid query.
        escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
        query = f"name='{escaped_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
        results = drive_service.files().list(q=query, spaces='drive', fields='files(id)').execute()
        items = results.get('files', [])

        # Reuse the first matching folder, or create one if none exists.
        if items:
            folder_id = items[0]['id']
        else:
            folder_id = create_drive_folder(drive_service, folder_name)

        # Upload each file
        paths = state.downloaded_paths
        total = len(paths)
        upload_progress = st.progress(0)
        status_text = st.empty()
        uploaded_count = 0

        for i, path in enumerate(paths):
            status_text.text(f"Uploading {i+1}/{total}: {os.path.basename(path)}")
            upload_progress.progress(i / total)

            result = google_drive_upload(path, state.google_credentials, folder_id)
            # Only a string result that does not start with "Error" is
            # counted as a successful upload.
            if isinstance(result, str) and not result.startswith("Error"):
                uploaded_count += 1

        upload_progress.progress(1.0)
        status_text.text(f"Uploaded {uploaded_count}/{total} files to Google Drive folder '{folder_name}'")

        # Report what actually happened instead of always claiming success.
        if uploaded_count == total:
            st.success("✅ Files uploaded to Google Drive successfully!")
        elif uploaded_count:
            st.warning(f"⚠️ Only {uploaded_count} of {total} files were uploaded to Google Drive.")
        else:
            st.error("❌ No files could be uploaded to Google Drive.")

def get_domain(url):
    """Return the network-location part of ``url``.

    This is the ``netloc`` component as parsed by ``urllib.parse.urlparse``,
    so it includes any ``user:password@`` prefix and ``:port`` suffix, and is
    an empty string when the URL has no ``//`` authority section.
    """
    from urllib.parse import urlparse

    return urlparse(url).netloc