# craw_web / main.py
# Provenance (Hugging Face Space file header): uploaded by euler314,
# commit 4f3b847 (verified) "Rename app/app.py to main.py", file size 32.3 kB.
import os
import asyncio
import streamlit as st
from app.ui import (
setup_ui, create_sidebar, display_file_results,
handle_downloads, handle_google_drive_upload
)
from app.download_manager import DownloadManager
from app.rag_search import EnhancedRAGSearch
from app.utils import USER_AGENTS
def initialize_session_state():
    """Seed ``st.session_state`` with the app's default values.

    Every key is created only when absent, so Streamlit reruns keep any
    state the user has already accumulated. Defaults are expressed as
    zero-argument factories so that expensive objects (the RAG search
    engine) are only constructed when the key is actually missing.
    """
    # key -> factory producing its default value
    defaults = {
        'files': lambda: [],
        'downloaded_paths': lambda: [],
        'download_complete': lambda: False,
        'selected_tab': lambda: 0,
        'rag_search': EnhancedRAGSearch,   # built lazily, only on first run
        'keep_progress': lambda: False,
        'google_credentials': lambda: None,
        'mode': lambda: "Standard",
        'use_proxy': lambda: False,
        'proxy_string': lambda: None,
        'stealth_mode': lambda: True,
    }
    for key, make_default in defaults.items():
        if key not in st.session_state:
            # Item assignment is equivalent to attribute assignment on
            # st.session_state per the Streamlit session-state API.
            st.session_state[key] = make_default()
def main():
    """Render the Advanced File Downloader Streamlit app.

    Builds the four main tabs (Search & Download, Local File Search,
    Advanced Configuration, Help) and wires the search / clear / download
    buttons to the async ``DownloadManager``. All cross-rerun state lives
    in ``st.session_state`` (seeded by ``initialize_session_state``).
    """
    # Initialize session state
    initialize_session_state()
    # Set up UI styling
    setup_ui()
    # Create sidebar
    create_sidebar()

    # Header section
    col1, col2 = st.columns([5, 1])
    with col1:
        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
    with col2:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)

    # One-line description for whichever mode the sidebar selected.
    mode_descriptions = {
        "Standard": "A versatile tool for discovering and downloading files from any website.",
        "Education Mode": "Optimized for educational resources, exams, and academic materials.",
        "Research Mode": "Focused on research papers, datasets, and academic publications.",
        "Media Mode": "Enhanced for finding and downloading images, videos, and audio files."
    }
    st.markdown(f"<p class='info-text'>{mode_descriptions[st.session_state.mode]}</p>", unsafe_allow_html=True)

    # Main tabs
    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])

    # Tab 1: Search & Download
    with tabs[0]:
        st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)
        col1, col2 = st.columns([3, 1])
        with col1:
            url = st.text_input("Enter a URL to search for downloadable files:",
                               placeholder="e.g., https://example.com/resources",
                               value=st.session_state.get('preset_url', ''))
        with col2:
            # Initialize search_method with either session state or default value
            initial_search_method = st.session_state.get('search_method', "Deep Search")
            search_method = st.selectbox("Search Method",
                                        ["Deep Search", "Quick Search", "Exam Site Mode"],
                                        index=["Deep Search", "Quick Search", "Exam Site Mode"].index(initial_search_method))
            # Update session state when changed
            if search_method != st.session_state.get('search_method'):
                st.session_state.search_method = search_method

        # Advanced options in an expander
        with st.expander("Search Options", expanded=False):
            col1, col2, col3 = st.columns(3)
            with col1:
                depth = st.slider("Search Depth", min_value=1, max_value=5, value=2,
                                 help="Higher values will search more links but take longer")
                # NOTE(review): prioritize_pdfs is collected but not used by the
                # search invocation below — presumably intended for DownloadManager.
                prioritize_pdfs = st.checkbox("Prioritize PDFs",
                                            value=st.session_state.get('prioritize_pdfs', True),
                                            help="Focus on finding PDF files first")
            with col2:
                timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
                # NOTE(review): follow_subdomains is never read after this point.
                follow_subdomains = st.checkbox("Follow Subdomains", value=True,
                                              help="Include links from subdomains in the search")
            with col3:
                # Default extensions based on mode
                default_extensions = {
                    "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
                    "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
                    "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
                    "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov"
                }
                custom_extensions = st.text_area(
                    "Custom File Extensions",
                    value=st.session_state.get('custom_extensions', default_extensions[st.session_state.mode]),
                    help="Comma-separated list of file extensions to look for"
                )
                # Update session state when extensions changed
                if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions:
                    st.session_state.custom_extensions = custom_extensions

        search_col1, search_col2 = st.columns([4, 1])
        with search_col1:
            search_button = st.button("🔍 Start Search", use_container_width=True)
        with search_col2:
            clear_button = st.button("🧹 Clear Results", use_container_width=True)

        # File results section
        if st.session_state.files:
            # Display file results
            selected_files, displayed_files = display_file_results(st.session_state.files)

            # Download options
            if selected_files:
                col1, col2 = st.columns(2)
                with col1:
                    download_dir = st.text_input("Download Directory", value="downloads")
                with col2:
                    download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)

                download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
                with download_col1:
                    download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
                with download_col2:
                    # Disabled until the user has authenticated with Google.
                    google_drive_button = st.button("📤 Upload to Drive",
                                                  use_container_width=True,
                                                  disabled=not st.session_state.google_credentials)
                with download_col3:
                    select_all = st.button("Select All Files", use_container_width=True)

                # Handle select all button
                if select_all:
                    # display_file_results is expected to key its checkboxes as
                    # "select_{i}" — TODO confirm against app.ui.
                    for i in displayed_files:
                        st.session_state[f"select_{i}"] = True
                    st.rerun()

                # Handle download button if clicked
                if download_button:
                    # Create download directory
                    os.makedirs(download_dir, exist_ok=True)
                    handle_downloads(selected_files, download_dir, download_option, download_col1)

                # Handle Google Drive upload
                if google_drive_button:
                    handle_google_drive_upload(selected_files)

    # Tab 2: Local File Search
    with tabs[1]:
        st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
        st.write("Upload files to search through their content with AI-powered semantic search.")

        # File upload
        uploaded_files = st.file_uploader("Upload documents for search",
                                        accept_multiple_files=True,
                                        type=['pdf', 'docx', 'txt', 'csv', 'json'])

        if uploaded_files:
            # Build search index on upload
            col1, col2 = st.columns([4, 1])
            with col1:
                use_transformer = st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
                                            help="Uses advanced AI for more accurate semantic search (if available)")
            with col2:
                if st.button("Build Search Index", use_container_width=True):
                    with st.spinner("Processing files and building search index..."):
                        files_added = 0
                        for uploaded_file in uploaded_files:
                            file_info = {
                                'filename': uploaded_file.name,
                                'url': f'local://{uploaded_file.name}',
                                # NOTE(review): humanize_file_size is not imported
                                # anywhere in this module — this line raises
                                # NameError when reached. It presumably lives in
                                # app.utils; add it to the imports — TODO confirm.
                                'size': humanize_file_size(uploaded_file.size)
                            }
                            success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info)
                            if success:
                                files_added += 1

                        if files_added > 0:
                            index_built = st.session_state.rag_search.build_index()
                            if index_built:
                                st.success(f"✅ Successfully indexed {files_added} files!")
                            else:
                                st.error("Failed to build search index.")
                        else:
                            st.warning("No valid text could be extracted from the files.")

        # Search interface
        st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)
        col1, col2 = st.columns([4, 1])
        with col1:
            query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
        with col2:
            # NOTE(review): expand_query is collected but not passed to search().
            expand_query = st.checkbox("Auto-expand query", value=True,
                                      help="Automatically add related terms to your search")

        col1, col2 = st.columns([4, 1])
        with col1:
            if st.button("🔍 Search Documents", use_container_width=True):
                if not query:
                    st.warning("Please enter a search query")
                else:
                    with st.spinner("Searching..."):
                        # NOTE(review): top_k is hard-coded to 5; the
                        # "Max results" widget below (num_results) is ignored.
                        results = st.session_state.rag_search.search(query, top_k=5, search_chunks=True)

                        if results:
                            st.markdown(f"**Found {len(results)} relevant documents:**")
                            for i, result in enumerate(results):
                                with st.container():
                                    st.markdown(f"<div class='result-card'>", unsafe_allow_html=True)
                                    st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")
                                    if result.get('chunk_preview'):
                                        st.markdown("**Matching content:**")
                                        st.text(result['chunk_preview'])
                                    st.markdown("</div>", unsafe_allow_html=True)
                        else:
                            st.info("No matching documents found. Try a different query.")
        with col2:
            num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)

        # Quick search tips
        with st.expander("Search Tips", expanded=False):
            st.markdown("""
            ### Effective Search Tips

            - **Be specific** with your queries for more accurate results
            - **Try different phrasings** if you don't get the results you expect
            - Use **quotation marks** for exact phrase matching
            - For **complex topics**, break down your search into multiple queries
            - **Combine related terms** to improve recall

            The search engine uses advanced algorithms to understand the semantic meaning of your query,
            not just keyword matching.
            """)

    # Tab 3: Advanced Configuration
    with tabs[2]:
        st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)

        config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])

        # Browser Settings tab
        with config_tabs[0]:
            col1, col2 = st.columns(2)
            with col1:
                use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
                                        help="Makes browser harder to detect as automated, but may be slower")
                handle_captchas = st.checkbox("Handle Captchas Automatically", value=False,
                                            help="Attempt to solve simple captchas automatically")
                download_timeout = st.slider("Download Timeout (seconds)",
                                           min_value=30, max_value=600, value=300,
                                           help="Maximum time to wait for downloads to complete")
            with col2:
                user_agent = st.selectbox("User Agent", USER_AGENTS, index=0,
                                        help="Browser identity to use when accessing websites")
                save_screenshots = st.checkbox("Save Browser Screenshots", value=False,
                                             help="Save screenshots when errors occur for debugging")
                browser_lang = st.selectbox("Browser Language",
                                          ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
                                          index=0)

            # NOTE(review): only stealth_mode is persisted here; the other
            # browser widgets above are read but never stored or applied.
            if st.button("Update Browser Settings"):
                st.session_state.stealth_mode = use_stealth
                st.success("Browser settings updated!")

            # Dependency installation section
            st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
            if st.button("Install Playwright Dependencies"):
                from app.ui import install_playwright_dependencies
                with st.spinner("Installing dependencies..."):
                    install_playwright_dependencies()

        # Proxy Configuration tab
        with config_tabs[1]:
            proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
                                      help="Route requests through a proxy server for anonymity or bypassing restrictions")

            if proxy_enabled:
                proxy_col1, proxy_col2 = st.columns(2)
                with proxy_col1:
                    proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
                    proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
                with proxy_col2:
                    proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
                    proxy_auth = st.text_input("Proxy Authentication (optional)",
                                             placeholder="username:password", type="password")

            st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
            use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
                                           help="Automatically rotate between multiple proxies for better anonymity")

            if use_proxy_rotation:
                proxy_list = st.text_area("Proxy List (one per line)",
                                        placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
                rotation_interval = st.slider("Rotation Interval (requests)",
                                            min_value=1, max_value=50, value=10,
                                            help="How often to switch proxies")

            if st.button("Save Proxy Configuration"):
                # Construct the proxy string
                # Short-circuiting on proxy_enabled keeps this safe when the
                # host/port widgets (defined only when enabled) don't exist.
                proxy_string = None
                if proxy_enabled and proxy_host and proxy_port:
                    proxy_prefix = f"{proxy_type.lower()}://"
                    proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
                    proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"

                # Update session state
                st.session_state.use_proxy = proxy_enabled
                st.session_state.proxy_string = proxy_string

                # Configure proxy rotation if enabled
                from app.utils import PROXY_ROTATION_CONFIG
                if use_proxy_rotation and proxy_list:
                    PROXY_ROTATION_CONFIG["enabled"] = True
                    PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
                    PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]

                st.success("Proxy configuration updated!")

        # Download Options tab
        with config_tabs[2]:
            col1, col2 = st.columns(2)
            with col1:
                st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)
                skip_existing = st.checkbox("Skip Existing Files", value=True,
                                          help="Don't download files that already exist locally")
                auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
                                        help="Automatically rename files instead of overwriting")
                verify_downloads = st.checkbox("Verify Downloads", value=True,
                                             help="Check file integrity after download")
                max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
                                      help="Number of times to retry failed downloads")
            with col2:
                st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)
                auto_organize = st.checkbox("Auto-Organize Files", value=True,
                                          help="Automatically organize files by type")
                default_dir = st.text_input("Default Download Directory", value="downloads",
                                          help="Default location to save downloaded files")
                org_by_domain = st.checkbox("Organize by Domain", value=False,
                                          help="Create subdirectories based on source domains")
                org_by_type = st.checkbox("Organize by File Type", value=False,
                                        help="Create subdirectories based on file types")

            if st.button("Save Download Settings"):
                # Persisted for other components to read; not consumed in main().
                st.session_state.download_settings = {
                    "skip_existing": skip_existing,
                    "auto_rename": auto_rename,
                    "verify_downloads": verify_downloads,
                    "max_retries": max_retries,
                    "auto_organize": auto_organize,
                    "default_dir": default_dir,
                    "org_by_domain": org_by_domain,
                    "org_by_type": org_by_type
                }
                st.success("Download settings saved!")

        # System tab
        with config_tabs[3]:
            col1, col2 = st.columns(2)
            with col1:
                st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)
                max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
                                         help="Maximum number of simultaneous downloads")
                memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
                                       help="Maximum memory to use for file processing")
                processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
                                             help="Number of threads to use for file processing")
            with col2:
                st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)
                log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
                                       help="Detail level for application logs")
                save_debug_info = st.checkbox("Save Debug Information", value=False,
                                            help="Save detailed information about program execution")
                log_dir = st.text_input("Log Directory", value="logs",
                                      help="Directory to save log files")

            if st.button("Apply System Settings"):
                import logging
                st.session_state.system_settings = {
                    "max_concurrent": max_concurrent,
                    "memory_limit": memory_limit,
                    "processing_threads": processing_threads,
                    "log_level": log_level,
                    "save_debug_info": save_debug_info,
                    "log_dir": log_dir
                }

                # Update logging configuration
                log_level_num = getattr(logging, log_level)
                logging.getLogger().setLevel(log_level_num)

                st.success("System settings applied!")

            # Reset application button
            st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
            reset_col1, reset_col2 = st.columns([1, 3])
            with reset_col1:
                if st.button("Reset Application", use_container_width=True):
                    for key in list(st.session_state.keys()):
                        if key != 'google_credentials':  # Preserve Google auth
                            del st.session_state[key]
                    st.success("Application has been reset!")
                    st.rerun()
            with reset_col2:
                st.info("This will clear all search results, downloaded files, and reset settings to defaults.")

    # Tab 4: Help
    with tabs[3]:
        st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)

        help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])

        with help_tabs[0]:
            st.markdown("""
            ### Getting Started

            1. **Enter a URL** on the Search & Download tab
            2. Select a **Search Method**:
               - **Deep Search**: Thorough but slower
               - **Quick Search**: Fast but may miss some files
               - **Exam Site Mode**: Optimized for educational resource sites
            3. Click **Start Search** to find downloadable files
            4. Select files you want to download
            5. Click **Download Selected Files**

            #### Using Different Modes

            Select a mode from the sidebar to optimize the tool for different use cases:

            - **Standard Mode**: Balanced for general use
            - **Education Mode**: Optimized for finding academic materials
            - **Research Mode**: Better for research papers and datasets
            - **Media Mode**: Enhanced for finding images, videos, and audio

            For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
            """)

        with help_tabs[1]:
            st.markdown("""
            ### Advanced Features

            - **Local File Search**: Upload files and search through their content using the enhanced RAG search
            - **Custom Extensions**: Specify additional file types to look for beyond the default set
            - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
            - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
            - **Google Drive Integration**: Upload downloaded files directly to your Google Drive

            #### Search Tips

            - For educational sites, include specific terms like "exam", "test", "paper" in the URL
            - When using Local File Search, try different variations of your query for better results
            - Use filtering and sorting options to find the most relevant files quickly

            #### File Organization

            You can configure automatic file organization in the Advanced Configuration tab:

            - **Organize by Domain**: Creates folders based on the source website
            - **Organize by File Type**: Separates files into folders by their extension
            - **Auto-Rename**: Prevents overwriting existing files with same names
            """)

        with help_tabs[2]:
            st.markdown("""
            ### Troubleshooting

            #### Common Issues

            - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
            - **Downloads failing**: Check if the site requires authentication or uses captchas
            - **Slow performance**: Reduce search depth or disable stealth mode for faster results
            - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings

            #### Captcha Issues

            Some websites use captchas to prevent automated access. If you encounter captchas:

            1. Try using a different proxy
            2. Enable "Handle Captchas Automatically" for simple captchas
            3. For complex captchas, you may need to manually access the site first

            #### Proxy Problems

            If you're having issues with proxies:

            1. Verify your proxy is working with an external tool
            2. Check that you've entered the correct format (http://host:port)
            3. Some websites may block known proxy IPs

            #### Memory Usage

            If the application is using too much memory:

            1. Reduce the "Memory Limit" in System settings
            2. Process fewer files at once
            3. Use lower search depth values
            """)

        with help_tabs[3]:
            st.markdown("""
            ### About This Tool

            **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.

            #### Key Features

            - **Smart Discovery**: Finds downloadable files even when they're not directly linked
            - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
            - **Educational Focus**: Specialized detection for exam papers and academic resources
            - **Stealth Capabilities**: Avoids detection by anti-scraping measures

            #### Technical Details

            This tool uses:

            - **Playwright**: For browser automation and stealth capabilities
            - **Sentence Transformers**: For AI-powered semantic search
            - **Streamlit**: For the user interface
            - **Google Drive API**: For cloud integration

            #### Credits

            Created with Python, Streamlit, Playwright, and various AI libraries.

            For issues or suggestions, please contact the developer.

            Version 2.0 - March 2025
            """)

    # Handle search button
    # (search_button / url / custom_extensions etc. remain in scope here
    # because Streamlit `with` blocks do not create a new Python scope.)
    if search_button and url:
        # Reset files and downloaded paths
        st.session_state.files = []
        st.session_state.downloaded_paths = []
        st.session_state.download_complete = False

        # Clear the preset URL if it was used
        if 'preset_url' in st.session_state:
            st.session_state.preset_url = ''

        # Prepare custom extensions
        custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()]

        # Configure search parameters based on method
        sublink_limit = 5000 if search_method == "Deep Search" else 1000
        search_depth = depth if search_method == "Deep Search" else 1
        is_exam_site = search_method == "Exam Site Mode"

        # Execute the search asynchronously
        async def run_search():
            async with DownloadManager(
                use_proxy=st.session_state.use_proxy,
                proxy=st.session_state.proxy_string,
                use_stealth=st.session_state.stealth_mode
            ) as manager:
                # For exam sites, use specialized approach
                if is_exam_site:
                    st.session_state.keep_progress = True
                    edu_links = await manager.get_edu_exam_links(url)
                    all_files = []

                    progress_text = st.empty()
                    progress_bar = st.progress(0)

                    # Process each exam link
                    for i, link in enumerate(edu_links):
                        progress = (i+1) / max(1, len(edu_links))
                        progress_text.text(f"Processing exam link {i+1}/{len(edu_links)}: {link}")
                        progress_bar.progress(progress)

                        files = await manager.extract_downloadable_files(link, custom_ext_list)
                        all_files.extend(files)

                    st.session_state.files = all_files
                    progress_text.empty()
                    progress_bar.empty()
                    st.session_state.keep_progress = False

                else:
                    # Use general search method
                    # NOTE(review): search_depth is computed above but deep_search
                    # is called with sublink_limit and timeout only — confirm the
                    # depth parameter is intentionally unused here.
                    files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout)
                    st.session_state.files = files

        # Run the search
        asyncio.run(run_search())
        st.rerun()

    # Handle clear button
    if clear_button:
        st.session_state.files = []
        st.session_state.downloaded_paths = []
        st.session_state.download_complete = False
        if 'preset_url' in st.session_state:
            st.session_state.preset_url = ''
        st.rerun()
# Entry point: launch the app when this file is executed directly
# (e.g. `streamlit run main.py`); importing the module has no side effects.
if __name__ == "__main__":
    main()