File size: 32,273 Bytes
6158c43 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 |
import os
import asyncio
import streamlit as st
from app.ui import (
setup_ui, create_sidebar, display_file_results,
handle_downloads, handle_google_drive_upload
)
from app.download_manager import DownloadManager
from app.rag_search import EnhancedRAGSearch
from app.utils import USER_AGENTS
def initialize_session_state():
    """Seed ``st.session_state`` with a default for every missing key.

    Keys that are already present are left untouched. Each default comes
    from a zero-argument factory, so a value (notably the
    ``EnhancedRAGSearch`` instance) is only constructed when its key is
    absent from the session.
    """
    default_factories = (
        ('files', list),
        ('downloaded_paths', list),
        ('download_complete', lambda: False),
        ('selected_tab', lambda: 0),
        ('rag_search', lambda: EnhancedRAGSearch()),
        ('keep_progress', lambda: False),
        ('google_credentials', lambda: None),
        ('mode', lambda: "Standard"),
        ('use_proxy', lambda: False),
        ('proxy_string', lambda: None),
        ('stealth_mode', lambda: True),
    )
    state = st.session_state
    for key, make_default in default_factories:
        if key not in state:
            state[key] = make_default()
# Search methods offered in the "Search Method" select box, in display order.
_SEARCH_METHODS = ["Deep Search", "Quick Search", "Exam Site Mode"]

# Mode whose entries are used when st.session_state.mode is not a known key.
_FALLBACK_MODE = "Standard"

# One-line description shown under the page title for each mode.
_MODE_DESCRIPTIONS = {
    "Standard": "A versatile tool for discovering and downloading files from any website.",
    "Education Mode": "Optimized for educational resources, exams, and academic materials.",
    "Research Mode": "Focused on research papers, datasets, and academic publications.",
    "Media Mode": "Enhanced for finding and downloading images, videos, and audio files.",
}

# Default comma-separated extension list pre-filled for each mode.
_DEFAULT_EXTENSIONS = {
    "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
    "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
    "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
    "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov",
}

# Markdown bodies are kept at module level so their text stays flush-left;
# leading indentation would change how Markdown renders them.
_SEARCH_TIPS_MD = """
### Effective Search Tips
- **Be specific** with your queries for more accurate results
- **Try different phrasings** if you don't get the results you expect
- Use **quotation marks** for exact phrase matching
- For **complex topics**, break down your search into multiple queries
- **Combine related terms** to improve recall
The search engine uses advanced algorithms to understand the semantic meaning of your query,
not just keyword matching.
"""

_HELP_QUICK_START_MD = """
### Getting Started
1. **Enter a URL** on the Search & Download tab
2. Select a **Search Method**:
- **Deep Search**: Thorough but slower
- **Quick Search**: Fast but may miss some files
- **Exam Site Mode**: Optimized for educational resource sites
3. Click **Start Search** to find downloadable files
4. Select files you want to download
5. Click **Download Selected Files**
#### Using Different Modes
Select a mode from the sidebar to optimize the tool for different use cases:
- **Standard Mode**: Balanced for general use
- **Education Mode**: Optimized for finding academic materials
- **Research Mode**: Better for research papers and datasets
- **Media Mode**: Enhanced for finding images, videos, and audio
For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
"""

_HELP_ADVANCED_MD = """
### Advanced Features
- **Local File Search**: Upload files and search through their content using the enhanced RAG search
- **Custom Extensions**: Specify additional file types to look for beyond the default set
- **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
- **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
- **Google Drive Integration**: Upload downloaded files directly to your Google Drive
#### Search Tips
- For educational sites, include specific terms like "exam", "test", "paper" in the URL
- When using Local File Search, try different variations of your query for better results
- Use filtering and sorting options to find the most relevant files quickly
#### File Organization
You can configure automatic file organization in the Advanced Configuration tab:
- **Organize by Domain**: Creates folders based on the source website
- **Organize by File Type**: Separates files into folders by their extension
- **Auto-Rename**: Prevents overwriting existing files with same names
"""

_HELP_TROUBLESHOOTING_MD = """
### Troubleshooting
#### Common Issues
- **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
- **Downloads failing**: Check if the site requires authentication or uses captchas
- **Slow performance**: Reduce search depth or disable stealth mode for faster results
- **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings
#### Captcha Issues
Some websites use captchas to prevent automated access. If you encounter captchas:
1. Try using a different proxy
2. Enable "Handle Captchas Automatically" for simple captchas
3. For complex captchas, you may need to manually access the site first
#### Proxy Problems
If you're having issues with proxies:
1. Verify your proxy is working with an external tool
2. Check that you've entered the correct format (http://host:port)
3. Some websites may block known proxy IPs
#### Memory Usage
If the application is using too much memory:
1. Reduce the "Memory Limit" in System settings
2. Process fewer files at once
3. Use lower search depth values
"""

_HELP_ABOUT_MD = """
### About This Tool
**Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.
#### Key Features
- **Smart Discovery**: Finds downloadable files even when they're not directly linked
- **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
- **Educational Focus**: Specialized detection for exam papers and academic resources
- **Stealth Capabilities**: Avoids detection by anti-scraping measures
#### Technical Details
This tool uses:
- **Playwright**: For browser automation and stealth capabilities
- **Sentence Transformers**: For AI-powered semantic search
- **Streamlit**: For the user interface
- **Google Drive API**: For cloud integration
#### Credits
Created with Python, Streamlit, Playwright, and various AI libraries.
For issues or suggestions, please contact the developer.
Version 2.0 - March 2025
"""


def humanize_file_size(num_bytes):
    """Return a short human-readable size string such as ``"1.5 KB"``.

    The local-file indexing code calls this name, but the file neither
    imports nor defines it, so it is provided here.

    Args:
        num_bytes: Size in bytes; ``None`` is treated as 0.

    Returns:
        Whole bytes as ``"<n> B"``; larger sizes with one decimal and a
        binary (1024-based) unit up to TB.
    """
    size = float(num_bytes or 0)
    units = ("B", "KB", "MB", "GB", "TB")
    for unit in units[:-1]:
        if size < 1024:
            break
        size /= 1024
    else:
        # Every smaller unit was exhausted, so report in the largest one.
        unit = units[-1]
    if unit == "B":
        return f"{int(size)} B"
    return f"{size:.1f} {unit}"


def _render_header():
    """Draw the page title, icon, and the description of the active mode."""
    col1, col2 = st.columns([5, 1])
    with col1:
        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
    with col2:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)
    # Fall back to the Standard text rather than raising KeyError when the
    # session holds a mode this file does not know about.
    description = _MODE_DESCRIPTIONS.get(
        st.session_state.mode, _MODE_DESCRIPTIONS[_FALLBACK_MODE]
    )
    st.markdown(f"<p class='info-text'>{description}</p>", unsafe_allow_html=True)


def _render_download_controls(selected_files, displayed_files):
    """Draw download/upload controls for the selected files and act on clicks."""
    col1, col2 = st.columns(2)
    with col1:
        download_dir = st.text_input("Download Directory", value="downloads")
    with col2:
        download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)
    download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
    with download_col1:
        download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
    with download_col2:
        google_drive_button = st.button("📤 Upload to Drive",
                                        use_container_width=True,
                                        disabled=not st.session_state.google_credentials)
    with download_col3:
        select_all = st.button("Select All Files", use_container_width=True)

    if select_all:
        # Mark every displayed entry as selected, then rerun so the
        # checkboxes pick up the new state.
        for i in displayed_files:
            st.session_state[f"select_{i}"] = True
        st.rerun()

    if download_button:
        if not download_dir.strip():
            # os.makedirs("") raises FileNotFoundError; ask for a path instead.
            st.warning("Please enter a download directory")
        else:
            os.makedirs(download_dir, exist_ok=True)
            handle_downloads(selected_files, download_dir, download_option, download_col1)

    if google_drive_button:
        handle_google_drive_upload(selected_files)


def _render_search_tab():
    """Draw the "Search & Download" tab.

    Returns:
        dict with the widget values the search/clear handlers need:
        ``url``, ``search_method``, ``timeout``, ``custom_extensions``,
        ``search_button`` and ``clear_button``.
    """
    st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)
    col1, col2 = st.columns([3, 1])
    with col1:
        url = st.text_input("Enter a URL to search for downloadable files:",
                            placeholder="e.g., https://example.com/resources",
                            value=st.session_state.get('preset_url', ''))
    with col2:
        initial_search_method = st.session_state.get('search_method', _SEARCH_METHODS[0])
        # A stale or foreign session value must not crash list.index().
        if initial_search_method not in _SEARCH_METHODS:
            initial_search_method = _SEARCH_METHODS[0]
        search_method = st.selectbox("Search Method",
                                     _SEARCH_METHODS,
                                     index=_SEARCH_METHODS.index(initial_search_method))
        if search_method != st.session_state.get('search_method'):
            st.session_state.search_method = search_method

    with st.expander("Search Options", expanded=False):
        col1, col2, col3 = st.columns(3)
        with col1:
            # NOTE(review): the depth, "Prioritize PDFs" and "Follow
            # Subdomains" values are displayed but not passed to the search
            # calls made in this file.
            st.slider("Search Depth", min_value=1, max_value=5, value=2,
                      help="Higher values will search more links but take longer")
            st.checkbox("Prioritize PDFs",
                        value=st.session_state.get('prioritize_pdfs', True),
                        help="Focus on finding PDF files first")
        with col2:
            timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
            st.checkbox("Follow Subdomains", value=True,
                        help="Include links from subdomains in the search")
        with col3:
            mode_default = _DEFAULT_EXTENSIONS.get(
                st.session_state.mode, _DEFAULT_EXTENSIONS[_FALLBACK_MODE]
            )
            custom_extensions = st.text_area(
                "Custom File Extensions",
                value=st.session_state.get('custom_extensions', mode_default),
                help="Comma-separated list of file extensions to look for"
            )
            if ('custom_extensions' not in st.session_state
                    or custom_extensions != st.session_state.custom_extensions):
                st.session_state.custom_extensions = custom_extensions

    search_col1, search_col2 = st.columns([4, 1])
    with search_col1:
        search_button = st.button("🔍 Start Search", use_container_width=True)
    with search_col2:
        clear_button = st.button("🧹 Clear Results", use_container_width=True)

    if st.session_state.files:
        selected_files, displayed_files = display_file_results(st.session_state.files)
        if selected_files:
            _render_download_controls(selected_files, displayed_files)

    return {
        "url": url,
        "search_method": search_method,
        "timeout": timeout,
        "custom_extensions": custom_extensions,
        "search_button": search_button,
        "clear_button": clear_button,
    }


def _build_local_index(uploaded_files):
    """Add the uploaded files to the session's RAG search and build its index."""
    rag_search = st.session_state.rag_search
    with st.spinner("Processing files and building search index..."):
        files_added = 0
        for uploaded_file in uploaded_files:
            file_info = {
                'filename': uploaded_file.name,
                'url': f'local://{uploaded_file.name}',
                'size': humanize_file_size(uploaded_file.size)
            }
            if rag_search.add_file(uploaded_file.getvalue(), file_info):
                files_added += 1
        if files_added > 0:
            if rag_search.build_index():
                st.success(f"✅ Successfully indexed {files_added} files!")
            else:
                st.error("Failed to build search index.")
        else:
            st.warning("No valid text could be extracted from the files.")


def _render_local_search_tab():
    """Draw the "Local File Search" tab: upload, index, and query documents."""
    st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
    st.write("Upload files to search through their content with AI-powered semantic search.")
    uploaded_files = st.file_uploader("Upload documents for search",
                                      accept_multiple_files=True,
                                      type=['pdf', 'docx', 'txt', 'csv', 'json'])
    if uploaded_files:
        col1, col2 = st.columns([4, 1])
        with col1:
            # NOTE(review): this checkbox value is not applied anywhere in
            # this file; confirm how the transformer model should be toggled.
            st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
                        help="Uses advanced AI for more accurate semantic search (if available)")
        with col2:
            if st.button("Build Search Index", use_container_width=True):
                _build_local_index(uploaded_files)

    st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)
    col1, col2 = st.columns([4, 1])
    with col1:
        query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
    with col2:
        # NOTE(review): "Auto-expand query" is displayed but not passed to
        # the search call below.
        st.checkbox("Auto-expand query", value=True,
                    help="Automatically add related terms to your search")

    col1, col2 = st.columns([4, 1])
    # Read the result limit before handling the button so the search can
    # honour it; the widget still renders in the right-hand column.
    with col2:
        num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
    with col1:
        if st.button("🔍 Search Documents", use_container_width=True):
            if not query:
                st.warning("Please enter a search query")
            else:
                with st.spinner("Searching..."):
                    results = st.session_state.rag_search.search(
                        query, top_k=int(num_results), search_chunks=True
                    )
                if results:
                    st.markdown(f"**Found {len(results)} relevant documents:**")
                    for i, result in enumerate(results):
                        with st.container():
                            st.markdown("<div class='result-card'>", unsafe_allow_html=True)
                            st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")
                            if result.get('chunk_preview'):
                                st.markdown("**Matching content:**")
                                st.text(result['chunk_preview'])
                            st.markdown("</div>", unsafe_allow_html=True)
                else:
                    st.info("No matching documents found. Try a different query.")

    with st.expander("Search Tips", expanded=False):
        st.markdown(_SEARCH_TIPS_MD)


def _render_browser_settings():
    """Draw the "Browser Settings" sub-tab; only stealth mode is persisted."""
    col1, col2 = st.columns(2)
    with col1:
        use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
                                  help="Makes browser harder to detect as automated, but may be slower")
        # NOTE(review): the remaining widgets in this sub-tab are displayed
        # but their values are not stored or used in this file.
        st.checkbox("Handle Captchas Automatically", value=False,
                    help="Attempt to solve simple captchas automatically")
        st.slider("Download Timeout (seconds)",
                  min_value=30, max_value=600, value=300,
                  help="Maximum time to wait for downloads to complete")
    with col2:
        st.selectbox("User Agent", USER_AGENTS, index=0,
                     help="Browser identity to use when accessing websites")
        st.checkbox("Save Browser Screenshots", value=False,
                    help="Save screenshots when errors occur for debugging")
        st.selectbox("Browser Language",
                     ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
                     index=0)
    if st.button("Update Browser Settings"):
        st.session_state.stealth_mode = use_stealth
        st.success("Browser settings updated!")

    st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
    if st.button("Install Playwright Dependencies"):
        from app.ui import install_playwright_dependencies
        with st.spinner("Installing dependencies..."):
            install_playwright_dependencies()


def _render_proxy_settings():
    """Draw the "Proxy Configuration" sub-tab and save it on request."""
    proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
                                help="Route requests through a proxy server for anonymity or bypassing restrictions")
    # Defaults keep the save handler safe when the widgets below are not
    # rendered (proxy disabled, or rotation disabled).
    proxy_type, proxy_host, proxy_port, proxy_auth = "HTTP", "", "", ""
    use_proxy_rotation, proxy_list, rotation_interval = False, "", 10

    if proxy_enabled:
        proxy_col1, proxy_col2 = st.columns(2)
        with proxy_col1:
            proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
            proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
        with proxy_col2:
            proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
            proxy_auth = st.text_input("Proxy Authentication (optional)",
                                       placeholder="username:password", type="password")
        st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
        use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
                                         help="Automatically rotate between multiple proxies for better anonymity")
        if use_proxy_rotation:
            proxy_list = st.text_area("Proxy List (one per line)",
                                      placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
            rotation_interval = st.slider("Rotation Interval (requests)",
                                          min_value=1, max_value=50, value=10,
                                          help="How often to switch proxies")

    if st.button("Save Proxy Configuration"):
        # A proxy URL is only built when a host and a port were both given.
        proxy_string = None
        if proxy_enabled and proxy_host and proxy_port:
            proxy_prefix = f"{proxy_type.lower()}://"
            proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
            proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"
        st.session_state.use_proxy = proxy_enabled
        st.session_state.proxy_string = proxy_string

        from app.utils import PROXY_ROTATION_CONFIG
        if use_proxy_rotation and proxy_list:
            PROXY_ROTATION_CONFIG["enabled"] = True
            PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
            PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]
        st.success("Proxy configuration updated!")


def _render_download_settings():
    """Draw the "Download Options" sub-tab and store it in the session."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)
        skip_existing = st.checkbox("Skip Existing Files", value=True,
                                    help="Don't download files that already exist locally")
        auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
                                  help="Automatically rename files instead of overwriting")
        verify_downloads = st.checkbox("Verify Downloads", value=True,
                                       help="Check file integrity after download")
        max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
                                help="Number of times to retry failed downloads")
    with col2:
        st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)
        auto_organize = st.checkbox("Auto-Organize Files", value=True,
                                    help="Automatically organize files by type")
        default_dir = st.text_input("Default Download Directory", value="downloads",
                                    help="Default location to save downloaded files")
        org_by_domain = st.checkbox("Organize by Domain", value=False,
                                    help="Create subdirectories based on source domains")
        org_by_type = st.checkbox("Organize by File Type", value=False,
                                  help="Create subdirectories based on file types")
    if st.button("Save Download Settings"):
        st.session_state.download_settings = {
            "skip_existing": skip_existing,
            "auto_rename": auto_rename,
            "verify_downloads": verify_downloads,
            "max_retries": max_retries,
            "auto_organize": auto_organize,
            "default_dir": default_dir,
            "org_by_domain": org_by_domain,
            "org_by_type": org_by_type
        }
        st.success("Download settings saved!")


def _render_system_settings():
    """Draw the "System" sub-tab: performance, logging, and the reset button."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)
        max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
                                   help="Maximum number of simultaneous downloads")
        memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
                                 help="Maximum memory to use for file processing")
        processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
                                       help="Number of threads to use for file processing")
    with col2:
        st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)
        log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
                                 help="Detail level for application logs")
        save_debug_info = st.checkbox("Save Debug Information", value=False,
                                      help="Save detailed information about program execution")
        log_dir = st.text_input("Log Directory", value="logs",
                                help="Directory to save log files")
    if st.button("Apply System Settings"):
        import logging
        st.session_state.system_settings = {
            "max_concurrent": max_concurrent,
            "memory_limit": memory_limit,
            "processing_threads": processing_threads,
            "log_level": log_level,
            "save_debug_info": save_debug_info,
            "log_dir": log_dir
        }
        # The select box only offers names that are logging-level attributes.
        logging.getLogger().setLevel(getattr(logging, log_level))
        st.success("System settings applied!")

    st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
    reset_col1, reset_col2 = st.columns([1, 3])
    with reset_col1:
        if st.button("Reset Application", use_container_width=True):
            # Iterate over a copy of the keys because entries are deleted
            # while looping.
            for key in list(st.session_state.keys()):
                if key != 'google_credentials':  # Preserve Google auth
                    del st.session_state[key]
            st.success("Application has been reset!")
            st.rerun()
    with reset_col2:
        st.info("This will clear all search results, downloaded files, and reset settings to defaults.")


def _render_advanced_config_tab():
    """Draw the "Advanced Configuration" tab and its four sub-tabs."""
    st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)
    config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])
    with config_tabs[0]:
        _render_browser_settings()
    with config_tabs[1]:
        _render_proxy_settings()
    with config_tabs[2]:
        _render_download_settings()
    with config_tabs[3]:
        _render_system_settings()


def _render_help_tab():
    """Draw the "Help" tab with its four static documentation pages."""
    st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)
    help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])
    pages = (_HELP_QUICK_START_MD, _HELP_ADVANCED_MD, _HELP_TROUBLESHOOTING_MD, _HELP_ABOUT_MD)
    for help_tab, page in zip(help_tabs, pages):
        with help_tab:
            st.markdown(page)


async def _run_search(url, search_method, custom_ext_list, timeout):
    """Run one search with ``DownloadManager`` and store hits in the session.

    "Exam Site Mode" collects links via ``get_edu_exam_links`` and extracts
    files from each one while showing progress; the other methods call
    ``deep_search`` with a sub-link limit of 5000 (Deep Search) or 1000.
    """
    sublink_limit = 5000 if search_method == "Deep Search" else 1000
    # NOTE(review): the "Search Depth" slider is not forwarded here;
    # deep_search receives the same four arguments as before. Confirm
    # whether it accepts a depth parameter.
    async with DownloadManager(
        use_proxy=st.session_state.use_proxy,
        proxy=st.session_state.proxy_string,
        use_stealth=st.session_state.stealth_mode
    ) as manager:
        if search_method != "Exam Site Mode":
            files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout)
            st.session_state.files = files
            return

        st.session_state.keep_progress = True
        try:
            edu_links = await manager.get_edu_exam_links(url)
            all_files = []
            total = len(edu_links)
            progress_text = st.empty()
            progress_bar = st.progress(0)
            try:
                for i, link in enumerate(edu_links, start=1):
                    progress_text.text(f"Processing exam link {i}/{total}: {link}")
                    progress_bar.progress(i / max(1, total))
                    files = await manager.extract_downloadable_files(link, custom_ext_list)
                    all_files.extend(files)
            finally:
                # Remove the progress widgets even if an extraction fails.
                progress_text.empty()
                progress_bar.empty()
            st.session_state.files = all_files
        finally:
            # Never leave the flag stuck on after an error.
            st.session_state.keep_progress = False


def _reset_results():
    """Clear stored search results, download state, and any preset URL."""
    st.session_state.files = []
    st.session_state.downloaded_paths = []
    st.session_state.download_complete = False
    if 'preset_url' in st.session_state:
        st.session_state.preset_url = ''


def main():
    """Render the whole Streamlit page and react to the search/clear buttons.

    Session defaults, styling, and the sidebar are set up first, then the
    four main tabs are drawn. The search and clear buttons are handled
    after all tabs have rendered; both end by rerunning the script so the
    page reflects the updated session state.
    """
    initialize_session_state()
    setup_ui()
    create_sidebar()
    _render_header()

    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])
    with tabs[0]:
        request = _render_search_tab()
    with tabs[1]:
        _render_local_search_tab()
    with tabs[2]:
        _render_advanced_config_tab()
    with tabs[3]:
        _render_help_tab()

    url = request["url"].strip()
    if request["search_button"] and url:
        _reset_results()
        custom_ext_list = [
            ext.strip() for ext in request["custom_extensions"].split(",") if ext.strip()
        ]
        asyncio.run(
            _run_search(url, request["search_method"], custom_ext_list, request["timeout"])
        )
        st.rerun()
    elif request["search_button"]:
        # Previously a click with an empty URL did nothing at all.
        st.warning("Please enter a URL to search")

    if request["clear_button"]:
        _reset_results()
        st.rerun()
# Entry point: only start the app when this file is run as the main script.
if __name__ == "__main__":
    main()