# craw_web / main.py
# Provenance (Hugging Face Space file header): uploaded by euler314,
# commit 4f3b847 (verified) "Rename app/app.py to main.py", file size 32.3 kB.
import os
import asyncio
import streamlit as st
from app.ui import (
setup_ui, create_sidebar, display_file_results,
handle_downloads, handle_google_drive_upload
)
from app.download_manager import DownloadManager
from app.rag_search import EnhancedRAGSearch
from app.utils import USER_AGENTS
def initialize_session_state():
    """Seed ``st.session_state`` with the app's default values.

    Every key is created only when absent, so Streamlit reruns keep any
    state the user has already accumulated. Defaults are expressed as
    zero-argument factories so that expensive objects (the RAG search
    engine) are only constructed when the key is actually missing.
    """
    # key -> factory producing its default value
    defaults = {
        'files': lambda: [],
        'downloaded_paths': lambda: [],
        'download_complete': lambda: False,
        'selected_tab': lambda: 0,
        'rag_search': EnhancedRAGSearch,   # built lazily, only on first run
        'keep_progress': lambda: False,
        'google_credentials': lambda: None,
        'mode': lambda: "Standard",
        'use_proxy': lambda: False,
        'proxy_string': lambda: None,
        'stealth_mode': lambda: True,
    }
    for key, make_default in defaults.items():
        if key not in st.session_state:
            # Item assignment is equivalent to attribute assignment on
            # st.session_state per the Streamlit session-state API.
            st.session_state[key] = make_default()
def main():
    """Render the Advanced File Downloader Streamlit app.

    Builds the four main tabs (Search & Download, Local File Search,
    Advanced Configuration, Help) and wires the search / clear / download
    buttons to the async ``DownloadManager``. All cross-rerun state lives
    in ``st.session_state`` (seeded by ``initialize_session_state``).
    """
    # Initialize session state
    initialize_session_state()
    # Set up UI styling
    setup_ui()
    # Create sidebar
    create_sidebar()

    # Header section
    col1, col2 = st.columns([5, 1])
    with col1:
        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
    with col2:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)

    # One-line description for whichever mode the sidebar selected.
    mode_descriptions = {
        "Standard": "A versatile tool for discovering and downloading files from any website.",
        "Education Mode": "Optimized for educational resources, exams, and academic materials.",
        "Research Mode": "Focused on research papers, datasets, and academic publications.",
        "Media Mode": "Enhanced for finding and downloading images, videos, and audio files."
    }
    st.markdown(f"<p class='info-text'>{mode_descriptions[st.session_state.mode]}</p>", unsafe_allow_html=True)

    # Main tabs
    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])

    # Tab 1: Search & Download
    with tabs[0]:
        st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)
        col1, col2 = st.columns([3, 1])
        with col1:
            url = st.text_input("Enter a URL to search for downloadable files:",
                               placeholder="e.g., https://example.com/resources",
                               value=st.session_state.get('preset_url', ''))
        with col2:
            # Initialize search_method with either session state or default value
            initial_search_method = st.session_state.get('search_method', "Deep Search")
            search_method = st.selectbox("Search Method",
                                        ["Deep Search", "Quick Search", "Exam Site Mode"],
                                        index=["Deep Search", "Quick Search", "Exam Site Mode"].index(initial_search_method))
            # Update session state when changed
            if search_method != st.session_state.get('search_method'):
                st.session_state.search_method = search_method

        # Advanced options in an expander
        with st.expander("Search Options", expanded=False):
            col1, col2, col3 = st.columns(3)
            with col1:
                depth = st.slider("Search Depth", min_value=1, max_value=5, value=2,
                                 help="Higher values will search more links but take longer")
                # NOTE(review): prioritize_pdfs is collected but not used by the
                # search invocation below — presumably intended for DownloadManager.
                prioritize_pdfs = st.checkbox("Prioritize PDFs",
                                            value=st.session_state.get('prioritize_pdfs', True),
                                            help="Focus on finding PDF files first")
            with col2:
                timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
                # NOTE(review): follow_subdomains is never read after this point.
                follow_subdomains = st.checkbox("Follow Subdomains", value=True,
                                              help="Include links from subdomains in the search")
            with col3:
                # Default extensions based on mode
                default_extensions = {
                    "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
                    "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
                    "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
                    "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov"
                }
                custom_extensions = st.text_area(
                    "Custom File Extensions",
                    value=st.session_state.get('custom_extensions', default_extensions[st.session_state.mode]),
                    help="Comma-separated list of file extensions to look for"
                )
                # Update session state when extensions changed
                if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions:
                    st.session_state.custom_extensions = custom_extensions

        search_col1, search_col2 = st.columns([4, 1])
        with search_col1:
            search_button = st.button("🔍 Start Search", use_container_width=True)
        with search_col2:
            clear_button = st.button("🧹 Clear Results", use_container_width=True)

        # File results section
        if st.session_state.files:
            # Display file results
            selected_files, displayed_files = display_file_results(st.session_state.files)

            # Download options
            if selected_files:
                col1, col2 = st.columns(2)
                with col1:
                    download_dir = st.text_input("Download Directory", value="downloads")
                with col2:
                    download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)

                download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
                with download_col1:
                    download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
                with download_col2:
                    # Disabled until the user has authenticated with Google.
                    google_drive_button = st.button("📤 Upload to Drive",
                                                  use_container_width=True,
                                                  disabled=not st.session_state.google_credentials)
                with download_col3:
                    select_all = st.button("Select All Files", use_container_width=True)

                # Handle select all button
                if select_all:
                    # display_file_results is expected to key its checkboxes as
                    # "select_{i}" — TODO confirm against app.ui.
                    for i in displayed_files:
                        st.session_state[f"select_{i}"] = True
                    st.rerun()

                # Handle download button if clicked
                if download_button:
                    # Create download directory
                    os.makedirs(download_dir, exist_ok=True)
                    handle_downloads(selected_files, download_dir, download_option, download_col1)

                # Handle Google Drive upload
                if google_drive_button:
                    handle_google_drive_upload(selected_files)

    # Tab 2: Local File Search
    with tabs[1]:
        st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
        st.write("Upload files to search through their content with AI-powered semantic search.")

        # File upload
        uploaded_files = st.file_uploader("Upload documents for search",
                                        accept_multiple_files=True,
                                        type=['pdf', 'docx', 'txt', 'csv', 'json'])

        if uploaded_files:
            # Build search index on upload
            col1, col2 = st.columns([4, 1])
            with col1:
                use_transformer = st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
                                            help="Uses advanced AI for more accurate semantic search (if available)")
            with col2:
                if st.button("Build Search Index", use_container_width=True):
                    with st.spinner("Processing files and building search index..."):
                        files_added = 0
                        for uploaded_file in uploaded_files:
                            file_info = {
                                'filename': uploaded_file.name,
                                'url': f'local://{uploaded_file.name}',
                                # NOTE(review): humanize_file_size is not imported
                                # anywhere in this module — this line raises
                                # NameError when reached. It presumably lives in
                                # app.utils; add it to the imports — TODO confirm.
                                'size': humanize_file_size(uploaded_file.size)
                            }
                            success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info)
                            if success:
                                files_added += 1

                        if files_added > 0:
                            index_built = st.session_state.rag_search.build_index()
                            if index_built:
                                st.success(f"✅ Successfully indexed {files_added} files!")
                            else:
                                st.error("Failed to build search index.")
                        else:
                            st.warning("No valid text could be extracted from the files.")

        # Search interface
        st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)
        col1, col2 = st.columns([4, 1])
        with col1:
            query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
        with col2:
            # NOTE(review): expand_query is collected but not passed to search().
            expand_query = st.checkbox("Auto-expand query", value=True,
                                      help="Automatically add related terms to your search")

        col1, col2 = st.columns([4, 1])
        with col1:
            if st.button("🔍 Search Documents", use_container_width=True):
                if not query:
                    st.warning("Please enter a search query")
                else:
                    with st.spinner("Searching..."):
                        # NOTE(review): top_k is hard-coded to 5; the
                        # "Max results" widget below (num_results) is ignored.
                        results = st.session_state.rag_search.search(query, top_k=5, search_chunks=True)

                        if results:
                            st.markdown(f"**Found {len(results)} relevant documents:**")
                            for i, result in enumerate(results):
                                with st.container():
                                    st.markdown(f"<div class='result-card'>", unsafe_allow_html=True)
                                    st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")
                                    if result.get('chunk_preview'):
                                        st.markdown("**Matching content:**")
                                        st.text(result['chunk_preview'])
                                    st.markdown("</div>", unsafe_allow_html=True)
                        else:
                            st.info("No matching documents found. Try a different query.")
        with col2:
            num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)

        # Quick search tips
        with st.expander("Search Tips", expanded=False):
            st.markdown("""
            ### Effective Search Tips

            - **Be specific** with your queries for more accurate results
            - **Try different phrasings** if you don't get the results you expect
            - Use **quotation marks** for exact phrase matching
            - For **complex topics**, break down your search into multiple queries
            - **Combine related terms** to improve recall

            The search engine uses advanced algorithms to understand the semantic meaning of your query,
            not just keyword matching.
            """)

    # Tab 3: Advanced Configuration
    with tabs[2]:
        st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)

        config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])

        # Browser Settings tab
        with config_tabs[0]:
            col1, col2 = st.columns(2)
            with col1:
                use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
                                        help="Makes browser harder to detect as automated, but may be slower")
                handle_captchas = st.checkbox("Handle Captchas Automatically", value=False,
                                            help="Attempt to solve simple captchas automatically")
                download_timeout = st.slider("Download Timeout (seconds)",
                                           min_value=30, max_value=600, value=300,
                                           help="Maximum time to wait for downloads to complete")
            with col2:
                user_agent = st.selectbox("User Agent", USER_AGENTS, index=0,
                                        help="Browser identity to use when accessing websites")
                save_screenshots = st.checkbox("Save Browser Screenshots", value=False,
                                             help="Save screenshots when errors occur for debugging")
                browser_lang = st.selectbox("Browser Language",
                                          ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
                                          index=0)

            # NOTE(review): only stealth_mode is persisted here; the other
            # browser widgets above are read but never stored or applied.
            if st.button("Update Browser Settings"):
                st.session_state.stealth_mode = use_stealth
                st.success("Browser settings updated!")

            # Dependency installation section
            st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
            if st.button("Install Playwright Dependencies"):
                from app.ui import install_playwright_dependencies
                with st.spinner("Installing dependencies..."):
                    install_playwright_dependencies()

        # Proxy Configuration tab
        with config_tabs[1]:
            proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
                                      help="Route requests through a proxy server for anonymity or bypassing restrictions")

            if proxy_enabled:
                proxy_col1, proxy_col2 = st.columns(2)
                with proxy_col1:
                    proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
                    proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
                with proxy_col2:
                    proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
                    proxy_auth = st.text_input("Proxy Authentication (optional)",
                                             placeholder="username:password", type="password")

            st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
            use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
                                           help="Automatically rotate between multiple proxies for better anonymity")

            if use_proxy_rotation:
                proxy_list = st.text_area("Proxy List (one per line)",
                                        placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
                rotation_interval = st.slider("Rotation Interval (requests)",
                                            min_value=1, max_value=50, value=10,
                                            help="How often to switch proxies")

            if st.button("Save Proxy Configuration"):
                # Construct the proxy string
                # Short-circuiting on proxy_enabled keeps this safe when the
                # host/port widgets (defined only when enabled) don't exist.
                proxy_string = None
                if proxy_enabled and proxy_host and proxy_port:
                    proxy_prefix = f"{proxy_type.lower()}://"
                    proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
                    proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"

                # Update session state
                st.session_state.use_proxy = proxy_enabled
                st.session_state.proxy_string = proxy_string

                # Configure proxy rotation if enabled
                from app.utils import PROXY_ROTATION_CONFIG
                if use_proxy_rotation and proxy_list:
                    PROXY_ROTATION_CONFIG["enabled"] = True
                    PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
                    PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]

                st.success("Proxy configuration updated!")

        # Download Options tab
        with config_tabs[2]:
            col1, col2 = st.columns(2)
            with col1:
                st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)
                skip_existing = st.checkbox("Skip Existing Files", value=True,
                                          help="Don't download files that already exist locally")
                auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
                                        help="Automatically rename files instead of overwriting")
                verify_downloads = st.checkbox("Verify Downloads", value=True,
                                             help="Check file integrity after download")
                max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
                                      help="Number of times to retry failed downloads")
            with col2:
                st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)
                auto_organize = st.checkbox("Auto-Organize Files", value=True,
                                          help="Automatically organize files by type")
                default_dir = st.text_input("Default Download Directory", value="downloads",
                                          help="Default location to save downloaded files")
                org_by_domain = st.checkbox("Organize by Domain", value=False,
                                          help="Create subdirectories based on source domains")
                org_by_type = st.checkbox("Organize by File Type", value=False,
                                        help="Create subdirectories based on file types")

            if st.button("Save Download Settings"):
                # Persisted for other components to read; not consumed in main().
                st.session_state.download_settings = {
                    "skip_existing": skip_existing,
                    "auto_rename": auto_rename,
                    "verify_downloads": verify_downloads,
                    "max_retries": max_retries,
                    "auto_organize": auto_organize,
                    "default_dir": default_dir,
                    "org_by_domain": org_by_domain,
                    "org_by_type": org_by_type
                }
                st.success("Download settings saved!")

        # System tab
        with config_tabs[3]:
            col1, col2 = st.columns(2)
            with col1:
                st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)
                max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
                                         help="Maximum number of simultaneous downloads")
                memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
                                       help="Maximum memory to use for file processing")
                processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
                                             help="Number of threads to use for file processing")
            with col2:
                st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)
                log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
                                       help="Detail level for application logs")
                save_debug_info = st.checkbox("Save Debug Information", value=False,
                                            help="Save detailed information about program execution")
                log_dir = st.text_input("Log Directory", value="logs",
                                      help="Directory to save log files")

            if st.button("Apply System Settings"):
                import logging
                st.session_state.system_settings = {
                    "max_concurrent": max_concurrent,
                    "memory_limit": memory_limit,
                    "processing_threads": processing_threads,
                    "log_level": log_level,
                    "save_debug_info": save_debug_info,
                    "log_dir": log_dir
                }

                # Update logging configuration
                log_level_num = getattr(logging, log_level)
                logging.getLogger().setLevel(log_level_num)

                st.success("System settings applied!")

            # Reset application button
            st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
            reset_col1, reset_col2 = st.columns([1, 3])
            with reset_col1:
                if st.button("Reset Application", use_container_width=True):
                    for key in list(st.session_state.keys()):
                        if key != 'google_credentials':  # Preserve Google auth
                            del st.session_state[key]
                    st.success("Application has been reset!")
                    st.rerun()
            with reset_col2:
                st.info("This will clear all search results, downloaded files, and reset settings to defaults.")

    # Tab 4: Help
    with tabs[3]:
        st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)

        help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])

        with help_tabs[0]:
            st.markdown("""
            ### Getting Started

            1. **Enter a URL** on the Search & Download tab
            2. Select a **Search Method**:
               - **Deep Search**: Thorough but slower
               - **Quick Search**: Fast but may miss some files
               - **Exam Site Mode**: Optimized for educational resource sites
            3. Click **Start Search** to find downloadable files
            4. Select files you want to download
            5. Click **Download Selected Files**

            #### Using Different Modes

            Select a mode from the sidebar to optimize the tool for different use cases:

            - **Standard Mode**: Balanced for general use
            - **Education Mode**: Optimized for finding academic materials
            - **Research Mode**: Better for research papers and datasets
            - **Media Mode**: Enhanced for finding images, videos, and audio

            For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
            """)

        with help_tabs[1]:
            st.markdown("""
            ### Advanced Features

            - **Local File Search**: Upload files and search through their content using the enhanced RAG search
            - **Custom Extensions**: Specify additional file types to look for beyond the default set
            - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
            - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
            - **Google Drive Integration**: Upload downloaded files directly to your Google Drive

            #### Search Tips

            - For educational sites, include specific terms like "exam", "test", "paper" in the URL
            - When using Local File Search, try different variations of your query for better results
            - Use filtering and sorting options to find the most relevant files quickly

            #### File Organization

            You can configure automatic file organization in the Advanced Configuration tab:

            - **Organize by Domain**: Creates folders based on the source website
            - **Organize by File Type**: Separates files into folders by their extension
            - **Auto-Rename**: Prevents overwriting existing files with same names
            """)

        with help_tabs[2]:
            st.markdown("""
            ### Troubleshooting

            #### Common Issues

            - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
            - **Downloads failing**: Check if the site requires authentication or uses captchas
            - **Slow performance**: Reduce search depth or disable stealth mode for faster results
            - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings

            #### Captcha Issues

            Some websites use captchas to prevent automated access. If you encounter captchas:

            1. Try using a different proxy
            2. Enable "Handle Captchas Automatically" for simple captchas
            3. For complex captchas, you may need to manually access the site first

            #### Proxy Problems

            If you're having issues with proxies:

            1. Verify your proxy is working with an external tool
            2. Check that you've entered the correct format (http://host:port)
            3. Some websites may block known proxy IPs

            #### Memory Usage

            If the application is using too much memory:

            1. Reduce the "Memory Limit" in System settings
            2. Process fewer files at once
            3. Use lower search depth values
            """)

        with help_tabs[3]:
            st.markdown("""
            ### About This Tool

            **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.

            #### Key Features

            - **Smart Discovery**: Finds downloadable files even when they're not directly linked
            - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
            - **Educational Focus**: Specialized detection for exam papers and academic resources
            - **Stealth Capabilities**: Avoids detection by anti-scraping measures

            #### Technical Details

            This tool uses:

            - **Playwright**: For browser automation and stealth capabilities
            - **Sentence Transformers**: For AI-powered semantic search
            - **Streamlit**: For the user interface
            - **Google Drive API**: For cloud integration

            #### Credits

            Created with Python, Streamlit, Playwright, and various AI libraries.

            For issues or suggestions, please contact the developer.

            Version 2.0 - March 2025
            """)

    # Handle search button
    # (search_button / url / custom_extensions etc. remain in scope here
    # because Streamlit `with` blocks do not create a new Python scope.)
    if search_button and url:
        # Reset files and downloaded paths
        st.session_state.files = []
        st.session_state.downloaded_paths = []
        st.session_state.download_complete = False

        # Clear the preset URL if it was used
        if 'preset_url' in st.session_state:
            st.session_state.preset_url = ''

        # Prepare custom extensions
        custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()]

        # Configure search parameters based on method
        sublink_limit = 5000 if search_method == "Deep Search" else 1000
        search_depth = depth if search_method == "Deep Search" else 1
        is_exam_site = search_method == "Exam Site Mode"

        # Execute the search asynchronously
        async def run_search():
            async with DownloadManager(
                use_proxy=st.session_state.use_proxy,
                proxy=st.session_state.proxy_string,
                use_stealth=st.session_state.stealth_mode
            ) as manager:
                # For exam sites, use specialized approach
                if is_exam_site:
                    st.session_state.keep_progress = True
                    edu_links = await manager.get_edu_exam_links(url)
                    all_files = []

                    progress_text = st.empty()
                    progress_bar = st.progress(0)

                    # Process each exam link
                    for i, link in enumerate(edu_links):
                        progress = (i+1) / max(1, len(edu_links))
                        progress_text.text(f"Processing exam link {i+1}/{len(edu_links)}: {link}")
                        progress_bar.progress(progress)

                        files = await manager.extract_downloadable_files(link, custom_ext_list)
                        all_files.extend(files)

                    st.session_state.files = all_files
                    progress_text.empty()
                    progress_bar.empty()
                    st.session_state.keep_progress = False

                else:
                    # Use general search method
                    # NOTE(review): search_depth is computed above but deep_search
                    # is called with sublink_limit and timeout only — confirm the
                    # depth parameter is intentionally unused here.
                    files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout)
                    st.session_state.files = files

        # Run the search
        asyncio.run(run_search())
        st.rerun()

    # Handle clear button
    if clear_button:
        st.session_state.files = []
        st.session_state.downloaded_paths = []
        st.session_state.download_complete = False
        if 'preset_url' in st.session_state:
            st.session_state.preset_url = ''
        st.rerun()
# Entry point: launch the app when this file is executed directly
# (e.g. `streamlit run main.py`); importing the module has no side effects.
if __name__ == "__main__":
    main()