Delete main.py
Browse files
main.py
DELETED
@@ -1,633 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import asyncio
|
3 |
-
import streamlit as st
|
4 |
-
from ui import (
|
5 |
-
setup_ui, create_sidebar, display_file_results,
|
6 |
-
handle_downloads, handle_google_drive_upload
|
7 |
-
)
|
8 |
-
from app.download_manager import DownloadManager
|
9 |
-
from app.rag_search import EnhancedRAGSearch
|
10 |
-
from app.utils import USER_AGENTS
|
11 |
-
|
12 |
-
def initialize_session_state():
    """Seed ``st.session_state`` with a default for every key not yet present.

    Keys that already exist are left untouched, so calling this on every
    script run only fills in what is missing.
    """
    # Factories rather than values: each missing key gets a fresh object, and
    # EnhancedRAGSearch is only instantiated when 'rag_search' is absent.
    default_factories = {
        'files': list,
        'downloaded_paths': list,
        'download_complete': lambda: False,
        'selected_tab': lambda: 0,
        'rag_search': lambda: EnhancedRAGSearch(),
        'keep_progress': lambda: False,
        'google_credentials': lambda: None,
        'mode': lambda: "Standard",
        'use_proxy': lambda: False,
        'proxy_string': lambda: None,
        'stealth_mode': lambda: True,
    }

    for state_key, make_default in default_factories.items():
        if state_key in st.session_state:
            continue
        setattr(st.session_state, state_key, make_default())
|
36 |
-
|
37 |
-
def _humanize_file_size(size_bytes):
    """Format a byte count as a short human-readable string (e.g. ``1.5 MB``)."""
    if size_bytes is None or size_bytes < 0:
        return "Unknown"
    size = float(size_bytes)
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            return f"{int(size)} B" if unit == "B" else f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"


def main():
    """Render the whole Streamlit page and react to its buttons.

    The page consists of a header, and four tabs: "Search & Download",
    "Local File Search", "Advanced Configuration" and "Help". After the tabs
    are rendered, the search and clear buttons from the first tab are handled:
    a search runs ``DownloadManager`` through ``asyncio.run`` and stores the
    found files in ``st.session_state.files`` before triggering a rerun.
    """
    # Initialize session state
    initialize_session_state()

    # Set up UI styling
    setup_ui()

    # Create sidebar
    create_sidebar()

    current_mode = st.session_state.mode

    # Header section
    col1, col2 = st.columns([5, 1])
    with col1:
        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
    with col2:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)

    mode_descriptions = {
        "Standard": "A versatile tool for discovering and downloading files from any website.",
        "Education Mode": "Optimized for educational resources, exams, and academic materials.",
        "Research Mode": "Focused on research papers, datasets, and academic publications.",
        "Media Mode": "Enhanced for finding and downloading images, videos, and audio files."
    }

    # Fall back to the Standard text instead of raising KeyError on an unknown mode.
    mode_description = mode_descriptions.get(current_mode, mode_descriptions["Standard"])
    st.markdown(f"<p class='info-text'>{mode_description}</p>", unsafe_allow_html=True)

    # Main tabs
    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])

    # Tab 1: Search & Download
    with tabs[0]:
        st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)

        col1, col2 = st.columns([3, 1])
        with col1:
            url = st.text_input("Enter a URL to search for downloadable files:",
                                placeholder="e.g., https://example.com/resources",
                                value=st.session_state.get('preset_url', ''))
        with col2:
            # Initialize search_method with either session state or default value
            search_methods = ["Deep Search", "Quick Search", "Exam Site Mode"]
            initial_search_method = st.session_state.get('search_method', "Deep Search")
            # An unrecognised stored value falls back to the first option
            # rather than raising ValueError from list.index().
            if initial_search_method in search_methods:
                initial_index = search_methods.index(initial_search_method)
            else:
                initial_index = 0
            search_method = st.selectbox("Search Method", search_methods, index=initial_index)
            # Update session state when changed
            if search_method != st.session_state.get('search_method'):
                st.session_state.search_method = search_method

        # Advanced options in an expander
        with st.expander("Search Options", expanded=False):
            col1, col2, col3 = st.columns(3)
            with col1:
                # NOTE(review): this slider's value is not forwarded to the
                # search below; deep_search() is called with the same four
                # arguments as before because its signature is not visible here.
                depth = st.slider("Search Depth", min_value=1, max_value=5, value=2,
                                  help="Higher values will search more links but take longer")
                prioritize_pdfs = st.checkbox("Prioritize PDFs",
                                              value=st.session_state.get('prioritize_pdfs', True),
                                              help="Focus on finding PDF files first")
            with col2:
                timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
                follow_subdomains = st.checkbox("Follow Subdomains", value=True,
                                                help="Include links from subdomains in the search")
            with col3:
                # Default extensions based on mode
                default_extensions = {
                    "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
                    "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
                    "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
                    "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov"
                }
                mode_extensions = default_extensions.get(current_mode, default_extensions["Standard"])

                custom_extensions = st.text_area(
                    "Custom File Extensions",
                    value=st.session_state.get('custom_extensions', mode_extensions),
                    help="Comma-separated list of file extensions to look for"
                )

                # Update session state when extensions changed
                if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions:
                    st.session_state.custom_extensions = custom_extensions

        search_col1, search_col2 = st.columns([4, 1])
        with search_col1:
            search_button = st.button("🔍 Start Search", use_container_width=True)
        with search_col2:
            clear_button = st.button("🧹 Clear Results", use_container_width=True)

        # File results section
        if st.session_state.files:
            # Display file results
            selected_files, displayed_files = display_file_results(st.session_state.files)

            # Download options
            if selected_files:
                col1, col2 = st.columns(2)
                with col1:
                    download_dir = st.text_input("Download Directory", value="downloads")
                with col2:
                    download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)

                download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
                with download_col1:
                    download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
                with download_col2:
                    google_drive_button = st.button("📤 Upload to Drive",
                                                    use_container_width=True,
                                                    disabled=not st.session_state.google_credentials)
                with download_col3:
                    select_all = st.button("Select All Files", use_container_width=True)

                # Handle select all button
                if select_all:
                    for i in displayed_files:
                        st.session_state[f"select_{i}"] = True
                    st.rerun()

                # Handle download button if clicked
                if download_button:
                    # Create download directory
                    os.makedirs(download_dir, exist_ok=True)
                    handle_downloads(selected_files, download_dir, download_option, download_col1)

                # Handle Google Drive upload
                if google_drive_button:
                    handle_google_drive_upload(selected_files)

    # Tab 2: Local File Search
    with tabs[1]:
        st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
        st.write("Upload files to search through their content with AI-powered semantic search.")

        # File upload
        uploaded_files = st.file_uploader("Upload documents for search",
                                          accept_multiple_files=True,
                                          type=['pdf', 'docx', 'txt', 'csv', 'json'])

        # NOTE(review): the source this was recovered from had lost its
        # indentation; the search interface is assumed to sit inside this
        # `if uploaded_files:` block — confirm against the intended layout.
        if uploaded_files:
            # Build search index on upload
            col1, col2 = st.columns([4, 1])
            with col1:
                use_transformer = st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
                                              help="Uses advanced AI for more accurate semantic search (if available)")
            with col2:
                if st.button("Build Search Index", use_container_width=True):
                    with st.spinner("Processing files and building search index..."):
                        files_added = 0
                        for uploaded_file in uploaded_files:
                            file_info = {
                                'filename': uploaded_file.name,
                                'url': f'local://{uploaded_file.name}',
                                # Local helper: no humanize_file_size is
                                # imported or defined in this module.
                                'size': _humanize_file_size(uploaded_file.size)
                            }
                            success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info)
                            if success:
                                files_added += 1

                        if files_added > 0:
                            index_built = st.session_state.rag_search.build_index()
                            if index_built:
                                st.success(f"✅ Successfully indexed {files_added} files!")
                            else:
                                st.error("Failed to build search index.")
                        else:
                            st.warning("No valid text could be extracted from the files.")

            # Search interface
            st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)

            col1, col2 = st.columns([4, 1])
            with col1:
                query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
            with col2:
                expand_query = st.checkbox("Auto-expand query", value=True,
                                           help="Automatically add related terms to your search")

            col1, col2 = st.columns([4, 1])
            # The result limit is read before the search runs so the chosen
            # value is actually applied; it still renders in the right column.
            with col2:
                num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
            with col1:
                if st.button("🔍 Search Documents", use_container_width=True):
                    if not query:
                        st.warning("Please enter a search query")
                    else:
                        with st.spinner("Searching..."):
                            results = st.session_state.rag_search.search(query, top_k=int(num_results), search_chunks=True)

                        if results:
                            st.markdown(f"**Found {len(results)} relevant documents:**")
                            for i, result in enumerate(results):
                                with st.container():
                                    st.markdown("<div class='result-card'>", unsafe_allow_html=True)
                                    st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")

                                    if result.get('chunk_preview'):
                                        st.markdown("**Matching content:**")
                                        st.text(result['chunk_preview'])

                                    st.markdown("</div>", unsafe_allow_html=True)
                        else:
                            st.info("No matching documents found. Try a different query.")

        # Quick search tips
        with st.expander("Search Tips", expanded=False):
            st.markdown("""
            ### Effective Search Tips

            - **Be specific** with your queries for more accurate results
            - **Try different phrasings** if you don't get the results you expect
            - Use **quotation marks** for exact phrase matching
            - For **complex topics**, break down your search into multiple queries
            - **Combine related terms** to improve recall

            The search engine uses advanced algorithms to understand the semantic meaning of your query,
            not just keyword matching.
            """)

    # Tab 3: Advanced Configuration
    with tabs[2]:
        st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)

        config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])

        # Browser Settings tab
        with config_tabs[0]:
            col1, col2 = st.columns(2)
            with col1:
                use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
                                          help="Makes browser harder to detect as automated, but may be slower")

                handle_captchas = st.checkbox("Handle Captchas Automatically", value=False,
                                              help="Attempt to solve simple captchas automatically")

                download_timeout = st.slider("Download Timeout (seconds)",
                                             min_value=30, max_value=600, value=300,
                                             help="Maximum time to wait for downloads to complete")
            with col2:
                user_agent = st.selectbox("User Agent", USER_AGENTS, index=0,
                                          help="Browser identity to use when accessing websites")

                save_screenshots = st.checkbox("Save Browser Screenshots", value=False,
                                               help="Save screenshots when errors occur for debugging")

                browser_lang = st.selectbox("Browser Language",
                                            ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
                                            index=0)

            # Only the stealth flag is persisted; the other widgets above are
            # rendered but their values are not stored anywhere in this function.
            if st.button("Update Browser Settings"):
                st.session_state.stealth_mode = use_stealth
                st.success("Browser settings updated!")

            # Dependency installation section
            st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
            if st.button("Install Playwright Dependencies"):
                # NOTE(review): the module-level UI imports come from `ui`,
                # while this one uses `app.ui` — confirm which path is correct.
                from app.ui import install_playwright_dependencies
                with st.spinner("Installing dependencies..."):
                    install_playwright_dependencies()

        # Proxy Configuration tab
        with config_tabs[1]:
            proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
                                        help="Route requests through a proxy server for anonymity or bypassing restrictions")

            # Defaults keep the save handler below from touching unbound names
            # when the corresponding inputs are not rendered.
            proxy_type, proxy_host, proxy_port, proxy_auth = "HTTP", "", "", ""
            if proxy_enabled:
                proxy_col1, proxy_col2 = st.columns(2)
                with proxy_col1:
                    proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
                    proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
                with proxy_col2:
                    proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
                    proxy_auth = st.text_input("Proxy Authentication (optional)",
                                               placeholder="username:password", type="password")

            st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
            use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
                                             help="Automatically rotate between multiple proxies for better anonymity")

            proxy_list, rotation_interval = "", 10
            if use_proxy_rotation:
                proxy_list = st.text_area("Proxy List (one per line)",
                                          placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
                rotation_interval = st.slider("Rotation Interval (requests)",
                                              min_value=1, max_value=50, value=10,
                                              help="How often to switch proxies")

            if st.button("Save Proxy Configuration"):
                # Construct the proxy string
                proxy_string = None
                if proxy_enabled and proxy_host and proxy_port:
                    proxy_prefix = f"{proxy_type.lower()}://"
                    proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
                    proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"

                # Update session state
                st.session_state.use_proxy = proxy_enabled
                st.session_state.proxy_string = proxy_string

                # Configure proxy rotation if enabled
                from app.utils import PROXY_ROTATION_CONFIG
                if use_proxy_rotation and proxy_list:
                    PROXY_ROTATION_CONFIG["enabled"] = True
                    PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
                    PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]

                st.success("Proxy configuration updated!")

        # Download Options tab
        with config_tabs[2]:
            col1, col2 = st.columns(2)
            with col1:
                st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)

                skip_existing = st.checkbox("Skip Existing Files", value=True,
                                            help="Don't download files that already exist locally")

                auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
                                          help="Automatically rename files instead of overwriting")

                verify_downloads = st.checkbox("Verify Downloads", value=True,
                                               help="Check file integrity after download")

                max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
                                        help="Number of times to retry failed downloads")

            with col2:
                st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)

                auto_organize = st.checkbox("Auto-Organize Files", value=True,
                                            help="Automatically organize files by type")

                default_dir = st.text_input("Default Download Directory", value="downloads",
                                            help="Default location to save downloaded files")

                org_by_domain = st.checkbox("Organize by Domain", value=False,
                                            help="Create subdirectories based on source domains")

                org_by_type = st.checkbox("Organize by File Type", value=False,
                                          help="Create subdirectories based on file types")

            if st.button("Save Download Settings"):
                st.session_state.download_settings = {
                    "skip_existing": skip_existing,
                    "auto_rename": auto_rename,
                    "verify_downloads": verify_downloads,
                    "max_retries": max_retries,
                    "auto_organize": auto_organize,
                    "default_dir": default_dir,
                    "org_by_domain": org_by_domain,
                    "org_by_type": org_by_type
                }
                st.success("Download settings saved!")

        # System tab
        with config_tabs[3]:
            col1, col2 = st.columns(2)
            with col1:
                st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)

                max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
                                           help="Maximum number of simultaneous downloads")

                memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
                                         help="Maximum memory to use for file processing")

                processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
                                               help="Number of threads to use for file processing")

            with col2:
                st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)

                log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
                                         help="Detail level for application logs")

                save_debug_info = st.checkbox("Save Debug Information", value=False,
                                              help="Save detailed information about program execution")

                log_dir = st.text_input("Log Directory", value="logs",
                                        help="Directory to save log files")

            if st.button("Apply System Settings"):
                import logging
                st.session_state.system_settings = {
                    "max_concurrent": max_concurrent,
                    "memory_limit": memory_limit,
                    "processing_threads": processing_threads,
                    "log_level": log_level,
                    "save_debug_info": save_debug_info,
                    "log_dir": log_dir
                }
                # Update logging configuration
                log_level_num = getattr(logging, log_level)
                logging.getLogger().setLevel(log_level_num)
                st.success("System settings applied!")

            # Reset application button
            st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
            reset_col1, reset_col2 = st.columns([1, 3])
            with reset_col1:
                if st.button("Reset Application", use_container_width=True):
                    for key in list(st.session_state.keys()):
                        if key != 'google_credentials':  # Preserve Google auth
                            del st.session_state[key]
                    st.success("Application has been reset!")
                    st.rerun()
            with reset_col2:
                st.info("This will clear all search results, downloaded files, and reset settings to defaults.")

    # Tab 4: Help
    with tabs[3]:
        st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)

        help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])

        with help_tabs[0]:
            st.markdown("""
            ### Getting Started

            1. **Enter a URL** on the Search & Download tab
            2. Select a **Search Method**:
               - **Deep Search**: Thorough but slower
               - **Quick Search**: Fast but may miss some files
               - **Exam Site Mode**: Optimized for educational resource sites
            3. Click **Start Search** to find downloadable files
            4. Select files you want to download
            5. Click **Download Selected Files**

            #### Using Different Modes

            Select a mode from the sidebar to optimize the tool for different use cases:

            - **Standard Mode**: Balanced for general use
            - **Education Mode**: Optimized for finding academic materials
            - **Research Mode**: Better for research papers and datasets
            - **Media Mode**: Enhanced for finding images, videos, and audio

            For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
            """)

        with help_tabs[1]:
            st.markdown("""
            ### Advanced Features

            - **Local File Search**: Upload files and search through their content using the enhanced RAG search
            - **Custom Extensions**: Specify additional file types to look for beyond the default set
            - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
            - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
            - **Google Drive Integration**: Upload downloaded files directly to your Google Drive

            #### Search Tips

            - For educational sites, include specific terms like "exam", "test", "paper" in the URL
            - When using Local File Search, try different variations of your query for better results
            - Use filtering and sorting options to find the most relevant files quickly

            #### File Organization

            You can configure automatic file organization in the Advanced Configuration tab:

            - **Organize by Domain**: Creates folders based on the source website
            - **Organize by File Type**: Separates files into folders by their extension
            - **Auto-Rename**: Prevents overwriting existing files with same names
            """)

        with help_tabs[2]:
            st.markdown("""
            ### Troubleshooting

            #### Common Issues

            - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
            - **Downloads failing**: Check if the site requires authentication or uses captchas
            - **Slow performance**: Reduce search depth or disable stealth mode for faster results
            - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings

            #### Captcha Issues

            Some websites use captchas to prevent automated access. If you encounter captchas:

            1. Try using a different proxy
            2. Enable "Handle Captchas Automatically" for simple captchas
            3. For complex captchas, you may need to manually access the site first

            #### Proxy Problems

            If you're having issues with proxies:

            1. Verify your proxy is working with an external tool
            2. Check that you've entered the correct format (http://host:port)
            3. Some websites may block known proxy IPs

            #### Memory Usage

            If the application is using too much memory:

            1. Reduce the "Memory Limit" in System settings
            2. Process fewer files at once
            3. Use lower search depth values
            """)

        with help_tabs[3]:
            st.markdown("""
            ### About This Tool

            **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.

            #### Key Features

            - **Smart Discovery**: Finds downloadable files even when they're not directly linked
            - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
            - **Educational Focus**: Specialized detection for exam papers and academic resources
            - **Stealth Capabilities**: Avoids detection by anti-scraping measures

            #### Technical Details

            This tool uses:

            - **Playwright**: For browser automation and stealth capabilities
            - **Sentence Transformers**: For AI-powered semantic search
            - **Streamlit**: For the user interface
            - **Google Drive API**: For cloud integration

            #### Credits

            Created with Python, Streamlit, Playwright, and various AI libraries.

            For issues or suggestions, please contact the developer.

            Version 2.0 - March 2025
            """)

    # Handle search button
    if search_button and url:
        # Reset files and downloaded paths
        st.session_state.files = []
        st.session_state.downloaded_paths = []
        st.session_state.download_complete = False

        # Clear the preset URL if it was used
        if 'preset_url' in st.session_state:
            st.session_state.preset_url = ''

        # Prepare custom extensions
        custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()]

        # Configure search parameters based on method
        sublink_limit = 5000 if search_method == "Deep Search" else 1000
        is_exam_site = search_method == "Exam Site Mode"

        # Execute the search asynchronously
        async def run_search():
            async with DownloadManager(
                use_proxy=st.session_state.use_proxy,
                proxy=st.session_state.proxy_string,
                use_stealth=st.session_state.stealth_mode
            ) as manager:
                # For exam sites, use specialized approach
                if is_exam_site:
                    st.session_state.keep_progress = True
                    try:
                        edu_links = await manager.get_edu_exam_links(url)
                        all_files = []

                        progress_text = st.empty()
                        progress_bar = st.progress(0)
                        try:
                            # Process each exam link
                            total_links = len(edu_links)
                            for position, link in enumerate(edu_links, start=1):
                                progress = position / max(1, total_links)
                                progress_text.text(f"Processing exam link {position}/{total_links}: {link}")
                                progress_bar.progress(progress)

                                files = await manager.extract_downloadable_files(link, custom_ext_list)
                                all_files.extend(files)
                        finally:
                            # Remove the progress widgets even if a link fails.
                            progress_text.empty()
                            progress_bar.empty()

                        st.session_state.files = all_files
                    finally:
                        # Never leave the flag stuck on True after an error.
                        st.session_state.keep_progress = False
                else:
                    # Use general search method
                    files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout)
                    st.session_state.files = files

        # Run the search
        asyncio.run(run_search())
        st.rerun()
    elif search_button:
        # Clicking search without a URL used to do nothing silently.
        st.warning("Please enter a URL to search.")

    # Handle clear button
    if clear_button:
        st.session_state.files = []
        st.session_state.downloaded_paths = []
        st.session_state.download_complete = False
        if 'preset_url' in st.session_state:
            st.session_state.preset_url = ''
        st.rerun()
|
630 |
-
|
631 |
-
# Entry point
|
632 |
-
# Build the page only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|