euler314 committed on
Commit
6158c43
·
verified ·
1 Parent(s): d4e3bdc

Create app/main.py

Browse files
Files changed (1) hide show
  1. app/main.py +633 -0
app/main.py ADDED
@@ -0,0 +1,633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import streamlit as st
4
+ from app.ui import (
5
+ setup_ui, create_sidebar, display_file_results,
6
+ handle_downloads, handle_google_drive_upload
7
+ )
8
+ from app.download_manager import DownloadManager
9
+ from app.rag_search import EnhancedRAGSearch
10
+ from app.utils import USER_AGENTS
11
+
12
def initialize_session_state():
    """Populate ``st.session_state`` with a default for each key that is missing.

    Keys already present in the session state are left untouched.
    """
    # (key, zero-argument factory) pairs. A factory is only called when its
    # key is absent, so the EnhancedRAGSearch instance is created lazily.
    default_factories = (
        ('files', list),
        ('downloaded_paths', list),
        ('download_complete', lambda: False),
        ('selected_tab', lambda: 0),
        ('rag_search', EnhancedRAGSearch),
        ('keep_progress', lambda: False),
        ('google_credentials', lambda: None),
        ('mode', lambda: "Standard"),
        ('use_proxy', lambda: False),
        ('proxy_string', lambda: None),
        ('stealth_mode', lambda: True),
    )
    for state_key, make_default in default_factories:
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
36
+
37
def _humanize_file_size(num_bytes):
    """Return *num_bytes* as a short human-readable string such as ``"1.5 MB"``."""
    size = float(num_bytes or 0)
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            return f"{size:.0f} {unit}" if unit == "B" else f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"


def _render_header():
    """Draw the page title, the icon and the description of the active mode."""
    title_col, icon_col = st.columns([5, 1])
    with title_col:
        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
    with icon_col:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)

    mode_descriptions = {
        "Standard": "A versatile tool for discovering and downloading files from any website.",
        "Education Mode": "Optimized for educational resources, exams, and academic materials.",
        "Research Mode": "Focused on research papers, datasets, and academic publications.",
        "Media Mode": "Enhanced for finding and downloading images, videos, and audio files."
    }
    # Fall back to the Standard text instead of raising KeyError on an unknown mode.
    description = mode_descriptions.get(st.session_state.mode, mode_descriptions["Standard"])
    st.markdown(f"<p class='info-text'>{description}</p>", unsafe_allow_html=True)


def _render_file_results():
    """Draw the result list plus the download / upload controls for the selection."""
    selected_files, displayed_files = display_file_results(st.session_state.files)

    # The download controls are only meaningful once something is selected.
    if not selected_files:
        return

    col1, col2 = st.columns(2)
    with col1:
        download_dir = st.text_input("Download Directory", value="downloads")
    with col2:
        download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)

    download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
    with download_col1:
        download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
    with download_col2:
        google_drive_button = st.button("📤 Upload to Drive",
                                        use_container_width=True,
                                        disabled=not st.session_state.google_credentials)
    with download_col3:
        select_all = st.button("Select All Files", use_container_width=True)

    if select_all:
        # NOTE(review): presumably display_file_results keys its checkboxes as
        # "select_<i>". Streamlit rejects assignments to a widget-backed key
        # after that widget was created in the same run - confirm this works.
        for i in displayed_files:
            st.session_state[f"select_{i}"] = True
        st.rerun()

    if download_button:
        os.makedirs(download_dir, exist_ok=True)
        handle_downloads(selected_files, download_dir, download_option, download_col1)

    if google_drive_button:
        handle_google_drive_upload(selected_files)


def _render_search_tab():
    """Draw the "Search & Download" tab.

    Returns:
        dict: the values the search / clear handlers need, under the keys
        ``url``, ``search_method``, ``timeout``, ``custom_extensions``,
        ``search_clicked`` and ``clear_clicked``.
    """
    st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)

    col1, col2 = st.columns([3, 1])
    with col1:
        url = st.text_input("Enter a URL to search for downloadable files:",
                            placeholder="e.g., https://example.com/resources",
                            value=st.session_state.get('preset_url', ''))
    with col2:
        search_methods = ["Deep Search", "Quick Search", "Exam Site Mode"]
        initial_search_method = st.session_state.get('search_method', "Deep Search")
        # A stale or unknown stored value would make .index() raise ValueError.
        if initial_search_method not in search_methods:
            initial_search_method = "Deep Search"
        search_method = st.selectbox("Search Method",
                                     search_methods,
                                     index=search_methods.index(initial_search_method))
        if search_method != st.session_state.get('search_method'):
            st.session_state.search_method = search_method

    with st.expander("Search Options", expanded=False):
        col1, col2, col3 = st.columns(3)
        with col1:
            # NOTE(review): depth and the two checkboxes below are displayed
            # but never forwarded to the search call - wire them up once the
            # DownloadManager parameters are confirmed.
            st.slider("Search Depth", min_value=1, max_value=5, value=2,
                      help="Higher values will search more links but take longer")
            st.checkbox("Prioritize PDFs",
                        value=st.session_state.get('prioritize_pdfs', True),
                        help="Focus on finding PDF files first")
        with col2:
            timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
            st.checkbox("Follow Subdomains", value=True,
                        help="Include links from subdomains in the search")
        with col3:
            default_extensions = {
                "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
                "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
                "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
                "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov"
            }
            # Unknown modes use the Standard extension list instead of raising KeyError.
            mode_default = default_extensions.get(st.session_state.mode, default_extensions["Standard"])

            custom_extensions = st.text_area(
                "Custom File Extensions",
                value=st.session_state.get('custom_extensions', mode_default),
                help="Comma-separated list of file extensions to look for"
            )

            if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions:
                st.session_state.custom_extensions = custom_extensions

    search_col1, search_col2 = st.columns([4, 1])
    with search_col1:
        search_button = st.button("🔍 Start Search", use_container_width=True)
    with search_col2:
        clear_button = st.button("🧹 Clear Results", use_container_width=True)

    if st.session_state.files:
        _render_file_results()

    return {
        "url": url,
        "search_method": search_method,
        "timeout": timeout,
        "custom_extensions": custom_extensions,
        "search_clicked": search_button,
        "clear_clicked": clear_button,
    }


def _render_local_search_tab():
    """Draw the "Local File Search" tab: upload, index building and querying."""
    st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
    st.write("Upload files to search through their content with AI-powered semantic search.")

    uploaded_files = st.file_uploader("Upload documents for search",
                                      accept_multiple_files=True,
                                      type=['pdf', 'docx', 'txt', 'csv', 'json'])

    if uploaded_files:
        col1, col2 = st.columns([4, 1])
        with col1:
            # NOTE(review): the checkbox value is not applied to rag_search anywhere.
            st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
                        help="Uses advanced AI for more accurate semantic search (if available)")
        with col2:
            if st.button("Build Search Index", use_container_width=True):
                with st.spinner("Processing files and building search index..."):
                    files_added = 0
                    for uploaded_file in uploaded_files:
                        file_info = {
                            'filename': uploaded_file.name,
                            'url': f'local://{uploaded_file.name}',
                            'size': _humanize_file_size(uploaded_file.size)
                        }
                        if st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info):
                            files_added += 1

                    if files_added > 0:
                        if st.session_state.rag_search.build_index():
                            st.success(f"✅ Successfully indexed {files_added} files!")
                        else:
                            st.error("Failed to build search index.")
                    else:
                        st.warning("No valid text could be extracted from the files.")

    st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)

    col1, col2 = st.columns([4, 1])
    with col1:
        query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
    with col2:
        # NOTE(review): this option is displayed but not passed to the search call.
        st.checkbox("Auto-expand query", value=True,
                    help="Automatically add related terms to your search")

    col1, col2 = st.columns([4, 1])
    # Read the limit before handling the button so the search can honour it;
    # writing into col2 first does not change where the widget is displayed.
    with col2:
        num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
    with col1:
        if st.button("🔍 Search Documents", use_container_width=True):
            if not query:
                st.warning("Please enter a search query")
            else:
                with st.spinner("Searching..."):
                    results = st.session_state.rag_search.search(
                        query, top_k=int(num_results), search_chunks=True)

                if results:
                    st.markdown(f"**Found {len(results)} relevant documents:**")
                    for rank, result in enumerate(results, start=1):
                        with st.container():
                            st.markdown("<div class='result-card'>", unsafe_allow_html=True)
                            st.markdown(f"**{rank}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")

                            if result.get('chunk_preview'):
                                st.markdown("**Matching content:**")
                                st.text(result['chunk_preview'])

                            st.markdown("</div>", unsafe_allow_html=True)
                else:
                    st.info("No matching documents found. Try a different query.")

    with st.expander("Search Tips", expanded=False):
        st.markdown("""
        ### Effective Search Tips

        - **Be specific** with your queries for more accurate results
        - **Try different phrasings** if you don't get the results you expect
        - Use **quotation marks** for exact phrase matching
        - For **complex topics**, break down your search into multiple queries
        - **Combine related terms** to improve recall

        The search engine uses advanced algorithms to understand the semantic meaning of your query,
        not just keyword matching.
        """)


def _render_browser_settings():
    """Draw the "Browser Settings" sub-tab; only the stealth flag is persisted."""
    col1, col2 = st.columns(2)
    with col1:
        use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
                                  help="Makes browser harder to detect as automated, but may be slower")

        # NOTE(review): the remaining widgets in this sub-tab are displayed
        # only; their values are not stored or used anywhere in this file.
        st.checkbox("Handle Captchas Automatically", value=False,
                    help="Attempt to solve simple captchas automatically")

        st.slider("Download Timeout (seconds)",
                  min_value=30, max_value=600, value=300,
                  help="Maximum time to wait for downloads to complete")
    with col2:
        st.selectbox("User Agent", USER_AGENTS, index=0,
                     help="Browser identity to use when accessing websites")

        st.checkbox("Save Browser Screenshots", value=False,
                    help="Save screenshots when errors occur for debugging")

        st.selectbox("Browser Language",
                     ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
                     index=0)

    if st.button("Update Browser Settings"):
        st.session_state.stealth_mode = use_stealth
        st.success("Browser settings updated!")

    st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
    if st.button("Install Playwright Dependencies"):
        from app.ui import install_playwright_dependencies
        with st.spinner("Installing dependencies..."):
            install_playwright_dependencies()


def _render_proxy_settings():
    """Draw the "Proxy Configuration" sub-tab and save the proxy settings on request."""
    proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
                                help="Route requests through a proxy server for anonymity or bypassing restrictions")

    # Defaults so the save handler works when the proxy widgets are hidden;
    # without them, saving with the proxy disabled raised NameError.
    proxy_type, proxy_host, proxy_port, proxy_auth = "HTTP", "", "", ""
    use_proxy_rotation, proxy_list, rotation_interval = False, "", 10

    if proxy_enabled:
        proxy_col1, proxy_col2 = st.columns(2)
        with proxy_col1:
            proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
            proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
        with proxy_col2:
            proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
            proxy_auth = st.text_input("Proxy Authentication (optional)",
                                       placeholder="username:password", type="password")

        st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
        use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
                                         help="Automatically rotate between multiple proxies for better anonymity")

        if use_proxy_rotation:
            proxy_list = st.text_area("Proxy List (one per line)",
                                      placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
            rotation_interval = st.slider("Rotation Interval (requests)",
                                          min_value=1, max_value=50, value=10,
                                          help="How often to switch proxies")

    if st.button("Save Proxy Configuration"):
        proxy_string = None
        if proxy_enabled and proxy_host and proxy_port:
            proxy_prefix = f"{proxy_type.lower()}://"
            proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
            proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"

        st.session_state.use_proxy = proxy_enabled
        st.session_state.proxy_string = proxy_string

        from app.utils import PROXY_ROTATION_CONFIG
        if use_proxy_rotation and proxy_list:
            PROXY_ROTATION_CONFIG["enabled"] = True
            PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
            PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]
        else:
            # Saving with rotation unticked must switch a previously enabled rotation off.
            PROXY_ROTATION_CONFIG["enabled"] = False

        st.success("Proxy configuration updated!")


def _render_download_settings():
    """Draw the "Download Options" sub-tab and store the choices in session state."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)

        skip_existing = st.checkbox("Skip Existing Files", value=True,
                                    help="Don't download files that already exist locally")

        auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
                                  help="Automatically rename files instead of overwriting")

        verify_downloads = st.checkbox("Verify Downloads", value=True,
                                       help="Check file integrity after download")

        max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
                                help="Number of times to retry failed downloads")

    with col2:
        st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)

        auto_organize = st.checkbox("Auto-Organize Files", value=True,
                                    help="Automatically organize files by type")

        default_dir = st.text_input("Default Download Directory", value="downloads",
                                    help="Default location to save downloaded files")

        org_by_domain = st.checkbox("Organize by Domain", value=False,
                                    help="Create subdirectories based on source domains")

        org_by_type = st.checkbox("Organize by File Type", value=False,
                                  help="Create subdirectories based on file types")

    if st.button("Save Download Settings"):
        st.session_state.download_settings = {
            "skip_existing": skip_existing,
            "auto_rename": auto_rename,
            "verify_downloads": verify_downloads,
            "max_retries": max_retries,
            "auto_organize": auto_organize,
            "default_dir": default_dir,
            "org_by_domain": org_by_domain,
            "org_by_type": org_by_type
        }
        st.success("Download settings saved!")


def _render_system_settings():
    """Draw the "System" sub-tab: performance and logging settings plus the reset button."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)

        max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
                                   help="Maximum number of simultaneous downloads")

        memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
                                 help="Maximum memory to use for file processing")

        processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
                                       help="Number of threads to use for file processing")

    with col2:
        st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)

        log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
                                 help="Detail level for application logs")

        save_debug_info = st.checkbox("Save Debug Information", value=False,
                                      help="Save detailed information about program execution")

        log_dir = st.text_input("Log Directory", value="logs",
                                help="Directory to save log files")

    if st.button("Apply System Settings"):
        import logging
        st.session_state.system_settings = {
            "max_concurrent": max_concurrent,
            "memory_limit": memory_limit,
            "processing_threads": processing_threads,
            "log_level": log_level,
            "save_debug_info": save_debug_info,
            "log_dir": log_dir
        }
        # The selectbox options are the names of the logging module's level constants.
        logging.getLogger().setLevel(getattr(logging, log_level))
        st.success("System settings applied!")

    st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
    reset_col1, reset_col2 = st.columns([1, 3])
    with reset_col1:
        if st.button("Reset Application", use_container_width=True):
            # Iterate over a copy of the keys because entries are deleted in the loop.
            for key in list(st.session_state.keys()):
                if key != 'google_credentials':  # Preserve Google auth
                    del st.session_state[key]
            st.success("Application has been reset!")
            st.rerun()
    with reset_col2:
        st.info("This will clear all search results, downloaded files, and reset settings to defaults.")


def _render_advanced_tab():
    """Draw the "Advanced Configuration" tab with its four settings sub-tabs."""
    st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)

    config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])
    with config_tabs[0]:
        _render_browser_settings()
    with config_tabs[1]:
        _render_proxy_settings()
    with config_tabs[2]:
        _render_download_settings()
    with config_tabs[3]:
        _render_system_settings()


def _render_help_tab():
    """Draw the static "Help" tab (quick start, features, troubleshooting, about)."""
    st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)

    help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])

    with help_tabs[0]:
        st.markdown("""
        ### Getting Started

        1. **Enter a URL** on the Search & Download tab
        2. Select a **Search Method**:
           - **Deep Search**: Thorough but slower
           - **Quick Search**: Fast but may miss some files
           - **Exam Site Mode**: Optimized for educational resource sites
        3. Click **Start Search** to find downloadable files
        4. Select files you want to download
        5. Click **Download Selected Files**

        #### Using Different Modes

        Select a mode from the sidebar to optimize the tool for different use cases:

        - **Standard Mode**: Balanced for general use
        - **Education Mode**: Optimized for finding academic materials
        - **Research Mode**: Better for research papers and datasets
        - **Media Mode**: Enhanced for finding images, videos, and audio

        For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
        """)

    with help_tabs[1]:
        st.markdown("""
        ### Advanced Features

        - **Local File Search**: Upload files and search through their content using the enhanced RAG search
        - **Custom Extensions**: Specify additional file types to look for beyond the default set
        - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
        - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
        - **Google Drive Integration**: Upload downloaded files directly to your Google Drive

        #### Search Tips

        - For educational sites, include specific terms like "exam", "test", "paper" in the URL
        - When using Local File Search, try different variations of your query for better results
        - Use filtering and sorting options to find the most relevant files quickly

        #### File Organization

        You can configure automatic file organization in the Advanced Configuration tab:

        - **Organize by Domain**: Creates folders based on the source website
        - **Organize by File Type**: Separates files into folders by their extension
        - **Auto-Rename**: Prevents overwriting existing files with same names
        """)

    with help_tabs[2]:
        st.markdown("""
        ### Troubleshooting

        #### Common Issues

        - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
        - **Downloads failing**: Check if the site requires authentication or uses captchas
        - **Slow performance**: Reduce search depth or disable stealth mode for faster results
        - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings

        #### Captcha Issues

        Some websites use captchas to prevent automated access. If you encounter captchas:

        1. Try using a different proxy
        2. Enable "Handle Captchas Automatically" for simple captchas
        3. For complex captchas, you may need to manually access the site first

        #### Proxy Problems

        If you're having issues with proxies:

        1. Verify your proxy is working with an external tool
        2. Check that you've entered the correct format (http://host:port)
        3. Some websites may block known proxy IPs

        #### Memory Usage

        If the application is using too much memory:

        1. Reduce the "Memory Limit" in System settings
        2. Process fewer files at once
        3. Use lower search depth values
        """)

    with help_tabs[3]:
        st.markdown("""
        ### About This Tool

        **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.

        #### Key Features

        - **Smart Discovery**: Finds downloadable files even when they're not directly linked
        - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
        - **Educational Focus**: Specialized detection for exam papers and academic resources
        - **Stealth Capabilities**: Avoids detection by anti-scraping measures

        #### Technical Details

        This tool uses:

        - **Playwright**: For browser automation and stealth capabilities
        - **Sentence Transformers**: For AI-powered semantic search
        - **Streamlit**: For the user interface
        - **Google Drive API**: For cloud integration

        #### Credits

        Created with Python, Streamlit, Playwright, and various AI libraries.

        For issues or suggestions, please contact the developer.

        Version 2.0 - March 2025
        """)


def _clear_results():
    """Drop stored search results, download state and any preset URL."""
    st.session_state.files = []
    st.session_state.downloaded_paths = []
    st.session_state.download_complete = False
    if 'preset_url' in st.session_state:
        st.session_state.preset_url = ''


def _run_search(url, request):
    """Search *url* with the options in *request*, store the files found, then rerun."""
    _clear_results()

    custom_ext_list = [ext.strip() for ext in request["custom_extensions"].split(",") if ext.strip()]

    search_method = request["search_method"]
    sublink_limit = 5000 if search_method == "Deep Search" else 1000
    is_exam_site = search_method == "Exam Site Mode"
    timeout = request["timeout"]

    async def run_search():
        async with DownloadManager(
            use_proxy=st.session_state.use_proxy,
            proxy=st.session_state.proxy_string,
            use_stealth=st.session_state.stealth_mode
        ) as manager:
            if not is_exam_site:
                st.session_state.files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout)
                return

            # Exam sites: collect the exam links first, then scan each one.
            st.session_state.keep_progress = True
            try:
                edu_links = await manager.get_edu_exam_links(url)
                all_files = []
                total = len(edu_links)

                progress_text = st.empty()
                progress_bar = st.progress(0)
                try:
                    for position, link in enumerate(edu_links, start=1):
                        progress_text.text(f"Processing exam link {position}/{total}: {link}")
                        progress_bar.progress(position / max(1, total))

                        files = await manager.extract_downloadable_files(link, custom_ext_list)
                        all_files.extend(files)
                finally:
                    # Remove the progress widgets even when a link fails.
                    progress_text.empty()
                    progress_bar.empty()

                st.session_state.files = all_files
            finally:
                # Never leave the flag set after an error.
                st.session_state.keep_progress = False

    asyncio.run(run_search())
    st.rerun()


def main():
    """Render the whole Streamlit page and react to the search / clear buttons.

    Initialises the session state, draws the sidebar, header and the four
    main tabs, then handles a requested search or a request to clear results.
    """
    initialize_session_state()
    setup_ui()
    create_sidebar()
    _render_header()

    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])
    with tabs[0]:
        request = _render_search_tab()
    with tabs[1]:
        _render_local_search_tab()
    with tabs[2]:
        _render_advanced_tab()
    with tabs[3]:
        _render_help_tab()

    # The buttons are handled after all tabs are drawn, as in the original flow.
    url = (request["url"] or "").strip()
    if request["search_clicked"]:
        if url:
            _run_search(url, request)
        else:
            st.warning("Please enter a URL to search.")

    if request["clear_clicked"]:
        _clear_results()
        st.rerun()
630
+
631
# Entry point: call main() only when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()