euler314 committed on
Commit
1d44b06
·
verified ·
1 Parent(s): f0d7dcd

Delete main.py

Browse files
Files changed (1) hide show
  1. main.py +0 -633
main.py DELETED
@@ -1,633 +0,0 @@
1
- import os
2
- import asyncio
3
- import streamlit as st
4
- from ui import (
5
- setup_ui, create_sidebar, display_file_results,
6
- handle_downloads, handle_google_drive_upload
7
- )
8
- from app.download_manager import DownloadManager
9
- from app.rag_search import EnhancedRAGSearch
10
- from app.utils import USER_AGENTS
11
-
12
def initialize_session_state():
    """Seed ``st.session_state`` with a default for every key this module reads.

    Keys that are already present are left untouched, so values survive
    Streamlit reruns; only missing keys receive their default.
    """
    # Each entry is (key, zero-argument factory). Factories are used instead of
    # plain values so the EnhancedRAGSearch instance and the list defaults are
    # only created when the key is actually missing.
    default_factories = (
        ('files', list),
        ('downloaded_paths', list),
        ('download_complete', lambda: False),
        ('selected_tab', lambda: 0),
        ('rag_search', EnhancedRAGSearch),
        ('keep_progress', lambda: False),
        ('google_credentials', lambda: None),
        ('mode', lambda: "Standard"),
        ('use_proxy', lambda: False),
        ('proxy_string', lambda: None),
        ('stealth_mode', lambda: True),
    )
    for state_key, make_default in default_factories:
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
36
-
37
def _format_file_size(num_bytes):
    """Return *num_bytes* as a short human-readable string such as ``"1.5 MB"``.

    Local replacement for ``humanize_file_size``, which this module called
    without importing or defining it.
    """
    size = float(max(num_bytes or 0, 0))
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            return f"{size:.0f} {unit}" if unit == "B" else f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"


def _reset_results():
    """Clear search results, download bookkeeping and any preset URL."""
    st.session_state.files = []
    st.session_state.downloaded_paths = []
    st.session_state.download_complete = False
    if 'preset_url' in st.session_state:
        st.session_state.preset_url = ''


def _render_header():
    """Render the page title, icon and the description of the active mode."""
    col1, col2 = st.columns([5, 1])
    with col1:
        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
    with col2:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)

    mode_descriptions = {
        "Standard": "A versatile tool for discovering and downloading files from any website.",
        "Education Mode": "Optimized for educational resources, exams, and academic materials.",
        "Research Mode": "Focused on research papers, datasets, and academic publications.",
        "Media Mode": "Enhanced for finding and downloading images, videos, and audio files."
    }
    # Fall back to the Standard text instead of raising KeyError when the
    # session holds a mode this table does not know.
    description = mode_descriptions.get(st.session_state.mode, mode_descriptions["Standard"])
    st.markdown(f"<p class='info-text'>{description}</p>", unsafe_allow_html=True)


def _render_search_tab():
    """Render the "Search & Download" tab.

    Returns:
        dict: the inputs the caller needs once all tabs are drawn, with keys
        ``url``, ``search_method``, ``custom_extensions``, ``timeout``,
        ``search_clicked`` and ``clear_clicked``.
    """
    st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)

    search_methods = ["Deep Search", "Quick Search", "Exam Site Mode"]
    col1, col2 = st.columns([3, 1])
    with col1:
        url = st.text_input("Enter a URL to search for downloadable files:",
                            placeholder="e.g., https://example.com/resources",
                            value=st.session_state.get('preset_url', ''))
    with col2:
        # Initialize search_method with either session state or default value;
        # an unknown stored value selects the first entry rather than raising.
        initial_search_method = st.session_state.get('search_method', "Deep Search")
        if initial_search_method in search_methods:
            method_index = search_methods.index(initial_search_method)
        else:
            method_index = 0
        search_method = st.selectbox("Search Method", search_methods, index=method_index)
        # Update session state when changed
        if search_method != st.session_state.get('search_method'):
            st.session_state.search_method = search_method

    # Advanced options in an expander
    with st.expander("Search Options", expanded=False):
        col1, col2, col3 = st.columns(3)
        with col1:
            # NOTE(review): the depth and "Prioritize PDFs" values are not
            # forwarded to the search call in this file.
            st.slider("Search Depth", min_value=1, max_value=5, value=2,
                      help="Higher values will search more links but take longer")
            st.checkbox("Prioritize PDFs",
                        value=st.session_state.get('prioritize_pdfs', True),
                        help="Focus on finding PDF files first")
        with col2:
            timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
            # NOTE(review): "Follow Subdomains" is not consumed in this file.
            st.checkbox("Follow Subdomains", value=True,
                        help="Include links from subdomains in the search")
        with col3:
            # Default extensions based on mode
            default_extensions = {
                "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
                "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
                "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
                "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov"
            }
            mode_default = default_extensions.get(st.session_state.mode,
                                                  default_extensions["Standard"])

            custom_extensions = st.text_area(
                "Custom File Extensions",
                value=st.session_state.get('custom_extensions', mode_default),
                help="Comma-separated list of file extensions to look for"
            )

            # Update session state when extensions changed
            if ('custom_extensions' not in st.session_state
                    or custom_extensions != st.session_state.custom_extensions):
                st.session_state.custom_extensions = custom_extensions

    search_col1, search_col2 = st.columns([4, 1])
    with search_col1:
        search_button = st.button("🔍 Start Search", use_container_width=True)
    with search_col2:
        clear_button = st.button("🧹 Clear Results", use_container_width=True)

    # File results section
    if st.session_state.files:
        _render_file_results()

    return {
        "url": url,
        "search_method": search_method,
        "custom_extensions": custom_extensions,
        "timeout": timeout,
        "search_clicked": search_button,
        "clear_clicked": clear_button,
    }


def _render_file_results():
    """Show found files plus the download / Drive-upload controls for the selection."""
    selected_files, displayed_files = display_file_results(st.session_state.files)

    # Download options are only offered once something is selected
    if not selected_files:
        return

    col1, col2 = st.columns(2)
    with col1:
        download_dir = st.text_input("Download Directory", value="downloads")
    with col2:
        download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)

    download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
    with download_col1:
        download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
    with download_col2:
        google_drive_button = st.button("📤 Upload to Drive",
                                        use_container_width=True,
                                        disabled=not st.session_state.google_credentials)
    with download_col3:
        select_all = st.button("Select All Files", use_container_width=True)

    # Handle select all button
    if select_all:
        for i in displayed_files:
            st.session_state[f"select_{i}"] = True
        st.rerun()

    # Handle download button if clicked
    if download_button:
        # Create download directory
        os.makedirs(download_dir, exist_ok=True)
        handle_downloads(selected_files, download_dir, download_option, download_col1)

    # Handle Google Drive upload
    if google_drive_button:
        handle_google_drive_upload(selected_files)


def _render_local_search_tab():
    """Render the "Local File Search" tab: upload, index building and querying."""
    st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
    st.write("Upload files to search through their content with AI-powered semantic search.")

    # File upload
    uploaded_files = st.file_uploader("Upload documents for search",
                                      accept_multiple_files=True,
                                      type=['pdf', 'docx', 'txt', 'csv', 'json'])

    if uploaded_files:
        # Build search index on upload
        col1, col2 = st.columns([4, 1])
        with col1:
            # NOTE(review): the checkbox value is not written back to rag_search here.
            st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
                        help="Uses advanced AI for more accurate semantic search (if available)")
        with col2:
            if st.button("Build Search Index", use_container_width=True):
                with st.spinner("Processing files and building search index..."):
                    files_added = 0
                    for uploaded_file in uploaded_files:
                        file_info = {
                            'filename': uploaded_file.name,
                            'url': f'local://{uploaded_file.name}',
                            'size': _format_file_size(uploaded_file.size)
                        }
                        if st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info):
                            files_added += 1

                    if files_added > 0:
                        if st.session_state.rag_search.build_index():
                            st.success(f"✅ Successfully indexed {files_added} files!")
                        else:
                            st.error("Failed to build search index.")
                    else:
                        st.warning("No valid text could be extracted from the files.")

    # Search interface
    st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)

    col1, col2 = st.columns([4, 1])
    with col1:
        query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
    with col2:
        # NOTE(review): "Auto-expand query" is not passed to the search call here.
        st.checkbox("Auto-expand query", value=True,
                    help="Automatically add related terms to your search")

    col1, col2 = st.columns([4, 1])
    # The result limit is read before the search runs so that it can be
    # applied; it still renders in the right-hand column.
    with col2:
        num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
    with col1:
        if st.button("🔍 Search Documents", use_container_width=True):
            if not query:
                st.warning("Please enter a search query")
            else:
                with st.spinner("Searching..."):
                    results = st.session_state.rag_search.search(
                        query, top_k=int(num_results), search_chunks=True)

                if results:
                    st.markdown(f"**Found {len(results)} relevant documents:**")
                    for rank, result in enumerate(results, start=1):
                        with st.container():
                            st.markdown("<div class='result-card'>", unsafe_allow_html=True)
                            st.markdown(f"**{rank}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")

                            if result.get('chunk_preview'):
                                st.markdown("**Matching content:**")
                                st.text(result['chunk_preview'])

                            st.markdown("</div>", unsafe_allow_html=True)
                else:
                    st.info("No matching documents found. Try a different query.")

    # Quick search tips
    with st.expander("Search Tips", expanded=False):
        st.markdown("""
        ### Effective Search Tips

        - **Be specific** with your queries for more accurate results
        - **Try different phrasings** if you don't get the results you expect
        - Use **quotation marks** for exact phrase matching
        - For **complex topics**, break down your search into multiple queries
        - **Combine related terms** to improve recall

        The search engine uses advanced algorithms to understand the semantic meaning of your query,
        not just keyword matching.
        """)


def _render_browser_settings():
    """Render the "Browser Settings" sub-tab and the dependency installer."""
    col1, col2 = st.columns(2)
    with col1:
        use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
                                  help="Makes browser harder to detect as automated, but may be slower")

        # NOTE(review): the remaining widgets in this sub-tab are displayed
        # but their values are not stored or used anywhere in this file.
        st.checkbox("Handle Captchas Automatically", value=False,
                    help="Attempt to solve simple captchas automatically")

        st.slider("Download Timeout (seconds)",
                  min_value=30, max_value=600, value=300,
                  help="Maximum time to wait for downloads to complete")
    with col2:
        st.selectbox("User Agent", USER_AGENTS, index=0,
                     help="Browser identity to use when accessing websites")

        st.checkbox("Save Browser Screenshots", value=False,
                    help="Save screenshots when errors occur for debugging")

        st.selectbox("Browser Language",
                     ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
                     index=0)

    if st.button("Update Browser Settings"):
        st.session_state.stealth_mode = use_stealth
        st.success("Browser settings updated!")

    # Dependency installation section
    st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
    if st.button("Install Playwright Dependencies"):
        # NOTE(review): the module header imports from `ui`, this imports from
        # `app.ui` — confirm which package path is correct.
        from app.ui import install_playwright_dependencies
        with st.spinner("Installing dependencies..."):
            install_playwright_dependencies()


def _render_proxy_settings():
    """Render the "Proxy Configuration" sub-tab and persist it on save."""
    proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
                                help="Route requests through a proxy server for anonymity or bypassing restrictions")

    # Bind defaults first: the widgets below only exist while the proxy is
    # enabled, but the save handler reads these names unconditionally.
    proxy_type = "HTTP"
    proxy_host = ""
    proxy_port = ""
    proxy_auth = ""
    use_proxy_rotation = False
    proxy_list = ""
    rotation_interval = 10

    if proxy_enabled:
        proxy_col1, proxy_col2 = st.columns(2)
        with proxy_col1:
            proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
            proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
        with proxy_col2:
            proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
            proxy_auth = st.text_input("Proxy Authentication (optional)",
                                       placeholder="username:password", type="password")

        st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
        use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
                                         help="Automatically rotate between multiple proxies for better anonymity")

        if use_proxy_rotation:
            proxy_list = st.text_area("Proxy List (one per line)",
                                      placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
            rotation_interval = st.slider("Rotation Interval (requests)",
                                          min_value=1, max_value=50, value=10,
                                          help="How often to switch proxies")

    if st.button("Save Proxy Configuration"):
        # Construct the proxy string
        proxy_string = None
        if proxy_enabled and proxy_host and proxy_port:
            proxy_prefix = f"{proxy_type.lower()}://"
            proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
            proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"

        # Update session state
        st.session_state.use_proxy = proxy_enabled
        st.session_state.proxy_string = proxy_string

        # Configure proxy rotation if enabled
        from app.utils import PROXY_ROTATION_CONFIG
        if use_proxy_rotation and proxy_list:
            PROXY_ROTATION_CONFIG["enabled"] = True
            PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
            PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]

        st.success("Proxy configuration updated!")


def _render_download_settings():
    """Render the "Download Options" sub-tab and store the choices on save."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)

        skip_existing = st.checkbox("Skip Existing Files", value=True,
                                    help="Don't download files that already exist locally")

        auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
                                  help="Automatically rename files instead of overwriting")

        verify_downloads = st.checkbox("Verify Downloads", value=True,
                                       help="Check file integrity after download")

        max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
                                help="Number of times to retry failed downloads")

    with col2:
        st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)

        auto_organize = st.checkbox("Auto-Organize Files", value=True,
                                    help="Automatically organize files by type")

        default_dir = st.text_input("Default Download Directory", value="downloads",
                                    help="Default location to save downloaded files")

        org_by_domain = st.checkbox("Organize by Domain", value=False,
                                    help="Create subdirectories based on source domains")

        org_by_type = st.checkbox("Organize by File Type", value=False,
                                  help="Create subdirectories based on file types")

    if st.button("Save Download Settings"):
        st.session_state.download_settings = {
            "skip_existing": skip_existing,
            "auto_rename": auto_rename,
            "verify_downloads": verify_downloads,
            "max_retries": max_retries,
            "auto_organize": auto_organize,
            "default_dir": default_dir,
            "org_by_domain": org_by_domain,
            "org_by_type": org_by_type
        }
        st.success("Download settings saved!")


def _render_system_settings():
    """Render the "System" sub-tab: performance, logging and the reset control."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)

        max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
                                   help="Maximum number of simultaneous downloads")

        memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
                                 help="Maximum memory to use for file processing")

        processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
                                       help="Number of threads to use for file processing")

    with col2:
        st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)

        log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
                                 help="Detail level for application logs")

        save_debug_info = st.checkbox("Save Debug Information", value=False,
                                      help="Save detailed information about program execution")

        log_dir = st.text_input("Log Directory", value="logs",
                                help="Directory to save log files")

    if st.button("Apply System Settings"):
        import logging
        st.session_state.system_settings = {
            "max_concurrent": max_concurrent,
            "memory_limit": memory_limit,
            "processing_threads": processing_threads,
            "log_level": log_level,
            "save_debug_info": save_debug_info,
            "log_dir": log_dir
        }
        # Update logging configuration: the selectbox values are the names of
        # the logging module's level constants.
        logging.getLogger().setLevel(getattr(logging, log_level))
        st.success("System settings applied!")

    # Reset application button
    st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
    reset_col1, reset_col2 = st.columns([1, 3])
    with reset_col1:
        if st.button("Reset Application", use_container_width=True):
            # Iterate over a snapshot of the keys because entries are deleted
            # while looping.
            for key in list(st.session_state.keys()):
                if key != 'google_credentials':  # Preserve Google auth
                    del st.session_state[key]
            st.success("Application has been reset!")
            st.rerun()
    with reset_col2:
        st.info("This will clear all search results, downloaded files, and reset settings to defaults.")


def _render_config_tab():
    """Render the "Advanced Configuration" tab and its four sub-tabs."""
    st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)

    config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])

    with config_tabs[0]:
        _render_browser_settings()
    with config_tabs[1]:
        _render_proxy_settings()
    with config_tabs[2]:
        _render_download_settings()
    with config_tabs[3]:
        _render_system_settings()


def _render_help_tab():
    """Render the static "Help" tab (quick start, features, troubleshooting, about)."""
    st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)

    help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])

    with help_tabs[0]:
        st.markdown("""
        ### Getting Started

        1. **Enter a URL** on the Search & Download tab
        2. Select a **Search Method**:
           - **Deep Search**: Thorough but slower
           - **Quick Search**: Fast but may miss some files
           - **Exam Site Mode**: Optimized for educational resource sites
        3. Click **Start Search** to find downloadable files
        4. Select files you want to download
        5. Click **Download Selected Files**

        #### Using Different Modes

        Select a mode from the sidebar to optimize the tool for different use cases:

        - **Standard Mode**: Balanced for general use
        - **Education Mode**: Optimized for finding academic materials
        - **Research Mode**: Better for research papers and datasets
        - **Media Mode**: Enhanced for finding images, videos, and audio

        For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
        """)

    with help_tabs[1]:
        st.markdown("""
        ### Advanced Features

        - **Local File Search**: Upload files and search through their content using the enhanced RAG search
        - **Custom Extensions**: Specify additional file types to look for beyond the default set
        - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
        - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
        - **Google Drive Integration**: Upload downloaded files directly to your Google Drive

        #### Search Tips

        - For educational sites, include specific terms like "exam", "test", "paper" in the URL
        - When using Local File Search, try different variations of your query for better results
        - Use filtering and sorting options to find the most relevant files quickly

        #### File Organization

        You can configure automatic file organization in the Advanced Configuration tab:

        - **Organize by Domain**: Creates folders based on the source website
        - **Organize by File Type**: Separates files into folders by their extension
        - **Auto-Rename**: Prevents overwriting existing files with same names
        """)

    with help_tabs[2]:
        st.markdown("""
        ### Troubleshooting

        #### Common Issues

        - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
        - **Downloads failing**: Check if the site requires authentication or uses captchas
        - **Slow performance**: Reduce search depth or disable stealth mode for faster results
        - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings

        #### Captcha Issues

        Some websites use captchas to prevent automated access. If you encounter captchas:

        1. Try using a different proxy
        2. Enable "Handle Captchas Automatically" for simple captchas
        3. For complex captchas, you may need to manually access the site first

        #### Proxy Problems

        If you're having issues with proxies:

        1. Verify your proxy is working with an external tool
        2. Check that you've entered the correct format (http://host:port)
        3. Some websites may block known proxy IPs

        #### Memory Usage

        If the application is using too much memory:

        1. Reduce the "Memory Limit" in System settings
        2. Process fewer files at once
        3. Use lower search depth values
        """)

    with help_tabs[3]:
        st.markdown("""
        ### About This Tool

        **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.

        #### Key Features

        - **Smart Discovery**: Finds downloadable files even when they're not directly linked
        - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
        - **Educational Focus**: Specialized detection for exam papers and academic resources
        - **Stealth Capabilities**: Avoids detection by anti-scraping measures

        #### Technical Details

        This tool uses:

        - **Playwright**: For browser automation and stealth capabilities
        - **Sentence Transformers**: For AI-powered semantic search
        - **Streamlit**: For the user interface
        - **Google Drive API**: For cloud integration

        #### Credits

        Created with Python, Streamlit, Playwright, and various AI libraries.

        For issues or suggestions, please contact the developer.

        Version 2.0 - March 2025
        """)


def _run_search(url, search_method, custom_extensions, timeout):
    """Run the selected search for *url* and store the found files in session state.

    Args:
        url: page to search.
        search_method: one of the "Search Method" selectbox labels.
        custom_extensions: comma-separated extension string from the text area.
        timeout: value of the "Timeout (seconds)" slider, passed to ``deep_search``.
    """
    # Reset files and downloaded paths, and clear the preset URL if it was used
    _reset_results()

    # Prepare custom extensions
    custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()]

    # Configure search parameters based on method
    sublink_limit = 5000 if search_method == "Deep Search" else 1000
    is_exam_site = search_method == "Exam Site Mode"

    async def run_search():
        async with DownloadManager(
            use_proxy=st.session_state.use_proxy,
            proxy=st.session_state.proxy_string,
            use_stealth=st.session_state.stealth_mode
        ) as manager:
            if not is_exam_site:
                # Use general search method
                st.session_state.files = await manager.deep_search(
                    url, custom_ext_list, sublink_limit, timeout)
                return

            # For exam sites, use specialized approach
            st.session_state.keep_progress = True
            progress_text = st.empty()
            progress_bar = st.progress(0)
            try:
                edu_links = await manager.get_edu_exam_links(url)
                all_files = []

                # Process each exam link
                total_links = len(edu_links)
                for position, link in enumerate(edu_links, start=1):
                    progress_text.text(f"Processing exam link {position}/{total_links}: {link}")
                    progress_bar.progress(position / max(1, total_links))

                    files = await manager.extract_downloadable_files(link, custom_ext_list)
                    all_files.extend(files)

                st.session_state.files = all_files
            finally:
                # Always remove the progress widgets and release the flag,
                # even when a link fails part-way through.
                progress_text.empty()
                progress_bar.empty()
                st.session_state.keep_progress = False

    # Run the search, then rerun the script so the results are displayed
    asyncio.run(run_search())
    st.rerun()


def main():
    """Build the Streamlit page and react to the search / clear buttons.

    Session state, styling and the sidebar are initialised first, then the
    header and the four main tabs are rendered. The search and clear actions
    are handled only after every tab has been drawn.
    """
    # Initialize session state
    initialize_session_state()

    # Set up UI styling
    setup_ui()

    # Create sidebar
    create_sidebar()

    # Header section
    _render_header()

    # Main tabs
    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])

    with tabs[0]:
        search_request = _render_search_tab()
    with tabs[1]:
        _render_local_search_tab()
    with tabs[2]:
        _render_config_tab()
    with tabs[3]:
        _render_help_tab()

    # Handle search button
    if search_request["search_clicked"]:
        if search_request["url"]:
            _run_search(
                search_request["url"],
                search_request["search_method"],
                search_request["custom_extensions"],
                search_request["timeout"],
            )
        else:
            # Tell the user why nothing happened instead of ignoring the click
            st.warning("Please enter a URL to search.")

    # Handle clear button
    if search_request["clear_clicked"]:
        _reset_results()
        st.rerun()
630
-
631
# Entry point: build the page only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()