Create app/main.py
Browse files- app/main.py +633 -0
app/main.py
ADDED
@@ -0,0 +1,633 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import asyncio
|
3 |
+
import streamlit as st
|
4 |
+
from app.ui import (
|
5 |
+
setup_ui, create_sidebar, display_file_results,
|
6 |
+
handle_downloads, handle_google_drive_upload
|
7 |
+
)
|
8 |
+
from app.download_manager import DownloadManager
|
9 |
+
from app.rag_search import EnhancedRAGSearch
|
10 |
+
from app.utils import USER_AGENTS
|
11 |
+
|
12 |
+
def initialize_session_state():
    """Seed ``st.session_state`` with a default for every key that is not set yet.

    Keys that already exist are left untouched, so calling this on every
    script run is safe.
    """
    # Factories rather than values: each default is only built when its key
    # is actually missing, and list defaults are never shared between keys.
    default_factories = {
        'files': list,
        'downloaded_paths': list,
        'download_complete': lambda: False,
        'selected_tab': lambda: 0,
        'rag_search': EnhancedRAGSearch,
        'keep_progress': lambda: False,
        'google_credentials': lambda: None,
        'mode': lambda: "Standard",
        'use_proxy': lambda: False,
        'proxy_string': lambda: None,
        'stealth_mode': lambda: True,
    }

    for state_key, make_default in default_factories.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
|
36 |
+
|
37 |
+
# Options offered by the "Search Method" selectbox; the first entry is the default.
_SEARCH_METHODS = ["Deep Search", "Quick Search", "Exam Site Mode"]

# Mode used when ``st.session_state.mode`` holds a value with no entry below.
_FALLBACK_MODE = "Standard"

# One-line description shown under the page header, keyed by mode.
_MODE_DESCRIPTIONS = {
    "Standard": "A versatile tool for discovering and downloading files from any website.",
    "Education Mode": "Optimized for educational resources, exams, and academic materials.",
    "Research Mode": "Focused on research papers, datasets, and academic publications.",
    "Media Mode": "Enhanced for finding and downloading images, videos, and audio files.",
}

# Initial content of the "Custom File Extensions" box, keyed by mode.
_DEFAULT_EXTENSIONS = {
    "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
    "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
    "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
    "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov",
}


def _format_file_size(num_bytes):
    """Return *num_bytes* as a short human-readable string such as ``"1.5 MB"``."""
    size = float(num_bytes)
    if size < 1024:
        return f"{size:.0f} B"
    for unit in ("KB", "MB", "GB"):
        size /= 1024
        if size < 1024:
            return f"{size:.1f} {unit}"
    return f"{size / 1024:.1f} TB"


def _render_download_controls(selected_files, displayed_files):
    """Render the download / Drive-upload / select-all controls and act on clicks."""
    dir_col, format_col = st.columns(2)
    with dir_col:
        download_dir = st.text_input("Download Directory", value="downloads")
    with format_col:
        download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)

    download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
    with download_col1:
        download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
    with download_col2:
        google_drive_button = st.button("📤 Upload to Drive",
                                        use_container_width=True,
                                        disabled=not st.session_state.google_credentials)
    with download_col3:
        select_all = st.button("Select All Files", use_container_width=True)

    if select_all:
        # presumably `displayed_files` yields the indices behind the
        # per-file "select_<i>" state keys — confirm against display_file_results
        for i in displayed_files:
            st.session_state[f"select_{i}"] = True
        st.rerun()

    if download_button:
        # An empty path would make os.makedirs raise; fall back to the default.
        target_dir = download_dir.strip() or "downloads"
        os.makedirs(target_dir, exist_ok=True)
        handle_downloads(selected_files, target_dir, download_option, download_col1)

    if google_drive_button:
        handle_google_drive_upload(selected_files)


def _render_search_tab():
    """Render the "Search & Download" tab.

    Returns:
        dict with the values the search/clear handlers need: ``url``,
        ``search_method``, ``custom_extensions``, ``timeout``,
        ``search_button`` and ``clear_button``.
    """
    st.markdown("<h2 class='section-subheader'>Find and Download Files</h2>", unsafe_allow_html=True)

    url_col, method_col = st.columns([3, 1])
    with url_col:
        url = st.text_input("Enter a URL to search for downloadable files:",
                            placeholder="e.g., https://example.com/resources",
                            value=st.session_state.get('preset_url', ''))
    with method_col:
        stored_method = st.session_state.get('search_method', _SEARCH_METHODS[0])
        # A stale or unknown stored value would make list.index raise
        # ValueError, so fall back to the first option instead.
        method_index = _SEARCH_METHODS.index(stored_method) if stored_method in _SEARCH_METHODS else 0
        search_method = st.selectbox("Search Method", _SEARCH_METHODS, index=method_index)
        if search_method != st.session_state.get('search_method'):
            st.session_state.search_method = search_method

    with st.expander("Search Options", expanded=False):
        col1, col2, col3 = st.columns(3)
        with col1:
            # NOTE(review): the depth and "Prioritize PDFs" values are not
            # forwarded anywhere in this module — the deep_search call takes
            # no depth argument here. Wire them up or drop the widgets.
            st.slider("Search Depth", min_value=1, max_value=5, value=2,
                      help="Higher values will search more links but take longer")
            st.checkbox("Prioritize PDFs",
                        value=st.session_state.get('prioritize_pdfs', True),
                        help="Focus on finding PDF files first")
        with col2:
            timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
            # NOTE(review): value is not consumed anywhere in this module.
            st.checkbox("Follow Subdomains", value=True,
                        help="Include links from subdomains in the search")
        with col3:
            # Unknown modes fall back to the Standard extension set instead
            # of raising KeyError.
            mode_extensions = _DEFAULT_EXTENSIONS.get(
                st.session_state.mode, _DEFAULT_EXTENSIONS[_FALLBACK_MODE])
            custom_extensions = st.text_area(
                "Custom File Extensions",
                value=st.session_state.get('custom_extensions', mode_extensions),
                help="Comma-separated list of file extensions to look for"
            )

            # Remember the edited list across reruns
            if ('custom_extensions' not in st.session_state
                    or custom_extensions != st.session_state.custom_extensions):
                st.session_state.custom_extensions = custom_extensions

    search_col1, search_col2 = st.columns([4, 1])
    with search_col1:
        search_button = st.button("🔍 Start Search", use_container_width=True)
    with search_col2:
        clear_button = st.button("🧹 Clear Results", use_container_width=True)

    # File results section
    if st.session_state.files:
        selected_files, displayed_files = display_file_results(st.session_state.files)
        if selected_files:
            _render_download_controls(selected_files, displayed_files)

    return {
        "url": url,
        "search_method": search_method,
        "custom_extensions": custom_extensions,
        "timeout": timeout,
        "search_button": search_button,
        "clear_button": clear_button,
    }


def _build_search_index(uploaded_files):
    """Add each uploaded file to the session's RAG search and build its index."""
    rag_search = st.session_state.rag_search
    with st.spinner("Processing files and building search index..."):
        files_added = 0
        for uploaded_file in uploaded_files:
            file_info = {
                'filename': uploaded_file.name,
                'url': f'local://{uploaded_file.name}',
                # The original called `humanize_file_size`, which this module
                # never imports (NameError); format the size locally instead.
                'size': _format_file_size(uploaded_file.size)
            }
            if rag_search.add_file(uploaded_file.getvalue(), file_info):
                files_added += 1

        if files_added > 0:
            if rag_search.build_index():
                st.success(f"✅ Successfully indexed {files_added} files!")
            else:
                st.error("Failed to build search index.")
        else:
            st.warning("No valid text could be extracted from the files.")


def _render_local_search_tab():
    """Render the "Local File Search" tab: upload, index building and querying."""
    st.markdown("<h2 class='section-subheader'>Search Downloaded Files</h2>", unsafe_allow_html=True)
    st.write("Upload files to search through their content with AI-powered semantic search.")

    uploaded_files = st.file_uploader("Upload documents for search",
                                      accept_multiple_files=True,
                                      type=['pdf', 'docx', 'txt', 'csv', 'json'])

    if uploaded_files:
        col1, col2 = st.columns([4, 1])
        with col1:
            # NOTE(review): the checkbox value is not written back to the
            # search object anywhere in this module.
            st.checkbox("Use AI Transformer Model", value=st.session_state.rag_search.use_transformer,
                        help="Uses advanced AI for more accurate semantic search (if available)")
        with col2:
            if st.button("Build Search Index", use_container_width=True):
                _build_search_index(uploaded_files)

    # Search interface
    st.markdown("<h3 class='section-subheader'>Search Files</h3>", unsafe_allow_html=True)

    query_col, expand_col = st.columns([4, 1])
    with query_col:
        query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
    with expand_col:
        # NOTE(review): this value is not passed to the search call below.
        st.checkbox("Auto-expand query", value=True,
                    help="Automatically add related terms to your search")

    action_col, limit_col = st.columns([4, 1])
    # Read the limit before handling the click so the search can honor it
    # (the original always used top_k=5). Columns are containers, so the
    # on-screen layout is unchanged.
    with limit_col:
        num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
    with action_col:
        if st.button("🔍 Search Documents", use_container_width=True):
            if not query:
                st.warning("Please enter a search query")
            else:
                with st.spinner("Searching..."):
                    results = st.session_state.rag_search.search(
                        query, top_k=int(num_results), search_chunks=True)

                if results:
                    st.markdown(f"**Found {len(results)} relevant documents:**")
                    for rank, result in enumerate(results, start=1):
                        with st.container():
                            st.markdown("<div class='result-card'>", unsafe_allow_html=True)
                            st.markdown(f"**{rank}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")

                            if result.get('chunk_preview'):
                                st.markdown("**Matching content:**")
                                st.text(result['chunk_preview'])

                            st.markdown("</div>", unsafe_allow_html=True)
                else:
                    st.info("No matching documents found. Try a different query.")

    # Quick search tips
    with st.expander("Search Tips", expanded=False):
        st.markdown("""
        ### Effective Search Tips

        - **Be specific** with your queries for more accurate results
        - **Try different phrasings** if you don't get the results you expect
        - Use **quotation marks** for exact phrase matching
        - For **complex topics**, break down your search into multiple queries
        - **Combine related terms** to improve recall

        The search engine uses advanced algorithms to understand the semantic meaning of your query,
        not just keyword matching.
        """)


def _render_browser_settings():
    """Render the "Browser Settings" sub-tab, including dependency installation."""
    col1, col2 = st.columns(2)
    with col1:
        use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
                                  help="Makes browser harder to detect as automated, but may be slower")

        # NOTE(review): only the stealth flag is saved below; the remaining
        # widgets in this sub-tab are rendered but their values are not used
        # anywhere in this module.
        st.checkbox("Handle Captchas Automatically", value=False,
                    help="Attempt to solve simple captchas automatically")

        st.slider("Download Timeout (seconds)",
                  min_value=30, max_value=600, value=300,
                  help="Maximum time to wait for downloads to complete")
    with col2:
        st.selectbox("User Agent", USER_AGENTS, index=0,
                     help="Browser identity to use when accessing websites")

        st.checkbox("Save Browser Screenshots", value=False,
                    help="Save screenshots when errors occur for debugging")

        st.selectbox("Browser Language",
                     ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
                     index=0)

    if st.button("Update Browser Settings"):
        st.session_state.stealth_mode = use_stealth
        st.success("Browser settings updated!")

    # Dependency installation section
    st.markdown("<h4 class='section-subheader'>Dependencies</h4>", unsafe_allow_html=True)
    if st.button("Install Playwright Dependencies"):
        from app.ui import install_playwright_dependencies
        with st.spinner("Installing dependencies..."):
            install_playwright_dependencies()


def _render_proxy_settings():
    """Render the "Proxy Configuration" sub-tab and persist it on save."""
    proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
                                help="Route requests through a proxy server for anonymity or bypassing restrictions")

    # Defaults for widgets that are only rendered conditionally, so the save
    # handler below never touches an unbound name.
    proxy_type, proxy_host, proxy_port, proxy_auth = "HTTP", "", "", ""
    proxy_list, rotation_interval = "", 10

    if proxy_enabled:
        proxy_col1, proxy_col2 = st.columns(2)
        with proxy_col1:
            proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
            proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
        with proxy_col2:
            proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
            proxy_auth = st.text_input("Proxy Authentication (optional)",
                                       placeholder="username:password", type="password")

    st.markdown("<h4 class='section-subheader'>Proxy Rotation</h4>", unsafe_allow_html=True)
    use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
                                     help="Automatically rotate between multiple proxies for better anonymity")

    if use_proxy_rotation:
        proxy_list = st.text_area("Proxy List (one per line)",
                                  placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
        rotation_interval = st.slider("Rotation Interval (requests)",
                                      min_value=1, max_value=50, value=10,
                                      help="How often to switch proxies")

    if st.button("Save Proxy Configuration"):
        # Build "<scheme>://[user:pass@]host:port", or None when incomplete
        proxy_string = None
        if proxy_enabled and proxy_host and proxy_port:
            credentials = f"{proxy_auth}@" if proxy_auth else ""
            proxy_string = f"{proxy_type.lower()}://{credentials}{proxy_host}:{proxy_port}"

        st.session_state.use_proxy = proxy_enabled
        st.session_state.proxy_string = proxy_string

        # NOTE(review): rotation is only ever switched on here; saving with
        # the rotation box unchecked leaves PROXY_ROTATION_CONFIG unchanged.
        from app.utils import PROXY_ROTATION_CONFIG
        if use_proxy_rotation and proxy_list:
            PROXY_ROTATION_CONFIG["enabled"] = True
            PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
            PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]

        st.success("Proxy configuration updated!")


def _render_download_settings():
    """Render the "Download Options" sub-tab and store the choices on save."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Download Behavior</h4>", unsafe_allow_html=True)

        skip_existing = st.checkbox("Skip Existing Files", value=True,
                                    help="Don't download files that already exist locally")

        auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
                                  help="Automatically rename files instead of overwriting")

        verify_downloads = st.checkbox("Verify Downloads", value=True,
                                       help="Check file integrity after download")

        max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
                                help="Number of times to retry failed downloads")

    with col2:
        st.markdown("<h4 class='section-subheader'>File Organization</h4>", unsafe_allow_html=True)

        auto_organize = st.checkbox("Auto-Organize Files", value=True,
                                    help="Automatically organize files by type")

        default_dir = st.text_input("Default Download Directory", value="downloads",
                                    help="Default location to save downloaded files")

        org_by_domain = st.checkbox("Organize by Domain", value=False,
                                    help="Create subdirectories based on source domains")

        org_by_type = st.checkbox("Organize by File Type", value=False,
                                  help="Create subdirectories based on file types")

    if st.button("Save Download Settings"):
        st.session_state.download_settings = {
            "skip_existing": skip_existing,
            "auto_rename": auto_rename,
            "verify_downloads": verify_downloads,
            "max_retries": max_retries,
            "auto_organize": auto_organize,
            "default_dir": default_dir,
            "org_by_domain": org_by_domain,
            "org_by_type": org_by_type
        }
        st.success("Download settings saved!")


def _render_system_settings():
    """Render the "System" sub-tab: performance, logging and the reset button."""
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("<h4 class='section-subheader'>Memory & Performance</h4>", unsafe_allow_html=True)

        max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
                                   help="Maximum number of simultaneous downloads")

        memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
                                 help="Maximum memory to use for file processing")

        processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
                                       help="Number of threads to use for file processing")

    with col2:
        st.markdown("<h4 class='section-subheader'>Logs & Diagnostics</h4>", unsafe_allow_html=True)

        log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
                                 help="Detail level for application logs")

        save_debug_info = st.checkbox("Save Debug Information", value=False,
                                      help="Save detailed information about program execution")

        log_dir = st.text_input("Log Directory", value="logs",
                                help="Directory to save log files")

    if st.button("Apply System Settings"):
        import logging
        st.session_state.system_settings = {
            "max_concurrent": max_concurrent,
            "memory_limit": memory_limit,
            "processing_threads": processing_threads,
            "log_level": log_level,
            "save_debug_info": save_debug_info,
            "log_dir": log_dir
        }
        # The selectbox options are exactly the logging module's level names.
        logging.getLogger().setLevel(getattr(logging, log_level))
        st.success("System settings applied!")

    # Reset application button
    st.markdown("<h4 class='section-subheader'>Application Control</h4>", unsafe_allow_html=True)
    reset_col1, reset_col2 = st.columns([1, 3])
    with reset_col1:
        if st.button("Reset Application", use_container_width=True):
            # Snapshot the keys first: entries are deleted while looping.
            for key in list(st.session_state.keys()):
                if key != 'google_credentials':  # Preserve Google auth
                    del st.session_state[key]
            st.success("Application has been reset!")
            st.rerun()
    with reset_col2:
        st.info("This will clear all search results, downloaded files, and reset settings to defaults.")


def _render_config_tab():
    """Render the "Advanced Configuration" tab and its four sub-tabs."""
    st.markdown("<h2 class='section-subheader'>Advanced Settings</h2>", unsafe_allow_html=True)

    config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])

    with config_tabs[0]:
        _render_browser_settings()
    with config_tabs[1]:
        _render_proxy_settings()
    with config_tabs[2]:
        _render_download_settings()
    with config_tabs[3]:
        _render_system_settings()


def _render_help_tab():
    """Render the static "Help" tab."""
    st.markdown("<h2 class='section-subheader'>Help & Documentation</h2>", unsafe_allow_html=True)

    help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])

    with help_tabs[0]:
        st.markdown("""
        ### Getting Started

        1. **Enter a URL** on the Search & Download tab
        2. Select a **Search Method**:
           - **Deep Search**: Thorough but slower
           - **Quick Search**: Fast but may miss some files
           - **Exam Site Mode**: Optimized for educational resource sites
        3. Click **Start Search** to find downloadable files
        4. Select files you want to download
        5. Click **Download Selected Files**

        #### Using Different Modes

        Select a mode from the sidebar to optimize the tool for different use cases:

        - **Standard Mode**: Balanced for general use
        - **Education Mode**: Optimized for finding academic materials
        - **Research Mode**: Better for research papers and datasets
        - **Media Mode**: Enhanced for finding images, videos, and audio

        For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
        """)

    with help_tabs[1]:
        st.markdown("""
        ### Advanced Features

        - **Local File Search**: Upload files and search through their content using the enhanced RAG search
        - **Custom Extensions**: Specify additional file types to look for beyond the default set
        - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
        - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
        - **Google Drive Integration**: Upload downloaded files directly to your Google Drive

        #### Search Tips

        - For educational sites, include specific terms like "exam", "test", "paper" in the URL
        - When using Local File Search, try different variations of your query for better results
        - Use filtering and sorting options to find the most relevant files quickly

        #### File Organization

        You can configure automatic file organization in the Advanced Configuration tab:

        - **Organize by Domain**: Creates folders based on the source website
        - **Organize by File Type**: Separates files into folders by their extension
        - **Auto-Rename**: Prevents overwriting existing files with same names
        """)

    with help_tabs[2]:
        st.markdown("""
        ### Troubleshooting

        #### Common Issues

        - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
        - **Downloads failing**: Check if the site requires authentication or uses captchas
        - **Slow performance**: Reduce search depth or disable stealth mode for faster results
        - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings

        #### Captcha Issues

        Some websites use captchas to prevent automated access. If you encounter captchas:

        1. Try using a different proxy
        2. Enable "Handle Captchas Automatically" for simple captchas
        3. For complex captchas, you may need to manually access the site first

        #### Proxy Problems

        If you're having issues with proxies:

        1. Verify your proxy is working with an external tool
        2. Check that you've entered the correct format (http://host:port)
        3. Some websites may block known proxy IPs

        #### Memory Usage

        If the application is using too much memory:

        1. Reduce the "Memory Limit" in System settings
        2. Process fewer files at once
        3. Use lower search depth values
        """)

    with help_tabs[3]:
        st.markdown("""
        ### About This Tool

        **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.

        #### Key Features

        - **Smart Discovery**: Finds downloadable files even when they're not directly linked
        - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
        - **Educational Focus**: Specialized detection for exam papers and academic resources
        - **Stealth Capabilities**: Avoids detection by anti-scraping measures

        #### Technical Details

        This tool uses:

        - **Playwright**: For browser automation and stealth capabilities
        - **Sentence Transformers**: For AI-powered semantic search
        - **Streamlit**: For the user interface
        - **Google Drive API**: For cloud integration

        #### Credits

        Created with Python, Streamlit, Playwright, and various AI libraries.

        For issues or suggestions, please contact the developer.

        Version 2.0 - March 2025
        """)


def _reset_search_results():
    """Clear stored search results, download bookkeeping and any preset URL."""
    st.session_state.files = []
    st.session_state.downloaded_paths = []
    st.session_state.download_complete = False
    if 'preset_url' in st.session_state:
        st.session_state.preset_url = ''


def _execute_search(url, search_method, custom_extensions, timeout):
    """Run a file search for *url* and store the result in ``st.session_state.files``.

    Args:
        url: Page to start from.
        search_method: One of the "Search Method" selectbox options.
        custom_extensions: Comma-separated extensions as typed by the user.
        timeout: Value of the "Timeout (seconds)" slider, forwarded to
            ``deep_search``.
    """
    _reset_search_results()

    custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()]
    sublink_limit = 5000 if search_method == "Deep Search" else 1000
    is_exam_site = search_method == "Exam Site Mode"

    async def run_search():
        async with DownloadManager(
            use_proxy=st.session_state.use_proxy,
            proxy=st.session_state.proxy_string,
            use_stealth=st.session_state.stealth_mode
        ) as manager:
            if not is_exam_site:
                # Use general search method
                st.session_state.files = await manager.deep_search(
                    url, custom_ext_list, sublink_limit, timeout)
                return

            # Exam sites: collect the exam links, then scan each one.
            st.session_state.keep_progress = True
            progress_text = st.empty()
            progress_bar = st.progress(0)
            try:
                edu_links = await manager.get_edu_exam_links(url)
                all_files = []
                total = len(edu_links)

                for position, link in enumerate(edu_links, start=1):
                    progress_text.text(f"Processing exam link {position}/{total}: {link}")
                    progress_bar.progress(position / max(1, total))

                    all_files.extend(
                        await manager.extract_downloadable_files(link, custom_ext_list))

                st.session_state.files = all_files
            finally:
                # Always remove the progress UI and clear the flag, even if
                # the crawl raised part-way through.
                progress_text.empty()
                progress_bar.empty()
                st.session_state.keep_progress = False

    asyncio.run(run_search())


def main():
    """Build the Streamlit page and react to the search / clear buttons.

    Sets up session state, styling and the sidebar, renders the header and
    the four top-level tabs, then handles a pending search or clear request.
    """
    initialize_session_state()
    setup_ui()
    create_sidebar()

    # Header section
    title_col, icon_col = st.columns([5, 1])
    with title_col:
        st.markdown("<h1 class='main-header'>Advanced File Downloader</h1>", unsafe_allow_html=True)
    with icon_col:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)

    # Unknown modes fall back to the Standard description instead of raising KeyError.
    description = _MODE_DESCRIPTIONS.get(
        st.session_state.mode, _MODE_DESCRIPTIONS[_FALLBACK_MODE])
    st.markdown(f"<p class='info-text'>{description}</p>", unsafe_allow_html=True)

    # Main tabs
    tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])

    with tabs[0]:
        search = _render_search_tab()
    with tabs[1]:
        _render_local_search_tab()
    with tabs[2]:
        _render_config_tab()
    with tabs[3]:
        _render_help_tab()

    # Handle search button
    if search["search_button"]:
        url = search["url"].strip()
        if url:
            _execute_search(url, search["search_method"],
                            search["custom_extensions"], search["timeout"])
            st.rerun()
        else:
            st.warning("Please enter a URL to search.")

    # Handle clear button
    if search["clear_button"]:
        _reset_search_results()
        st.rerun()


# Entry point
if __name__ == "__main__":
    main()
|