|
import streamlit as st |
|
import os |
|
import asyncio |
|
import mimetypes |
|
from app.utils import create_zip_file, humanize_file_size, show_user_friendly_error |
|
from app.download_manager import DownloadManager |
|
from app.rag_search import EnhancedRAGSearch |
|
from app.google_drive import ( |
|
get_google_auth_url, exchange_code_for_credentials, |
|
google_drive_upload, create_drive_folder |
|
) |
|
import googleapiclient.discovery |
|
|
|
def setup_ui():
    """Inject the application's custom stylesheet into the Streamlit page.

    The CSS styles the tab strip, makes file-uploader and regular buttons
    full width, and defines the helper classes ``main-header``,
    ``section-subheader``, ``info-text``, ``result-card``,
    ``sidebar-header`` and ``sidebar-section``.
    """
    stylesheet = """
    <style>
    .stTabs [data-baseweb="tab-list"] {
        gap: 10px;
    }
    .stTabs [data-baseweb="tab"] {
        height: 50px;
        white-space: pre-wrap;
        border-radius: 4px 4px 0px 0px;
        padding: 10px 16px;
        background-color: #f0f2f6;
    }
    .stTabs [aria-selected="true"] {
        background-color: #ffffff !important;
        border-bottom: 2px solid #4c78a8;
    }
    .stFileUploader > div > div > button {
        width: 100%;
    }
    .main-header {
        font-size: 2.5rem;
        font-weight: 700;
        margin-bottom: 10px;
    }
    .section-subheader {
        font-size: 1.3rem;
        font-weight: 600;
        margin-top: 20px;
        margin-bottom: 10px;
    }
    .info-text {
        color: #6c757d;
        font-size: 0.9rem;
    }
    .stButton>button {
        width: 100%;
    }
    .result-card {
        background-color: #f8f9fa;
        border-radius: 6px;
        padding: 16px;
        margin-bottom: 12px;
        border-left: 4px solid #4c78a8;
    }
    .sidebar-header {
        font-size: 1.2rem;
        font-weight: 600;
        margin-bottom: 10px;
    }
    .sidebar-section {
        margin-bottom: 20px;
    }
    </style>
    """
    # Raw HTML has to be explicitly allowed for the <style> element to be emitted.
    st.markdown(stylesheet, unsafe_allow_html=True)
|
|
|
# Modes offered by the sidebar radio, in display order.
_SIDEBAR_MODES = ["Standard", "Education Mode", "Research Mode", "Media Mode"]

# Presets applied when the user switches mode: (custom_extensions, prioritize_pdfs).
# "Standard" has no entry, so switching to it leaves the current settings untouched.
_MODE_PRESETS = {
    "Education Mode": (".pdf,.doc,.docx,.ppt,.pptx", True),
    "Research Mode": (".pdf,.txt,.csv,.json,.xlsx", True),
    "Media Mode": (".jpg,.png,.mp3,.mp4,.avi,.mov", False),
}

# Quick-access buttons shown in Education Mode: (label, preset_url, search_method).
_EDUCATION_QUICK_LINKS = (
    ("Past Exam Papers", "https://pastpapers.example.edu", "Exam Site Mode"),
    ("Open Course Materials", "https://opencourseware.example.edu", "Deep Search"),
    ("Research Papers", "https://papers.example.org", "Deep Search"),
)


def create_sidebar():
    """Render the sidebar: mode selector, quick settings, Drive panel and status.

    Reads ``mode``, ``stealth_mode``, ``use_proxy`` and ``proxy_string`` from
    ``st.session_state`` by attribute access, so they must be initialised
    before this function is called. Writes those keys back when the matching
    widget changes, and sets ``custom_extensions`` / ``prioritize_pdfs`` when
    the mode changes, and ``preset_url`` / ``search_method`` when a
    quick-access button is pressed.
    """
    with st.sidebar:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50)
        st.markdown("<p class='sidebar-header'>Advanced File Downloader</p>", unsafe_allow_html=True)

        # --- Mode selection ---
        st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True)
        st.markdown("<p class='sidebar-header'>Mode</p>", unsafe_allow_html=True)

        stored_mode = st.session_state.mode
        # An unrecognised stored mode falls back to the first entry rather
        # than raising ValueError from list.index().
        mode_index = _SIDEBAR_MODES.index(stored_mode) if stored_mode in _SIDEBAR_MODES else 0
        mode = st.radio(
            "Select Mode",
            _SIDEBAR_MODES,
            label_visibility="collapsed",
            index=mode_index,
            horizontal=False,
        )

        if mode != st.session_state.mode:
            st.session_state.mode = mode
            preset = _MODE_PRESETS.get(mode)
            if preset is not None:
                st.session_state.custom_extensions, st.session_state.prioritize_pdfs = preset

        st.markdown(f"<div class='info-text'>Current: <b>{st.session_state.mode}</b></div>", unsafe_allow_html=True)
        st.markdown("</div>", unsafe_allow_html=True)

        # --- Quick settings ---
        st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True)
        st.markdown("<p class='sidebar-header'>Quick Settings</p>", unsafe_allow_html=True)

        stealth_mode = st.checkbox("Stealth Mode", value=st.session_state.stealth_mode)
        if stealth_mode != st.session_state.stealth_mode:
            st.session_state.stealth_mode = stealth_mode

        use_proxy = st.checkbox("Use Proxy", value=st.session_state.use_proxy)
        if use_proxy != st.session_state.use_proxy:
            st.session_state.use_proxy = use_proxy

        if use_proxy:
            proxy_string = st.text_input(
                "Proxy Address",
                placeholder="e.g., http://user:pass@host:port",
                value=st.session_state.proxy_string or "",
            )
            if proxy_string != st.session_state.proxy_string:
                st.session_state.proxy_string = proxy_string

        st.markdown("</div>", unsafe_allow_html=True)

        # --- Google Drive ---
        show_google_drive_integration()

        # --- Education-mode shortcuts ---
        if st.session_state.mode == "Education Mode":
            st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True)
            st.markdown("<p class='sidebar-header'>Quick Access</p>", unsafe_allow_html=True)
            st.markdown("<div class='info-text'>Common Educational Sites</div>", unsafe_allow_html=True)

            for label, url, search_method in _EDUCATION_QUICK_LINKS:
                if st.button(label):
                    st.session_state.preset_url = url
                    st.session_state.search_method = search_method
                    st.rerun()

            st.markdown("</div>", unsafe_allow_html=True)

        # --- System status ---
        st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True)
        st.markdown("<p class='sidebar-header'>System Status</p>", unsafe_allow_html=True)

        # The two indicators are static labels; nothing is probed here.
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("<div class='info-text'>Search</div>", unsafe_allow_html=True)
            st.markdown("<div style='color: green; font-weight: bold;'>Active</div>", unsafe_allow_html=True)
        with col2:
            st.markdown("<div class='info-text'>Browser</div>", unsafe_allow_html=True)
            st.markdown("<div style='color: green; font-weight: bold;'>Ready</div>", unsafe_allow_html=True)

        if st.button("Install Dependencies"):
            with st.spinner("Installing Playwright dependencies..."):
                install_playwright_dependencies()

        st.markdown("</div>", unsafe_allow_html=True)

        # --- Version footer ---
        st.markdown("<div class='sidebar-section' style='position: absolute; bottom: 20px; width: 90%;'>", unsafe_allow_html=True)
        st.markdown("<div class='info-text' style='text-align: center;'>Version 2.0 • March 2025</div>", unsafe_allow_html=True)
        st.markdown("</div>", unsafe_allow_html=True)
|
|
|
def show_google_drive_integration():
    """Render the Google Drive connect / disconnect panel.

    When ``st.session_state.google_credentials`` is truthy, shows the
    connected state, a text input bound to ``st.session_state.drive_folder``
    (defaulting to "File Downloader") and a disconnect button.

    Otherwise shows a connect button. Pressing it stores the authorisation
    URL in ``st.session_state.drive_auth_url`` so the link and the code input
    stay visible on later reruns; a submitted code is exchanged for
    credentials via ``exchange_code_for_credentials``.
    """
    st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True)
    st.markdown("<p class='sidebar-header'>Google Drive</p>", unsafe_allow_html=True)

    if st.session_state.google_credentials:
        st.success("✅ Connected")

        if 'drive_folder' in st.session_state:
            current_folder = st.session_state.drive_folder
        else:
            current_folder = "File Downloader"
        drive_folder = st.text_input("Drive Folder", value=current_folder)
        if 'drive_folder' not in st.session_state or drive_folder != st.session_state.drive_folder:
            st.session_state.drive_folder = drive_folder

        if st.button("Disconnect Drive"):
            st.session_state.google_credentials = None
            st.rerun()
    else:
        st.warning("⚠️ Not Connected")

        # st.button() is True only during the run triggered by the click.
        # Submitting the code below triggers another run, so the pending
        # authorisation must live in session state or the input would vanish
        # before the code could be processed.
        if st.button("Connect Google Drive"):
            st.session_state.drive_auth_url = get_google_auth_url()

        auth_url = st.session_state.drive_auth_url if 'drive_auth_url' in st.session_state else None
        if auth_url:
            st.markdown(f"[Click here to authorize]({auth_url})")
            auth_code = st.text_input("Enter authorization code:")

            if auth_code:
                with st.spinner("Connecting to Google Drive..."):
                    credentials, status_msg = exchange_code_for_credentials(auth_code)
                    if credentials:
                        st.session_state.google_credentials = credentials
                        # Authorisation finished; drop the pending URL.
                        st.session_state.drive_auth_url = None
                        st.success(status_msg)
                        st.rerun()
                    else:
                        st.error(status_msg)

    st.markdown("</div>", unsafe_allow_html=True)
|
|
|
def install_playwright_dependencies():
    """Install the system libraries, the Playwright package and Chromium.

    Sets ``PLAYWRIGHT_BROWSERS_PATH`` to ``~/.cache/ms-playwright`` and then
    runs, in order: ``apt-get update``, ``apt-get install`` for the shared
    libraries listed below, ``pip install playwright`` and
    ``playwright install chromium``. Each command runs with ``check=True``,
    so the first failing step aborts the sequence.

    The outcome is reported through ``st.success`` / ``st.error``; subprocess
    and OS errors are caught rather than raised.
    """
    import os
    import subprocess
    import sys

    # Use the interpreter running this app so the package lands in the same
    # environment; a bare `pip` / `python3` on PATH may be a different one.
    python = sys.executable or 'python3'

    packages = [
        'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
        'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
        'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0',
    ]
    commands = (
        ['apt-get', 'update', '-y'],
        ['apt-get', 'install', '-y', '--no-install-recommends', *packages],
        [python, '-m', 'pip', 'install', 'playwright'],
        [python, '-m', 'playwright', 'install', 'chromium'],
    )

    try:
        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")

        for command in commands:
            subprocess.run(command, check=True)

        st.success("Playwright dependencies installed successfully!")
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing executable (e.g. no apt-get) and permission errors.
        st.error(f"Error installing Playwright dependencies: {e}")
        st.info("You may need to manually install dependencies. Check console for details.")
|
|
|
# Byte multipliers for the unit suffixes understood by _size_in_bytes().
_SIZE_MULTIPLIERS = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}


def _size_in_bytes(size_str, unknown):
    """Convert a size such as ``"1.5 MB"`` to bytes, or return *unknown*.

    *unknown* is returned when the value is not a string, contains
    'Unknown', lacks a "<number> <unit>" shape, or uses an unrecognised unit.
    """
    if not isinstance(size_str, str) or 'Unknown' in size_str:
        return unknown
    parts = size_str.split()
    if len(parts) < 2:
        return unknown
    try:
        value = float(parts[0])
    except ValueError:
        return unknown
    multiplier = _SIZE_MULTIPLIERS.get(parts[1])
    if multiplier is None:
        return unknown
    return value * multiplier


def display_file_results(files):
    """Render the found files with filter, sort and per-file selection widgets.

    Args:
        files: Sequence of dicts; the keys ``'filename'``, ``'url'`` and
            ``'size'`` are read from each entry.

    Returns:
        ``None`` when *files* is empty. Otherwise a tuple
        ``(selected_files, displayed_files)``: the indices of the ticked
        rows and of all rows that passed the filters. Indices are positions
        in *files* as passed in, regardless of the sort order on screen.
    """
    import html

    if not files:
        return None

    st.markdown("<h3 class='section-subheader'>Found Files</h3>", unsafe_allow_html=True)

    filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1])
    with filter_col1:
        file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.")
    with filter_col2:
        sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"])
    with filter_col3:
        show_only_pdfs = st.checkbox("PDFs Only", value=False)

    # Sort (original_index, file) pairs so that widget keys and the returned
    # indices keep pointing at the caller's list after reordering.
    indexed_files = list(enumerate(files))
    if sort_option == "Name":
        indexed_files.sort(key=lambda pair: pair[1]['filename'])
    elif sort_option == "Size (Largest)":
        # Unknown sizes count as 0 so they sink to the bottom.
        indexed_files.sort(key=lambda pair: _size_in_bytes(pair[1]['size'], 0), reverse=True)
    elif sort_option == "Size (Smallest)":
        # Unknown sizes count as infinity so they also sink to the bottom.
        indexed_files.sort(key=lambda pair: _size_in_bytes(pair[1]['size'], float('inf')))

    needle = file_filter.lower() if file_filter else ""
    selected_files = []
    displayed_files = []

    with st.container():
        for original_index, file in indexed_files:
            name_lower = file['filename'].lower()
            if needle and needle not in name_lower:
                continue
            if show_only_pdfs and not name_lower.endswith('.pdf'):
                continue

            displayed_files.append(original_index)
            with st.container():
                col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1])
                with col1:
                    # A real but hidden label avoids Streamlit's empty-label warning.
                    selected = st.checkbox(
                        f"Select {file['filename']}",
                        key=f"select_{original_index}",
                        value=True,
                        label_visibility="collapsed",
                    )
                    if selected:
                        selected_files.append(original_index)
                with col2:
                    file_icon = get_file_icon(file['filename'])
                    st.markdown(f"**{file_icon} {file['filename']}**")
                    url = file['url']
                    shown_url = url if len(url) <= 60 else f"{url[:60]}..."
                    # The URL is rendered as raw HTML, so it must be escaped.
                    st.markdown(
                        f"<span class='info-text'>{html.escape(shown_url)}</span>",
                        unsafe_allow_html=True,
                    )
                with col3:
                    st.markdown(f"**Size:** {file['size']}")
                with col4:
                    # The button's return value is not used here.
                    st.button("Preview", key=f"preview_{original_index}")

                st.divider()

        if not displayed_files:
            st.info("No files match your current filters. Try adjusting your search criteria.")

    return selected_files, displayed_files
|
|
|
# Icon fallback for extensions that match no entry below.
_DEFAULT_FILE_ICON = "📄"

# (extensions, icon) pairs checked in order; the first match wins.
# NOTE(review): the picture and video glyphs were recoverable from the
# mis-encoded literals; the generic, PDF, Word, spreadsheet and audio glyphs
# were not and are best guesses - confirm against the intended icons.
_FILE_ICONS = (
    (('.pdf',), "📝"),
    (('.doc', '.docx'), "📋"),
    (('.xls', '.xlsx'), "📊"),
    (('.ppt', '.pptx'), "🖼️"),
    (('.jpg', '.png', '.gif'), "🖼️"),
    (('.mp3', '.wav'), "🔊"),
    (('.mp4', '.avi', '.mov'), "🎬"),
)


def get_file_icon(filename):
    """Return an emoji icon for *filename* based on its extension.

    The comparison is case-insensitive. A filename whose extension is not in
    the table gets the generic document icon.
    """
    lowered = filename.lower()
    for extensions, icon in _FILE_ICONS:
        # str.endswith accepts a tuple of suffixes.
        if lowered.endswith(extensions):
            return icon
    return _DEFAULT_FILE_ICON
|
|
|
def handle_downloads(selected_files, download_dir, download_option, download_col1):
    """Download the selected files and offer them to the browser.

    Args:
        selected_files: Indices into ``st.session_state.files``. Nothing
            happens when this is empty.
        download_dir: Directory handed to ``DownloadManager.download_file``
            and to ``create_zip_file``.
        download_option: ``"ZIP Archive"`` produces a single ZIP download
            button; any other value produces one button per file.
        download_col1: Context manager (used via ``with``) in which the
            progress widgets and download buttons are rendered.

    Resets ``st.session_state.downloaded_paths`` and fills it with the paths
    of the files that downloaded, then sets
    ``st.session_state.download_complete`` to True.
    """
    if not selected_files:
        return

    with download_col1:
        download_status = st.empty()
        download_progress = st.progress(0)

        async def run_download():
            async with DownloadManager(
                use_proxy=st.session_state.use_proxy,
                proxy=st.session_state.proxy_string,
                use_stealth=st.session_state.stealth_mode
            ) as manager:
                files_to_download = [st.session_state.files[i] for i in selected_files]
                total = len(files_to_download)

                st.session_state.downloaded_paths = []

                for position, file_info in enumerate(files_to_download):
                    download_status.text(f"Downloading {position + 1}/{total}: {file_info['filename']}")
                    download_progress.progress(position / total)

                    downloaded_path = await manager.download_file(
                        file_info,
                        download_dir,
                        get_domain(file_info['url'])
                    )

                    # A falsy result means this file could not be fetched.
                    if downloaded_path:
                        st.session_state.downloaded_paths.append(downloaded_path)

                download_progress.progress(1.0)
                download_status.text(
                    f"Downloaded {len(st.session_state.downloaded_paths)}/{total} files successfully!"
                )
                st.session_state.download_complete = True

        asyncio.run(run_download())

        if not st.session_state.download_complete:
            return

        downloaded_paths = st.session_state.downloaded_paths
        if not downloaded_paths:
            # Do not report success when every download failed.
            st.error("No files could be downloaded.")
            return

        st.success(f"✅ Downloaded {len(downloaded_paths)} files successfully!")

        if download_option == "ZIP Archive":
            zip_path = create_zip_file(downloaded_paths, download_dir)
            with open(zip_path, "rb") as f:
                zip_content = f.read()
            st.download_button(
                "📦 Download ZIP Archive",
                zip_content,
                file_name=os.path.basename(zip_path),
                mime="application/zip",
            )
        else:
            st.markdown("<h4>Download Files</h4>", unsafe_allow_html=True)

            cols = st.columns(3)
            for idx, path in enumerate(downloaded_paths):
                name = os.path.basename(path)
                # Files are read only on this branch; the ZIP branch does not need them.
                with open(path, "rb") as f:
                    content = f.read()
                mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream'
                with cols[idx % 3]:
                    st.download_button(
                        f"📄 {name}",
                        content,
                        file_name=name,
                        mime=mime_type,
                        # The index keeps keys unique when two paths share a basename.
                        key=f"dl_{idx}_{name}",
                        use_container_width=True,
                    )
|
|
|
def handle_google_drive_upload(selected_files):
    """Upload every path in ``st.session_state.downloaded_paths`` to Google Drive.

    Does nothing unless both ``st.session_state.google_credentials`` and
    ``st.session_state.downloaded_paths`` are truthy. The target folder is
    ``st.session_state.drive_folder`` (default "File Downloader"); it is
    looked up by name and created via ``create_drive_folder`` when no
    non-trashed folder of that name is found.

    Args:
        selected_files: Not used by this function; kept so existing callers
            continue to work.
    """
    if not st.session_state.google_credentials or not st.session_state.downloaded_paths:
        return

    paths = st.session_state.downloaded_paths
    total = len(paths)
    uploaded_count = 0

    with st.spinner("Uploading to Google Drive..."):
        drive_service = googleapiclient.discovery.build(
            "drive", "v3", credentials=st.session_state.google_credentials
        )

        folder_name = st.session_state.drive_folder if 'drive_folder' in st.session_state else "File Downloader"

        # The folder name is user input: escape backslashes and single quotes
        # so a name such as "Bob's files" does not break the query string.
        escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
        query = f"name='{escaped_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
        results = drive_service.files().list(q=query, spaces='drive', fields='files(id)').execute()
        items = results.get('files', [])

        if items:
            # Several folders may share the name; the first match is used.
            folder_id = items[0]['id']
        else:
            folder_id = create_drive_folder(drive_service, folder_name)

        upload_progress = st.progress(0)
        status_text = st.empty()

        for position, path in enumerate(paths):
            status_text.text(f"Uploading {position + 1}/{total}: {os.path.basename(path)}")
            upload_progress.progress(position / total)

            result = google_drive_upload(path, st.session_state.google_credentials, folder_id)
            # Any string not starting with "Error" is counted as a success.
            if isinstance(result, str) and not result.startswith("Error"):
                uploaded_count += 1

        upload_progress.progress(1.0)
        status_text.text(f"Uploaded {uploaded_count}/{total} files to Google Drive folder '{folder_name}'")

    # Report the real outcome instead of unconditional success.
    if uploaded_count == total:
        st.success("✅ Files uploaded to Google Drive successfully!")
    elif uploaded_count:
        st.warning(f"Uploaded {uploaded_count} of {total} files to Google Drive; the remaining uploads failed.")
    else:
        st.error("No files could be uploaded to Google Drive.")
|
|
|
def get_domain(url):
    """Return the network-location (``netloc``) component of *url*.

    The value is exactly what ``urllib.parse.urlparse`` reports, so it keeps
    any ``user:pass@`` prefix and ``:port`` suffix, and is empty when the URL
    has no ``//`` authority section.
    """
    from urllib.parse import urlparse

    return urlparse(url).netloc