# craw_web / ui.py
# euler314's picture
# Rename app/ui.py to ui.py
# 1b5e738 verified
# raw
# history blame
# 19.8 kB
import streamlit as st
import os
import asyncio
import mimetypes
from app.utils import create_zip_file, humanize_file_size, show_user_friendly_error
from app.download_manager import DownloadManager
from app.rag_search import EnhancedRAGSearch
from app.google_drive import (
get_google_auth_url, exchange_code_for_credentials,
google_drive_upload, create_drive_folder
)
import googleapiclient.discovery
def setup_ui():
    """Inject the application's global stylesheet into the Streamlit page.

    Emits one ``<style>`` element through ``st.markdown`` with
    ``unsafe_allow_html=True``. The rules restyle the tab bar, stretch
    buttons to full width and define helper classes such as ``info-text``,
    ``section-subheader``, ``sidebar-header`` and ``sidebar-section``.
    """
    stylesheet = """
<style>
.stTabs [data-baseweb="tab-list"] {
gap: 10px;
}
.stTabs [data-baseweb="tab"] {
height: 50px;
white-space: pre-wrap;
border-radius: 4px 4px 0px 0px;
padding: 10px 16px;
background-color: #f0f2f6;
}
.stTabs [aria-selected="true"] {
background-color: #ffffff !important;
border-bottom: 2px solid #4c78a8;
}
.stFileUploader > div > div > button {
width: 100%;
}
.main-header {
font-size: 2.5rem;
font-weight: 700;
margin-bottom: 10px;
}
.section-subheader {
font-size: 1.3rem;
font-weight: 600;
margin-top: 20px;
margin-bottom: 10px;
}
.info-text {
color: #6c757d;
font-size: 0.9rem;
}
.stButton>button {
width: 100%;
}
.result-card {
background-color: #f8f9fa;
border-radius: 6px;
padding: 16px;
margin-bottom: 12px;
border-left: 4px solid #4c78a8;
}
.sidebar-header {
font-size: 1.2rem;
font-weight: 600;
margin-bottom: 10px;
}
.sidebar-section {
margin-bottom: 20px;
}
</style>
"""
    st.markdown(stylesheet, unsafe_allow_html=True)
def create_sidebar():
    """Render the sidebar: mode picker, quick settings, Drive panel, status.

    Reads and updates these ``st.session_state`` entries: ``mode``,
    ``custom_extensions``, ``prioritize_pdfs``, ``stealth_mode``,
    ``use_proxy``, ``proxy_string``, ``preset_url`` and ``search_method``.
    The entries that are read (``mode``, ``stealth_mode``, ``use_proxy``,
    ``proxy_string``) must already exist when this function runs.
    """
    state = st.session_state
    section_open = "<div class='sidebar-section'>"
    section_close = "</div>"

    mode_names = ["Standard", "Education Mode", "Research Mode", "Media Mode"]
    # Settings applied when the user switches to a mode:
    # (custom_extensions, prioritize_pdfs). "Standard" changes nothing.
    mode_presets = {
        "Education Mode": (".pdf,.doc,.docx,.ppt,.pptx", True),
        "Research Mode": (".pdf,.txt,.csv,.json,.xlsx", True),
        "Media Mode": (".jpg,.png,.mp3,.mp4,.avi,.mov", False),
    }
    # Quick-access buttons: (label, preset URL, search method).
    quick_links = [
        ("Past Exam Papers", "https://pastpapers.example.edu", "Exam Site Mode"),
        ("Open Course Materials", "https://opencourseware.example.edu", "Deep Search"),
        ("Research Papers", "https://papers.example.org", "Deep Search"),
    ]
    # System status cells: (caption, value), one per column.
    status_cells = [("Search", "Active"), ("Browser", "Ready")]

    with st.sidebar:
        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50)
        st.markdown("<p class='sidebar-header'>Advanced File Downloader</p>", unsafe_allow_html=True)

        # --- Mode selection ---
        st.markdown(section_open, unsafe_allow_html=True)
        st.markdown("<p class='sidebar-header'>Mode</p>", unsafe_allow_html=True)
        chosen_mode = st.radio(
            "Select Mode",
            mode_names,
            label_visibility="collapsed",
            index=mode_names.index(state.mode),
            horizontal=False,
        )
        if chosen_mode != state.mode:
            state.mode = chosen_mode
            # Only a real switch re-applies the preset for the new mode.
            if chosen_mode in mode_presets:
                extensions, pdf_first = mode_presets[chosen_mode]
                state.custom_extensions = extensions
                state.prioritize_pdfs = pdf_first
        st.markdown(f"<div class='info-text'>Current: <b>{state.mode}</b></div>", unsafe_allow_html=True)
        st.markdown(section_close, unsafe_allow_html=True)

        # --- Quick settings ---
        st.markdown(section_open, unsafe_allow_html=True)
        st.markdown("<p class='sidebar-header'>Quick Settings</p>", unsafe_allow_html=True)
        stealth_checked = st.checkbox("Stealth Mode", value=state.stealth_mode)
        if stealth_checked != state.stealth_mode:
            state.stealth_mode = stealth_checked
        proxy_checked = st.checkbox("Use Proxy", value=state.use_proxy)
        if proxy_checked != state.use_proxy:
            state.use_proxy = proxy_checked
        if proxy_checked:
            proxy_value = st.text_input(
                "Proxy Address",
                placeholder="e.g., http://user:pass@host:port",
                value=state.proxy_string or "",
            )
            if proxy_value != state.proxy_string:
                state.proxy_string = proxy_value
        st.markdown(section_close, unsafe_allow_html=True)

        # --- Google Drive integration ---
        show_google_drive_integration()

        # --- Preset buttons, shown in Education Mode only ---
        if state.mode == "Education Mode":
            st.markdown(section_open, unsafe_allow_html=True)
            st.markdown("<p class='sidebar-header'>Quick Access</p>", unsafe_allow_html=True)
            st.markdown("<div class='info-text'>Common Educational Sites</div>", unsafe_allow_html=True)
            for label, url, method in quick_links:
                if st.button(label):
                    state.preset_url = url
                    state.search_method = method
                    st.rerun()
            st.markdown(section_close, unsafe_allow_html=True)

        # --- Tool status ---
        st.markdown(section_open, unsafe_allow_html=True)
        st.markdown("<p class='sidebar-header'>System Status</p>", unsafe_allow_html=True)
        for column, (caption, value) in zip(st.columns(2), status_cells):
            with column:
                st.markdown(f"<div class='info-text'>{caption}</div>", unsafe_allow_html=True)
                st.markdown(
                    f"<div style='color: green; font-weight: bold;'>{value}</div>",
                    unsafe_allow_html=True,
                )
        if st.button("Install Dependencies"):
            with st.spinner("Installing Playwright dependencies..."):
                install_playwright_dependencies()
        st.markdown(section_close, unsafe_allow_html=True)

        # --- App info footer ---
        st.markdown("<div class='sidebar-section' style='position: absolute; bottom: 20px; width: 90%;'>", unsafe_allow_html=True)
        st.markdown("<div class='info-text' style='text-align: center;'>Version 2.0 β€’ March 2025</div>", unsafe_allow_html=True)
        st.markdown(section_close, unsafe_allow_html=True)
def show_google_drive_integration():
    """Render the Google Drive panel: connect flow or connected-state controls.

    When ``st.session_state.google_credentials`` is set, shows the target
    folder input (stored in ``st.session_state.drive_folder``) and a
    disconnect button. Otherwise offers a connect button that shows the
    authorization link and exchanges the pasted code for credentials.
    """
    state = st.session_state
    st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True)
    st.markdown("<p class='sidebar-header'>Google Drive</p>", unsafe_allow_html=True)

    if not state.google_credentials:
        st.warning("⚠️ Not Connected")
        if st.button("Connect Google Drive"):
            authorize_link = get_google_auth_url()
            st.markdown(f"[Click here to authorize]({authorize_link})")
            # NOTE(review): a button is only True on the rerun caused by its
            # click, so this input may disappear as soon as a code is typed;
            # confirm the intended flow.
            pasted_code = st.text_input("Enter authorization code:")
            if pasted_code:
                with st.spinner("Connecting to Google Drive..."):
                    creds, message = exchange_code_for_credentials(pasted_code)
                    if not creds:
                        st.error(message)
                    else:
                        state.google_credentials = creds
                        st.success(message)
                        st.rerun()
    else:
        st.success("βœ… Connected")
        # Pre-fill with the stored folder name, or the default on first use.
        if 'drive_folder' in state:
            initial_folder = state.drive_folder
        else:
            initial_folder = "File Downloader"
        folder_input = st.text_input("Drive Folder", value=initial_folder)
        if 'drive_folder' not in state or folder_input != state.drive_folder:
            state.drive_folder = folder_input
        if st.button("Disconnect Drive"):
            state.google_credentials = None
            st.rerun()

    st.markdown("</div>", unsafe_allow_html=True)
def install_playwright_dependencies():
    """Install system libraries, the Playwright package and Chromium.

    Steps, each run with ``check=True`` so a non-zero exit aborts the rest:

    1. ``apt-get update`` and ``apt-get install`` of the browser's shared
       libraries (requires ``apt-get`` and sufficient privileges).
    2. ``pip install playwright``.
    3. ``playwright install chromium``.

    The outcome is reported through ``st.success`` / ``st.error``; no
    exception propagates to the caller.
    """
    import os
    import subprocess
    import sys

    system_packages = [
        'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
        'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
        'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0',
    ]
    # Run pip/playwright with the interpreter that is running this app, so the
    # package lands in the active environment rather than whatever ``pip`` or
    # ``python3`` happens to be first on PATH.
    python = sys.executable or 'python3'

    try:
        # Set environment variable for Playwright browsers path
        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
        # Install system dependencies
        subprocess.run(['apt-get', 'update', '-y'], check=True)
        subprocess.run(
            ['apt-get', 'install', '-y', '--no-install-recommends'] + system_packages,
            check=True,
        )
        # Install Playwright and its Chromium build
        subprocess.run([python, '-m', 'pip', 'install', 'playwright'], check=True)
        subprocess.run([python, '-m', 'playwright', 'install', 'chromium'], check=True)
        st.success("Playwright dependencies installed successfully!")
    except Exception as e:
        # Deliberately broad: this is a UI boundary and any failure (missing
        # apt-get, permission error, non-zero exit) is shown to the user.
        st.error(f"Error installing Playwright dependencies: {e}")
        st.info("You may need to manually install dependencies. Check console for details.")
def _size_in_bytes(size_str, unknown):
    """Convert a humanized size such as ``"1.5 MB"`` to a byte count.

    Returns ``unknown`` when the value contains ``"Unknown"``, is not a
    string, or cannot be parsed. An unrecognised unit yields ``0``.
    """
    multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
    try:
        if 'Unknown' in size_str:
            return unknown
        number, unit = size_str.split(' ')[:2]
        return float(number) * multipliers.get(unit, 0)
    except (TypeError, ValueError, AttributeError):
        return unknown


def display_file_results(files):
    """Display file results with filtering, sorting and per-file selection.

    Args:
        files: sequence of dicts with ``'filename'``, ``'url'`` and ``'size'``
            (a humanized string such as ``"12.3 KB"``).

    Returns:
        ``None`` when ``files`` is empty. Otherwise a tuple
        ``(selected_files, displayed_files)`` of index lists. The indices are
        positions in the ``files`` argument, regardless of the sort order
        chosen in the UI, so callers can index the original list with them.
    """
    if not files:
        return

    import html  # local import: only needed to escape the URL preview

    st.markdown("<h3 class='section-subheader'>Found Files</h3>", unsafe_allow_html=True)

    # File filtering options
    filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1])
    with filter_col1:
        file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.")
    with filter_col2:
        sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"])
    with filter_col3:
        show_only_pdfs = st.checkbox("PDFs Only", value=False)

    # Sort (original_index, file) pairs so every row keeps the index it has in
    # ``files``; sorting bare copies would make the returned indices point at
    # the wrong entries whenever a non-default order is selected.
    ordered = list(enumerate(files))
    if sort_option == "Name":
        ordered.sort(key=lambda pair: pair[1]['filename'])
    elif sort_option == "Size (Largest)":
        # Unknown sizes count as 0 so they sink to the bottom.
        ordered.sort(key=lambda pair: _size_in_bytes(pair[1]['size'], 0), reverse=True)
    elif sort_option == "Size (Smallest)":
        # Unknown sizes count as infinity so they also sink to the bottom.
        ordered.sort(key=lambda pair: _size_in_bytes(pair[1]['size'], float('inf')))

    needle = file_filter.lower() if file_filter else ""

    # File list with selection
    selected_files = []
    displayed_files = []
    file_container = st.container()
    with file_container:
        for original_index, file in ordered:
            filename = file['filename']
            lowered = filename.lower()
            # Apply filters
            if needle and needle not in lowered:
                continue
            if show_only_pdfs and not lowered.endswith('.pdf'):
                continue
            displayed_files.append(original_index)
            with st.container():
                col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1])
                with col1:
                    # Streamlit discourages empty labels; hide a real one instead.
                    selected = st.checkbox(
                        f"Select {filename}",
                        key=f"select_{original_index}",
                        value=True,
                        label_visibility="collapsed",
                    )
                    if selected:
                        selected_files.append(original_index)
                with col2:
                    file_icon = get_file_icon(filename)
                    st.markdown(f"**{file_icon} {filename}**")
                    # The URL comes from crawled pages; escape it before
                    # embedding it in raw HTML.
                    url_preview = html.escape(file['url'][:60])
                    st.markdown(f"<span class='info-text'>{url_preview}...</span>", unsafe_allow_html=True)
                with col3:
                    st.markdown(f"**Size:** {file['size']}")
                with col4:
                    st.button("Preview", key=f"preview_{original_index}")
                st.divider()
        if not displayed_files:
            st.info("No files match your current filters. Try adjusting your search criteria.")
    return selected_files, displayed_files
def get_file_icon(filename):
    """Pick the icon string shown next to a file name, based on its extension.

    Matching is case-insensitive. The first matching group wins; names with
    no recognised extension get the generic document icon.
    """
    # (extensions, icon) pairs, checked in order.
    icon_table = (
        (('.pdf',), "πŸ“"),
        (('.doc', '.docx'), "πŸ“‹"),
        (('.xls', '.xlsx'), "πŸ“Š"),
        (('.ppt', '.pptx'), "πŸ–ΌοΈ"),
        (('.jpg', '.png', '.gif'), "πŸ–ΌοΈ"),
        (('.mp3', '.wav'), "πŸ”Š"),
        (('.mp4', '.avi', '.mov'), "🎬"),
    )
    lowered = filename.lower()
    for extensions, icon in icon_table:
        if lowered.endswith(extensions):
            return icon
    return "πŸ“„"
def handle_downloads(selected_files, download_dir, download_option, download_col1):
    """Download the selected files and offer them back to the user.

    Args:
        selected_files: indices into ``st.session_state.files``.
        download_dir: directory passed to the download manager and to
            ``create_zip_file``.
        download_option: ``"ZIP Archive"`` offers one zip download; any other
            value offers one download button per file.
        download_col1: Streamlit container that hosts the status text and
            progress bar.

    Side effects: resets and fills ``st.session_state.downloaded_paths`` and
    sets ``st.session_state.download_complete``. Returns ``None``.
    """
    if not selected_files:
        return

    state = st.session_state

    # Progress widgets live in the caller-supplied column.
    with download_col1:
        download_status = st.empty()
        download_progress = st.progress(0)

    async def run_download():
        async with DownloadManager(
            use_proxy=state.use_proxy,
            proxy=state.proxy_string,
            use_stealth=state.stealth_mode,
        ) as manager:
            files_to_download = [state.files[i] for i in selected_files]
            total = len(files_to_download)
            # Reset download paths
            state.downloaded_paths = []
            for position, file_info in enumerate(files_to_download):
                download_status.text(f"Downloading {position+1}/{total}: {file_info['filename']}")
                download_progress.progress(position / total)
                downloaded_path = await manager.download_file(
                    file_info,
                    download_dir,
                    get_domain(file_info['url']),
                )
                # A falsy result means this file failed; skip it.
                if downloaded_path:
                    state.downloaded_paths.append(downloaded_path)
            download_progress.progress(1.0)
            download_status.text(f"Downloaded {len(state.downloaded_paths)}/{total} files successfully!")
            state.download_complete = True

    # Run the download
    asyncio.run(run_download())

    # Show download results
    if not state.download_complete:
        return
    downloaded = state.downloaded_paths
    st.success(f"βœ… Downloaded {len(downloaded)} files successfully!")
    if not downloaded:
        return

    if download_option == "ZIP Archive":
        # Only the archive is read here; the individual files are not loaded
        # into memory because their bytes are not needed on this path.
        zip_path = create_zip_file(downloaded, download_dir)
        with open(zip_path, "rb") as f:
            zip_content = f.read()
        st.download_button("πŸ“¦ Download ZIP Archive",
                           zip_content,
                           file_name=os.path.basename(zip_path),
                           mime="application/zip")
        return

    # Show individual file download links in a three-column grid
    st.markdown("<h4>Download Files</h4>", unsafe_allow_html=True)
    cols = st.columns(3)
    for idx, path in enumerate(downloaded):
        name = os.path.basename(path)
        with open(path, "rb") as f:
            content = f.read()
        mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream'
        with cols[idx % 3]:
            st.download_button(
                f"πŸ“„ {name}",
                content,
                file_name=name,
                mime=mime_type,
                # Include the position: two downloads may share a basename,
                # and widget keys must be unique.
                key=f"dl_{idx}_{name}",
                use_container_width=True,
            )
def handle_google_drive_upload(selected_files):
    """Upload every downloaded file to the configured Google Drive folder.

    Does nothing unless ``st.session_state.google_credentials`` and
    ``st.session_state.downloaded_paths`` are both set. The target folder is
    ``st.session_state.drive_folder`` (default ``"File Downloader"``); it is
    looked up by name and created when no match exists.

    Args:
        selected_files: accepted for interface compatibility; not used. All
            paths in ``st.session_state.downloaded_paths`` are uploaded.
    """
    state = st.session_state
    if not state.google_credentials or not state.downloaded_paths:
        return

    paths = state.downloaded_paths
    total = len(paths)
    uploaded_count = 0

    with st.spinner("Uploading to Google Drive..."):
        drive_service = googleapiclient.discovery.build("drive", "v3", credentials=state.google_credentials)

        folder_name = state.drive_folder if 'drive_folder' in state else "File Downloader"
        # The folder name is user input placed inside a single-quoted Drive
        # query literal, so backslashes and quotes must be escaped.
        escaped_name = str(folder_name).replace("\\", "\\\\").replace("'", "\\'")
        query = f"name='{escaped_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
        results = drive_service.files().list(q=query, spaces='drive', fields='files(id)').execute()
        items = results.get('files', [])
        # Reuse the first matching folder, or create one if none exists.
        if items:
            folder_id = items[0]['id']
        else:
            folder_id = create_drive_folder(drive_service, folder_name)

        # Upload each file
        upload_progress = st.progress(0)
        status_text = st.empty()
        for i, path in enumerate(paths):
            status_text.text(f"Uploading {i+1}/{total}: {os.path.basename(path)}")
            upload_progress.progress(i / total)
            result = google_drive_upload(path, state.google_credentials, folder_id)
            # The helper signals failure with a string starting with "Error".
            if isinstance(result, str) and not result.startswith("Error"):
                uploaded_count += 1
        upload_progress.progress(1.0)
        status_text.text(f"Uploaded {uploaded_count}/{total} files to Google Drive folder '{folder_name}'")

    # Report what actually happened instead of always claiming success.
    if uploaded_count == total:
        st.success("βœ… Files uploaded to Google Drive successfully!")
    elif uploaded_count:
        st.warning(f"Uploaded {uploaded_count} of {total} files to Google Drive; the rest failed.")
    else:
        st.error("No files could be uploaded to Google Drive.")
def get_domain(url):
    """Return the network-location part of ``url``.

    This is ``urlparse(url).netloc``: the host plus any port or credentials
    present in the URL, or an empty string when the URL has no ``//`` part.
    """
    from urllib.parse import urlparse

    return urlparse(url).netloc