Spaces:

euler314
/

craw_web

Sleeping

App Files Files Community

craw_web / ui.py

euler314

Rename app/ui.py to ui.py

1b5e738 verified 3 months ago

raw

history blame

19.8 kB

	import streamlit as st
	import os
	import asyncio
	import mimetypes
	from app.utils import create_zip_file, humanize_file_size, show_user_friendly_error
	from app.download_manager import DownloadManager
	from app.rag_search import EnhancedRAGSearch
	from app.google_drive import (
	get_google_auth_url, exchange_code_for_credentials,
	google_drive_upload, create_drive_folder
	)
	import googleapiclient.discovery

	def setup_ui():
	    """Inject the application's global stylesheet into the Streamlit page.

	    The CSS styles the tab bar, file-uploader and regular buttons, the
	    ``main-header`` / ``section-subheader`` / ``info-text`` text classes,
	    ``result-card`` blocks and the ``sidebar-header`` / ``sidebar-section``
	    classes. It is emitted as raw HTML, hence ``unsafe_allow_html=True``.
	    """
	    # Kept flush-left on purpose: the literal's whitespace is part of the string.
	    stylesheet = """
	<style>
	.stTabs [data-baseweb="tab-list"] {
	gap: 10px;
	}
	.stTabs [data-baseweb="tab"] {
	height: 50px;
	white-space: pre-wrap;
	border-radius: 4px 4px 0px 0px;
	padding: 10px 16px;
	background-color: #f0f2f6;
	}
	.stTabs [aria-selected="true"] {
	background-color: #ffffff !important;
	border-bottom: 2px solid #4c78a8;
	}
	.stFileUploader > div > div > button {
	width: 100%;
	}
	.main-header {
	font-size: 2.5rem;
	font-weight: 700;
	margin-bottom: 10px;
	}
	.section-subheader {
	font-size: 1.3rem;
	font-weight: 600;
	margin-top: 20px;
	margin-bottom: 10px;
	}
	.info-text {
	color: #6c757d;
	font-size: 0.9rem;
	}
	.stButton>button {
	width: 100%;
	}
	.result-card {
	background-color: #f8f9fa;
	border-radius: 6px;
	padding: 16px;
	margin-bottom: 12px;
	border-left: 4px solid #4c78a8;
	}
	.sidebar-header {
	font-size: 1.2rem;
	font-weight: 600;
	margin-bottom: 10px;
	}
	.sidebar-section {
	margin-bottom: 20px;
	}
	</style>
	"""
	    st.markdown(stylesheet, unsafe_allow_html=True)

	def create_sidebar():
	    """Render the sidebar: mode picker, quick settings, Drive panel, shortcuts, status.

	    Reads ``mode``, ``stealth_mode``, ``use_proxy`` and ``proxy_string`` from
	    ``st.session_state`` without defaults, so those keys must already exist.
	    Writes those keys back when a widget value changes, and additionally sets
	    ``custom_extensions`` / ``prioritize_pdfs`` on a mode change and
	    ``preset_url`` / ``search_method`` when a shortcut button is pressed.
	    """
	    state = st.session_state

	    def html(markup):
	        # Every decorative fragment in the sidebar is emitted as raw HTML.
	        st.markdown(markup, unsafe_allow_html=True)

	    section_open = "<div class='sidebar-section'>"
	    section_close = "</div>"

	    mode_names = ["Standard", "Education Mode", "Research Mode", "Media Mode"]
	    # mode -> (custom_extensions, prioritize_pdfs). "Standard" has no entry,
	    # so switching to it leaves the previously stored values untouched.
	    mode_presets = {
	        "Education Mode": (".pdf,.doc,.docx,.ppt,.pptx", True),
	        "Research Mode": (".pdf,.txt,.csv,.json,.xlsx", True),
	        "Media Mode": (".jpg,.png,.mp3,.mp4,.avi,.mov", False),
	    }
	    # (button label, preset URL, search method) for the Education Mode shortcuts.
	    education_shortcuts = [
	        ("Past Exam Papers", "https://pastpapers.example.edu", "Exam Site Mode"),
	        ("Open Course Materials", "https://opencourseware.example.edu", "Deep Search"),
	        ("Research Papers", "https://papers.example.org", "Deep Search"),
	    ]

	    with st.sidebar:
	        st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50)
	        html("<p class='sidebar-header'>Advanced File Downloader</p>")

	        # --- Mode selection ---
	        html(section_open)
	        html("<p class='sidebar-header'>Mode</p>")
	        chosen_mode = st.radio(
	            "Select Mode",
	            mode_names,
	            label_visibility="collapsed",
	            index=mode_names.index(state.mode),
	            horizontal=False,
	        )

	        if chosen_mode != state.mode:
	            state.mode = chosen_mode
	            preset = mode_presets.get(chosen_mode)
	            if preset is not None:
	                state.custom_extensions, state.prioritize_pdfs = preset

	        html(f"<div class='info-text'>Current: <b>{state.mode}</b></div>")
	        html(section_close)

	        # --- Quick settings ---
	        html(section_open)
	        html("<p class='sidebar-header'>Quick Settings</p>")

	        stealth_choice = st.checkbox("Stealth Mode", value=state.stealth_mode)
	        if stealth_choice != state.stealth_mode:
	            state.stealth_mode = stealth_choice

	        proxy_enabled = st.checkbox("Use Proxy", value=state.use_proxy)
	        if proxy_enabled != state.use_proxy:
	            state.use_proxy = proxy_enabled

	        if proxy_enabled:
	            proxy_value = st.text_input(
	                "Proxy Address",
	                placeholder="e.g., http://user:pass@host:port",
	                value=state.proxy_string or "",
	            )
	            if proxy_value != state.proxy_string:
	                state.proxy_string = proxy_value

	        html(section_close)

	        # --- Google Drive panel ---
	        show_google_drive_integration()

	        # --- Shortcut buttons, shown only in Education Mode ---
	        if state.mode == "Education Mode":
	            html(section_open)
	            html("<p class='sidebar-header'>Quick Access</p>")
	            html("<div class='info-text'>Common Educational Sites</div>")

	            for label, url, method in education_shortcuts:
	                if st.button(label):
	                    state.preset_url = url
	                    state.search_method = method
	                    st.rerun()

	            html(section_close)

	        # --- Tool status (static labels) ---
	        html(section_open)
	        html("<p class='sidebar-header'>System Status</p>")

	        search_col, browser_col = st.columns(2)
	        with search_col:
	            html("<div class='info-text'>Search</div>")
	            html("<div style='color: green; font-weight: bold;'>Active</div>")
	        with browser_col:
	            html("<div class='info-text'>Browser</div>")
	            html("<div style='color: green; font-weight: bold;'>Ready</div>")

	        if st.button("Install Dependencies"):
	            with st.spinner("Installing Playwright dependencies..."):
	                install_playwright_dependencies()

	        html(section_close)

	        # --- App info footer ---
	        html("<div class='sidebar-section' style='position: absolute; bottom: 20px; width: 90%;'>")
	        html("<div class='info-text' style='text-align: center;'>Version 2.0 • March 2025</div>")
	        html(section_close)

	def show_google_drive_integration():
	    """Render the Google Drive panel: connection status, folder name, connect flow.

	    When ``st.session_state.google_credentials`` is truthy, shows the target
	    folder input (stored in ``st.session_state.drive_folder``, defaulting to
	    "File Downloader") and a disconnect button. Otherwise offers a connect
	    button that starts the authorization flow: the auth URL is shown together
	    with a text input for the authorization code, which is exchanged through
	    ``exchange_code_for_credentials`` (returns ``(credentials, status_msg)``).
	    """
	    state = st.session_state

	    st.markdown("<div class='sidebar-section'>", unsafe_allow_html=True)
	    st.markdown("<p class='sidebar-header'>Google Drive</p>", unsafe_allow_html=True)

	    if state.google_credentials:
	        st.success("✅ Connected")

	        state.drive_folder = st.text_input(
	            "Drive Folder",
	            value=state.get('drive_folder', "File Downloader"),
	        )

	        if st.button("Disconnect Drive"):
	            state.google_credentials = None
	            st.rerun()
	    else:
	        st.warning("⚠️ Not Connected")

	        # st.button() is True only for the single rerun triggered by the click.
	        # Remember the auth URL in session state so the link and the code
	        # input are still rendered on the rerun caused by typing the code.
	        if st.button("Connect Google Drive"):
	            state.drive_auth_url = get_google_auth_url()

	        auth_url = state.get("drive_auth_url")
	        if auth_url:
	            st.markdown(f"[Click here to authorize]({auth_url})")
	            auth_code = st.text_input("Enter authorization code:")

	            if auth_code:
	                with st.spinner("Connecting to Google Drive..."):
	                    credentials, status_msg = exchange_code_for_credentials(auth_code)
	                if credentials:
	                    state.google_credentials = credentials
	                    state.drive_auth_url = None  # authorization flow finished
	                    st.success(status_msg)
	                    st.rerun()
	                else:
	                    st.error(status_msg)

	    st.markdown("</div>", unsafe_allow_html=True)

	def install_playwright_dependencies():
	    """Install the system libraries, the Playwright package and its Chromium build.

	    Runs ``apt-get`` for the shared libraries, then ``pip install playwright``
	    and ``playwright install chromium``. Any failure is reported in the UI
	    through ``st.error`` / ``st.info`` instead of being raised; success is
	    reported with ``st.success``.
	    """
	    import os
	    import subprocess
	    import sys

	    apt_packages = [
	        'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
	        'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
	        'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0',
	    ]

	    try:
	        # Set environment variable for Playwright browsers path
	        os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")

	        # Install system dependencies
	        subprocess.run(['apt-get', 'update', '-y'], check=True)
	        subprocess.run(
	            ['apt-get', 'install', '-y', '--no-install-recommends'] + apt_packages,
	            check=True,
	        )

	        # Use the interpreter running this app so the package and the browser
	        # land in the same environment; a bare `pip` / `python3` on PATH may
	        # belong to a different Python installation.
	        subprocess.run([sys.executable, '-m', 'pip', 'install', 'playwright'], check=True)
	        subprocess.run([sys.executable, '-m', 'playwright', 'install', 'chromium'], check=True)
	    except Exception as e:
	        # UI boundary: surface the problem to the user rather than crashing the app.
	        st.error(f"Error installing Playwright dependencies: {e}")
	        st.info("You may need to manually install dependencies. Check console for details.")
	    else:
	        st.success("Playwright dependencies installed successfully!")

	def _size_to_bytes(size_str, unknown):
	    """Convert a size label such as ``"1.5 MB"`` to a byte count for sorting.

	    Returns *unknown* when the label is not a string, contains "Unknown",
	    has no recognised unit, or its numeric part cannot be parsed.
	    """
	    multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
	    if not isinstance(size_str, str) or 'Unknown' in size_str:
	        return unknown
	    parts = size_str.split()
	    if len(parts) < 2 or parts[1] not in multipliers:
	        return unknown
	    try:
	        return float(parts[0]) * multipliers[parts[1]]
	    except ValueError:
	        return unknown


	def display_file_results(files):
	    """Display file results with filtering, sorting and per-file selection.

	    Args:
	        files: sequence of dicts providing at least the ``'filename'``,
	            ``'url'`` and ``'size'`` keys read here.

	    Returns:
	        ``None`` when *files* is empty. Otherwise a tuple
	        ``(selected_files, displayed_files)`` of index lists: the files that
	        passed the filters and are ticked, and all files that passed the
	        filters. Indices refer to positions in *files* as passed in,
	        regardless of the sort order chosen in the UI.
	    """
	    import html

	    if not files:
	        return

	    st.markdown("<h3 class='section-subheader'>Found Files</h3>", unsafe_allow_html=True)

	    # File filtering options
	    filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1])
	    with filter_col1:
	        file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.")
	    with filter_col2:
	        sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"])
	    with filter_col3:
	        show_only_pdfs = st.checkbox("PDFs Only", value=False)

	    # Pair every file with its position in `files` before sorting, so the
	    # returned indices (and the widget keys) keep pointing at the same file
	    # whatever order is displayed.
	    indexed_files = list(enumerate(files))
	    if sort_option == "Name":
	        indexed_files.sort(key=lambda pair: pair[1]['filename'])
	    elif sort_option == "Size (Largest)":
	        # Unknown sizes count as 0 so they end up last.
	        indexed_files.sort(key=lambda pair: _size_to_bytes(pair[1]['size'], 0), reverse=True)
	    elif sort_option == "Size (Smallest)":
	        # Unknown sizes count as infinity so they end up last here too.
	        indexed_files.sort(key=lambda pair: _size_to_bytes(pair[1]['size'], float('inf')))

	    needle = file_filter.lower() if file_filter else ""
	    selected_files = []
	    displayed_files = []

	    # File list with selection
	    file_container = st.container()
	    with file_container:
	        for index, entry in indexed_files:
	            filename = entry['filename']
	            lowered = filename.lower()

	            # Apply filters
	            if needle and needle not in lowered:
	                continue
	            if show_only_pdfs and not lowered.endswith('.pdf'):
	                continue

	            displayed_files.append(index)
	            with st.container():
	                col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1])
	                with col1:
	                    # Non-empty label, visually hidden, instead of an empty one.
	                    selected = st.checkbox(
	                        f"Select {filename}",
	                        key=f"select_{index}",
	                        value=True,
	                        label_visibility="collapsed",
	                    )
	                    if selected:
	                        selected_files.append(index)
	                with col2:
	                    file_icon = get_file_icon(filename)
	                    st.markdown(f"{file_icon} {filename}")
	                    # The URL comes from crawled pages; escape it before it
	                    # is embedded in raw HTML.
	                    url_preview = html.escape(entry['url'][:60])
	                    st.markdown(f"<span class='info-text'>{url_preview}...</span>", unsafe_allow_html=True)
	                with col3:
	                    st.markdown(f"Size: {entry['size']}")
	                with col4:
	                    st.button("Preview", key=f"preview_{index}")

	                st.divider()

	        if not displayed_files:
	            st.info("No files match your current filters. Try adjusting your search criteria.")

	    return selected_files, displayed_files

	def get_file_icon(filename):
	    """Pick an emoji icon from the filename's extension (case-insensitive).

	    Extensions not listed in the table get the generic document icon.
	    """
	    # Checked top to bottom; the first group whose suffix matches wins.
	    icon_table = (
	        (('.pdf',), "📝"),
	        (('.doc', '.docx'), "📋"),
	        (('.xls', '.xlsx'), "📊"),
	        (('.ppt', '.pptx'), "🖼️"),
	        (('.jpg', '.png', '.gif'), "🖼️"),
	        (('.mp3', '.wav'), "🔊"),
	        (('.mp4', '.avi', '.mov'), "🎬"),
	    )
	    lowered = filename.lower()
	    for suffixes, icon in icon_table:
	        if lowered.endswith(suffixes):
	            return icon
	    return "📄"

	def handle_downloads(selected_files, download_dir, download_option, download_col1):
	    """Download the selected files and offer them to the user.

	    Args:
	        selected_files: indices into ``st.session_state.files`` to download;
	            nothing happens when this is empty.
	        download_dir: directory handed to ``DownloadManager.download_file``
	            and to ``create_zip_file``.
	        download_option: ``"ZIP Archive"`` offers a single archive; any other
	            value offers one download button per file.
	        download_col1: Streamlit container in which the status text and the
	            progress bar are created.

	    Side effects: resets and fills ``st.session_state.downloaded_paths`` and
	    sets ``st.session_state.download_complete`` once the loop has finished.
	    """
	    if not selected_files:
	        return

	    # Placeholders are created inside the column; they keep rendering there
	    # when updated later from the download coroutine.
	    with download_col1:
	        download_status = st.empty()
	        download_progress = st.progress(0)

	    async def run_download():
	        async with DownloadManager(
	            use_proxy=st.session_state.use_proxy,
	            proxy=st.session_state.proxy_string,
	            use_stealth=st.session_state.stealth_mode
	        ) as manager:
	            files_to_download = [st.session_state.files[i] for i in selected_files]
	            total = len(files_to_download)

	            # Reset download paths
	            st.session_state.downloaded_paths = []

	            for position, file_info in enumerate(files_to_download):
	                download_status.text(f"Downloading {position+1}/{total}: {file_info['filename']}")
	                download_progress.progress(position / total)

	                downloaded_path = await manager.download_file(
	                    file_info,
	                    download_dir,
	                    get_domain(file_info['url'])
	                )

	                # A falsy result is treated as a failed download and skipped.
	                if downloaded_path:
	                    st.session_state.downloaded_paths.append(downloaded_path)

	            download_progress.progress(1.0)
	            download_status.text(f"Downloaded {len(st.session_state.downloaded_paths)}/{total} files successfully!")
	            st.session_state.download_complete = True

	    # Run the download
	    asyncio.run(run_download())

	    # Show download results
	    if not st.session_state.download_complete:
	        return

	    paths = st.session_state.downloaded_paths
	    st.success(f"✅ Downloaded {len(paths)} files successfully!")
	    if not paths:
	        return

	    if download_option == "ZIP Archive":
	        # Only the archive is read; the individual files are not loaded into memory.
	        zip_path = create_zip_file(paths, download_dir)
	        with open(zip_path, "rb") as f:
	            zip_content = f.read()
	        st.download_button("📦 Download ZIP Archive",
	                           zip_content,
	                           file_name=os.path.basename(zip_path),
	                           mime="application/zip")
	    else:
	        # Show individual file download links
	        st.markdown("<h4>Download Files</h4>", unsafe_allow_html=True)

	        # Create a grid of download buttons
	        cols = st.columns(3)
	        for idx, path in enumerate(paths):
	            name = os.path.basename(path)
	            with open(path, "rb") as f:
	                content = f.read()
	            mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream'
	            with cols[idx % 3]:
	                st.download_button(
	                    f"📄 {name}",
	                    content,
	                    file_name=name,
	                    mime=mime_type,
	                    # Include the position: two downloads may share a basename,
	                    # and widget keys must be unique.
	                    key=f"dl_{idx}_{name}",
	                    use_container_width=True
	                )

	def handle_google_drive_upload(selected_files):
	    """Upload every path in ``st.session_state.downloaded_paths`` to Google Drive.

	    Does nothing unless ``st.session_state.google_credentials`` is set and at
	    least one file has been downloaded. Files go into the Drive folder named
	    by ``st.session_state.drive_folder`` (default "File Downloader"); the
	    first matching folder is reused, otherwise one is created through
	    ``create_drive_folder``.

	    Args:
	        selected_files: not used by this function; kept so existing callers
	            keep working.
	    """
	    if not st.session_state.google_credentials or not st.session_state.downloaded_paths:
	        return

	    paths = st.session_state.downloaded_paths
	    total = len(paths)

	    with st.spinner("Uploading to Google Drive..."):
	        drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_credentials)

	        folder_name = st.session_state.get('drive_folder', "File Downloader")

	        # Backslashes and single quotes must be escaped inside a Drive query
	        # string literal, otherwise a name like "Bob's files" breaks the query.
	        escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
	        query = f"name='{escaped_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
	        results = drive_service.files().list(q=query, spaces='drive', fields='files(id)').execute()
	        items = results.get('files', [])

	        # Reuse the first matching folder, or create it when none exists.
	        if items:
	            folder_id = items[0]['id']
	        else:
	            folder_id = create_drive_folder(drive_service, folder_name)

	        # Upload each file
	        upload_progress = st.progress(0)
	        status_text = st.empty()
	        uploaded_count = 0

	        for i, path in enumerate(paths):
	            status_text.text(f"Uploading {i+1}/{total}: {os.path.basename(path)}")
	            upload_progress.progress(i / total)

	            result = google_drive_upload(path, st.session_state.google_credentials, folder_id)
	            # An upload counts as successful when the helper returns a string
	            # that does not start with "Error".
	            if isinstance(result, str) and not result.startswith("Error"):
	                uploaded_count += 1

	        upload_progress.progress(1.0)
	        status_text.text(f"Uploaded {uploaded_count}/{total} files to Google Drive folder '{folder_name}'")

	    # Report the real outcome instead of always claiming success.
	    if uploaded_count == total:
	        st.success("✅ Files uploaded to Google Drive successfully!")
	    elif uploaded_count:
	        st.warning(f"⚠️ Only {uploaded_count} of {total} files were uploaded to Google Drive.")
	    else:
	        st.error("❌ No files could be uploaded to Google Drive.")

	def get_domain(url):
	    """Return the network-location part of *url* (empty when it has none)."""
	    import urllib.parse

	    return urllib.parse.urlparse(url).netloc