diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -38,12 +38,13 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import numpy as np import docx2txt + +# Try to import sentence-transformers for better embeddings try: - from langdetect import detect as detect_language + from sentence_transformers import SentenceTransformer + HAVE_TRANSFORMERS = True except ImportError: - # If langdetect is not available, we'll use a simple fallback - def detect_language(text): - return "en" + HAVE_TRANSFORMERS = False # Try to download NLTK data if not already present try: @@ -54,6 +55,16 @@ except LookupError: except: pass +try: + nltk.data.find('corpora/stopwords') +except LookupError: + try: + nltk.download('stopwords', quiet=True) + from nltk.corpus import stopwords + STOPWORDS = set(stopwords.words('english')) + except: + STOPWORDS = set(['the', 'and', 'a', 'in', 'to', 'of', 'is', 'it', 'that', 'for', 'with', 'as', 'on', 'by']) + # -------------------- Logging Setup -------------------- logging.basicConfig( level=logging.INFO, @@ -110,22 +121,40 @@ PROXY_ROTATION_CONFIG = { "proxies": [] # Will be populated from the UI if needed } -# -------------------- Enhanced RAG Search Class -------------------- +# -------------------- Enhanced RAG Search with Small LLM -------------------- class EnhancedRAGSearch: def __init__(self): self.file_texts = [] self.chunks = [] # Document chunks for more targeted search self.chunk_metadata = [] # Metadata for each chunk self.file_metadata = [] - self.vectorizer = TfidfVectorizer( - stop_words='english', - ngram_range=(1, 2), # Use bigrams for better context - max_features=10000, # Use more features for better representation - min_df=2 # Minimum document frequency - ) + self.languages = [] + self.model = None + + # Try to load the sentence transformer model if available + if HAVE_TRANSFORMERS: + try: + # Use a small, efficient model + self.model = SentenceTransformer('all-MiniLM-L6-v2') + self.use_transformer = True + logger.info("Using sentence-transformers for RAG") + except Exception as e: + logger.warning(f"Error loading sentence-transformer: {e}") + self.use_transformer = False + else: + self.use_transformer = False + + # Fallback to TF-IDF if transformers not available + if not self.use_transformer: + self.vectorizer = TfidfVectorizer( + stop_words='english', + ngram_range=(1, 2), # Use bigrams for better context + max_features=15000, # Use more features for better representation + min_df=1 # Include rare terms + ) + self.vectors = None self.chunk_vectors = None - self.languages = [] def add_file(self, file_data, file_info): """Add a file to the search index with improved processing""" @@ -139,7 +168,10 @@ class EnhancedRAGSearch: # Try to detect language try: - lang = detect_language(text[:1000]) # Use just the first 1000 chars for speed + # Simple language detection based on stopwords + words = re.findall(r'\b\w+\b', text.lower()) + english_stopwords_ratio = len([w for w in words[:100] if w in STOPWORDS]) / max(1, len(words[:100])) + lang = 'en' if english_stopwords_ratio > 0.2 else 'unknown' self.languages.append(lang) except: self.languages.append('en') # Default to English @@ -242,12 +274,28 @@ class EnhancedRAGSearch: return False try: - # Build document-level index - self.vectors = self.vectorizer.fit_transform(self.file_texts) - - # Build chunk-level index if we have chunks - if self.chunks: - self.chunk_vectors = self.vectorizer.transform(self.chunks) + if 
self.use_transformer: + # Use sentence transformer models for embeddings + logger.info("Building document and chunk embeddings with transformer model...") + self.vectors = self.model.encode(self.file_texts, show_progress_bar=False) + + # Build chunk-level index if we have chunks + if self.chunks: + # Process in batches to avoid memory issues + batch_size = 32 + chunk_vectors = [] + for i in range(0, len(self.chunks), batch_size): + batch = self.chunks[i:i+batch_size] + batch_vectors = self.model.encode(batch, show_progress_bar=False) + chunk_vectors.append(batch_vectors) + self.chunk_vectors = np.vstack(chunk_vectors) + else: + # Build document-level index + self.vectors = self.vectorizer.fit_transform(self.file_texts) + + # Build chunk-level index if we have chunks + if self.chunks: + self.chunk_vectors = self.vectorizer.transform(self.chunks) return True except Exception as e: @@ -255,21 +303,58 @@ class EnhancedRAGSearch: return False def expand_query(self, query): - """Add related terms to query for better recall""" - # This is a simple implementation - could be enhanced with a proper synonym API - expanded_terms = [] + """Add related terms to query for better recall - mini LLM function""" + # Dictionary of related terms for common keywords + expansions = { + "exam": ["test", "assessment", "quiz", "paper", "exam paper", "past paper", "past exam"], + "test": ["exam", "quiz", "assessment", "paper"], + "document": ["file", "paper", "report", "doc", "documentation"], + "manual": ["guide", "instruction", "documentation", "handbook"], + "tutorial": ["guide", "instructions", "how-to", "lesson"], + "article": ["paper", "publication", "journal", "research"], + "research": ["study", "investigation", "paper", "analysis"], + "book": ["textbook", "publication", "volume", "edition"], + "thesis": ["dissertation", "paper", "research", "study"], + "report": ["document", "paper", "analysis", "summary"], + "assignment": ["homework", "task", "project", "work"], + "lecture": ["class", "presentation", "talk", "lesson"], + "notes": ["annotations", "summary", "outline", "study material"], + "syllabus": ["curriculum", "course outline", "program", "plan"], + "paper": ["document", "article", "publication", "exam", "test"], + "question": ["problem", "query", "exercise", "inquiry"], + "solution": ["answer", "resolution", "explanation", "result"], + "reference": ["source", "citation", "bibliography", "resource"], + "analysis": ["examination", "study", "evaluation", "assessment"], + "guide": ["manual", "instruction", "handbook", "tutorial"], + "worksheet": ["exercise", "activity", "handout", "practice"], + "review": ["evaluation", "assessment", "critique", "feedback"], + "material": ["resource", "content", "document", "information"], + "data": ["information", "statistics", "figures", "numbers"] + } + + # Enhanced query expansion simulating a mini-LLM + query_words = re.findall(r'\b\w+\b', query.lower()) + expanded_terms = set() - # Add some common expansions for document search - if "exam" in query.lower(): - expanded_terms.extend(["test", "assessment", "quiz", "paper"]) - elif "document" in query.lower(): - expanded_terms.extend(["file", "paper", "report"]) - elif "manual" in query.lower(): - expanded_terms.extend(["guide", "instruction", "documentation"]) + # Directly add expansions from our dictionary + for word in query_words: + if word in expansions: + expanded_terms.update(expansions[word]) + # Add common academic file formats if not already included + if any(term in query.lower() for term in ["file", 
"document", "download", "paper"]): + if not any(ext in query.lower() for ext in ["pdf", "docx", "ppt", "excel"]): + expanded_terms.update(["pdf", "docx", "pptx", "xlsx"]) + + # Add special academic terms when the query seems related to education + if any(term in query.lower() for term in ["course", "university", "college", "school", "class"]): + expanded_terms.update(["syllabus", "lecture", "notes", "textbook"]) + # Return original query plus expanded terms if expanded_terms: - return f"{query} {' '.join(expanded_terms)}" + expanded_query = f"{query} {' '.join(expanded_terms)}" + logger.info(f"Expanded query: '{query}' -> '{expanded_query}'") + return expanded_query return query def search(self, query, top_k=5, search_chunks=True): @@ -277,58 +362,114 @@ class EnhancedRAGSearch: if self.vectors is None: return [] + # Simulate a small LLM by expanding the query with related terms + expanded_query = self.expand_query(query) + try: - # Expand the query for better recall - expanded_query = self.expand_query(query) - - # Transform the query - query_vector = self.vectorizer.transform([expanded_query]) - results = [] - # First search at document level for higher-level matches - if self.vectors is not None: - doc_similarities = cosine_similarity(query_vector, self.vectors).flatten() - top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] - - for i, idx in enumerate(top_doc_indices): - if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results - results.append({ - 'file_info': self.file_metadata[idx], - 'score': float(doc_similarities[idx]), - 'rank': i+1, - 'match_type': 'document', - 'language': self.languages[idx] if idx < len(self.languages) else 'unknown' - }) - - # Then search at chunk level for more specific matches if enabled - if search_chunks and self.chunk_vectors is not None: - chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten() - top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results + if self.use_transformer: + # Transform the query to embedding + query_vector = self.model.encode([expanded_query])[0] - # Use a set to avoid duplicate file results - seen_files = set(r['file_info']['url'] for r in results) + # First search at document level for higher-level matches + if self.vectors is not None: + # Compute similarities between query and documents + doc_similarities = cosine_similarity( + query_vector.reshape(1, -1), + self.vectors + ).flatten() + + top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] + + for i, idx in enumerate(top_doc_indices): + if doc_similarities[idx] > 0.2: # Threshold to exclude irrelevant results + results.append({ + 'file_info': self.file_metadata[idx], + 'score': float(doc_similarities[idx]), + 'rank': i+1, + 'match_type': 'document', + 'language': self.languages[idx] if idx < len(self.languages) else 'unknown' + }) - for i, idx in enumerate(top_chunk_indices): - if chunk_similarities[idx] > 0.15: # Higher threshold for chunks - file_index = self.chunk_metadata[idx]['file_index'] - file_info = self.file_metadata[file_index] - - # Only add if we haven't already included this file - if file_info['url'] not in seen_files: - seen_files.add(file_info['url']) + # Then search at chunk level for more specific matches if enabled + if search_chunks and self.chunk_vectors is not None: + # Compute similarities between query and chunks + chunk_similarities = cosine_similarity( + query_vector.reshape(1, -1), + self.chunk_vectors + ).flatten() + + top_chunk_indices = 
chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results + + # Use a set to avoid duplicate file results + seen_files = set(r['file_info']['url'] for r in results) + + for i, idx in enumerate(top_chunk_indices): + if chunk_similarities[idx] > 0.25: # Higher threshold for chunks + file_index = self.chunk_metadata[idx]['file_index'] + file_info = self.file_metadata[file_index] + + # Only add if we haven't already included this file + if file_info['url'] not in seen_files: + seen_files.add(file_info['url']) + results.append({ + 'file_info': file_info, + 'score': float(chunk_similarities[idx]), + 'rank': len(results) + 1, + 'match_type': 'chunk', + 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', + 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx] + }) + + # Stop after we've found enough results + if len(results) >= top_k*1.5: + break + else: + # Fallback to TF-IDF if transformers not available + query_vector = self.vectorizer.transform([expanded_query]) + + # First search at document level + if self.vectors is not None: + doc_similarities = cosine_similarity(query_vector, self.vectors).flatten() + top_doc_indices = doc_similarities.argsort()[-top_k:][::-1] + + for i, idx in enumerate(top_doc_indices): + if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results results.append({ - 'file_info': file_info, - 'score': float(chunk_similarities[idx]), - 'rank': len(results) + 1, - 'match_type': 'chunk', - 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', - 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx] + 'file_info': self.file_metadata[idx], + 'score': float(doc_similarities[idx]), + 'rank': i+1, + 'match_type': 'document', + 'language': self.languages[idx] if idx < len(self.languages) else 'unknown' }) + + # Then search at chunk level if enabled + if search_chunks and self.chunk_vectors is not None: + chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten() + top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] + + # Avoid duplicates + seen_files = set(r['file_info']['url'] for r in results) + + for i, idx in enumerate(top_chunk_indices): + if chunk_similarities[idx] > 0.15: + file_index = self.chunk_metadata[idx]['file_index'] + file_info = self.file_metadata[file_index] - # Stop after we've found enough results - if len(results) >= top_k*1.5: - break + if file_info['url'] not in seen_files: + seen_files.add(file_info['url']) + results.append({ + 'file_info': file_info, + 'score': float(chunk_similarities[idx]), + 'rank': len(results) + 1, + 'match_type': 'chunk', + 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown', + 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx] + }) + + if len(results) >= top_k*1.5: + break # Sort combined results by score results.sort(key=lambda x: x['score'], reverse=True) @@ -3658,614 +3799,1018 @@ class DownloadManager: # -------------------- Main App -------------------- def main(): - st.title("Advanced File Downloader") + st.set_page_config(page_title="Advanced File Downloader", layout="wide", page_icon="๐Ÿ“") - # Initialize playwright if needed - if "playwright_installed" not in st.session_state: - with st.spinner("Setting up browser automation. 
This may take a minute..."): - install_playwright_dependencies() - st.session_state.playwright_installed = True + # Custom CSS for better appearance + st.markdown(""" + + """, unsafe_allow_html=True) - if "initialized" not in st.session_state: - st.session_state.initialized = True - st.session_state.discovered_files = [] - st.session_state.current_url = None - st.session_state.google_creds = None - st.session_state.selected_files = [] - st.session_state.do_deep_search = False - st.session_state.deep_search_url = None - st.session_state.search_results = [] - # For RAG search - st.session_state.rag_indexed = False - st.session_state.rag_engine = None - + # Initialize session state for storing files + if 'files' not in st.session_state: + st.session_state.files = [] + if 'downloaded_paths' not in st.session_state: + st.session_state.downloaded_paths = [] + if 'download_complete' not in st.session_state: + st.session_state.download_complete = False + if 'selected_tab' not in st.session_state: + st.session_state.selected_tab = 0 + if 'rag_search' not in st.session_state: + st.session_state.rag_search = EnhancedRAGSearch() + if 'keep_progress' not in st.session_state: + st.session_state.keep_progress = False + if 'google_credentials' not in st.session_state: + st.session_state.google_credentials = None + if 'mode' not in st.session_state: + st.session_state.mode = "Standard" + if 'use_proxy' not in st.session_state: + st.session_state.use_proxy = False + if 'proxy_string' not in st.session_state: + st.session_state.proxy_string = None + if 'stealth_mode' not in st.session_state: + st.session_state.stealth_mode = True + + # ============================ + # SIDEBAR + # ============================ with st.sidebar: - mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select") - with st.expander("Advanced Options", expanded=True): - custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt") - max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page") - sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink") - use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox") - proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input") - use_stealth = st.checkbox("Use Stealth Mode (harder to detect)", value=True, key="stealth_checkbox") - - with st.expander("Google Drive Integration", expanded=False): - if st.button("Start Google Sign-In", key="google_signin_btn"): + st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50) + st.markdown("", unsafe_allow_html=True) + + # Mode Selection + st.markdown("", unsafe_allow_html=True) + + # Quick Settings + st.markdown("", unsafe_allow_html=True) + + # Google Drive Integration + st.markdown("", unsafe_allow_html=True) + + # Preset buttons for common EDU sites + if st.session_state.mode == "Education Mode": + st.markdown("", unsafe_allow_html=True) - # Proxy rotation settings - st.write("**Proxy Rotation**") - enable_rotation = st.checkbox("Enable Proxy Rotation", value=False, key="enable_rotation") - if enable_rotation: - PROXY_ROTATION_CONFIG["enabled"] = True - proxy_list = st.text_area( - "Proxy List (one per line)", - 
placeholder="http://proxy1:port\nhttp://proxy2:port", - key="proxy_list" - ) - if proxy_list: - PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.split("\n") if p.strip()] - rotation_interval = st.slider( - "Rotation Interval (# of requests)", - min_value=1, - max_value=50, - value=10, - key="rotation_interval" - ) - PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval + # Tool status + st.markdown("", unsafe_allow_html=True) + + # App info + st.markdown("", unsafe_allow_html=True) - if mode == "Manual URL": - st.header("Manual URL Mode") - url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input") + # ============================ + # MAIN CONTENT AREA + # ============================ + + # Header section + col1, col2 = st.columns([5, 1]) + with col1: + st.markdown("
Advanced File Downloader
", unsafe_allow_html=True) + with col2: + st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70) + + mode_descriptions = { + "Standard": "A versatile tool for discovering and downloading files from any website.", + "Education Mode": "Optimized for educational resources, exams, and academic materials.", + "Research Mode": "Focused on research papers, datasets, and academic publications.", + "Media Mode": "Enhanced for finding and downloading images, videos, and audio files." + } + + st.markdown(f"
{mode_descriptions[st.session_state.mode]}
", unsafe_allow_html=True) + + # Main tabs + tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"]) + + # Tab 1: Search & Download + with tabs[0]: + st.markdown("
Find and Download Files
", unsafe_allow_html=True) + col1, col2 = st.columns([3, 1]) with col1: - if st.button("Deep Search", use_container_width=True, key="deep_search_btn"): - if url: - custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] - valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)] - if custom_ext_list != valid_ext_list: - st.warning("Invalid extensions ignored. Use format like '.csv'.") - - # Reset RAG engine for new search - st.session_state.rag_indexed = False - st.session_state.rag_engine = None - - # Define a function to run the deep search - async def run_deep_search(): - async with DownloadManager( - use_proxy=use_proxy, - proxy=proxy, - use_stealth=use_stealth - ) as dm: - files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout) - return files - - with st.spinner("Searching for files..."): - files = asyncio.run(run_deep_search()) - - if files: - st.session_state.discovered_files = files - st.session_state.current_url = url - st.success(f"Found {len(files)} files!") - else: - st.warning("No files found.") - - if st.session_state.discovered_files: - files = st.session_state.discovered_files - - # Display files with direct download buttons - download_dir = "./downloads" - os.makedirs(download_dir, exist_ok=True) - - # Add RAG Search interface - st.markdown("### Search Within Discovered Files") - search_query = st.text_input("Enter search terms", key="rag_search_query") - - if st.button("Search Files", key="rag_search_btn") and search_query: - # Initialize RAG search engine - if not st.session_state.rag_indexed: - rag_search = EnhancedRAGSearch() - - with st.spinner("Indexing files for search..."): - # First download files to extract text - temp_dir = "./temp_downloads" - os.makedirs(temp_dir, exist_ok=True) - - async def download_for_indexing(): - downloaded = 0 - async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm: - for i, file_info in enumerate(files): - # Only process common text-based file formats - ext = os.path.splitext(file_info['filename'])[1].lower() - if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']: - path = await dm.download_file(file_info, temp_dir, url) - if path: - with open(path, 'rb') as f: - file_data = f.read() - - # Add to search index - if rag_search.add_file(file_data, file_info): - downloaded += 1 - - # Clean up - os.remove(path) - return downloaded - - indexed_count = asyncio.run(download_for_indexing()) - if indexed_count > 0: - rag_search.build_index() - st.session_state.rag_engine = rag_search - st.session_state.rag_indexed = True - st.success(f"Indexed {indexed_count} files for search") - else: - st.warning("Could not index any files. 
Try with more text-based documents.") + url = st.text_input("Enter a URL to search for downloadable files:", + placeholder="e.g., https://example.com/resources", + value=st.session_state.get('preset_url', '')) + with col2: + # Initialize search_method with either session state or default value + initial_search_method = st.session_state.get('search_method', "Deep Search") + search_method = st.selectbox("Search Method", + ["Deep Search", "Quick Search", "Exam Site Mode"], + index=["Deep Search", "Quick Search", "Exam Site Mode"].index(initial_search_method)) + # Update session state when changed + if search_method != st.session_state.get('search_method'): + st.session_state.search_method = search_method + + # Advanced options in an expander + with st.expander("Search Options", expanded=False): + col1, col2, col3 = st.columns(3) + with col1: + depth = st.slider("Search Depth", min_value=1, max_value=5, value=2, + help="Higher values will search more links but take longer") + prioritize_pdfs = st.checkbox("Prioritize PDFs", + value=st.session_state.get('prioritize_pdfs', True), + help="Focus on finding PDF files first") + with col2: + timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60) + follow_subdomains = st.checkbox("Follow Subdomains", value=True, + help="Include links from subdomains in the search") + with col3: + # Default extensions based on mode + default_extensions = { + "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip", + "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx", + "Research Mode": ".pdf,.txt,.csv,.json,.xlsx", + "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov" + } - # Perform the search - if st.session_state.rag_indexed: - search_results = st.session_state.rag_engine.search(search_query) - - if search_results: - st.write(f"Found {len(search_results)} relevant files:") - - for result in search_results: - file_info = result['file_info'] - score = result['score'] - match_type = result.get('match_type', 'document') - - with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"): - st.write(f"Size: {file_info['size']}") - st.write(f"Match type: {match_type}") - - # Show language if available - if 'language' in result: - st.write(f"Language: {result['language']}") - - # Show metadata if available - if 'metadata' in file_info and file_info['metadata']: - st.write("Metadata:") - for k, v in file_info['metadata'].items(): - if k != 'file_id': # Skip technical details - st.write(f"- {k}: {v}") - - # Show content preview for chunk matches - if 'chunk_preview' in result: - st.write("Content preview:") - st.text(result['chunk_preview']) - - # Add direct download button - if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"): - with st.spinner(f"Downloading {file_info['filename']}..."): - async def download_search_result(): - async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm: - path = await dm.download_file(file_info, download_dir, url) - return path - - path = asyncio.run(download_search_result()) - if path: - with open(path, "rb") as f: - file_data = f.read() - - st.download_button( - label=f"Save {file_info['filename']}", - data=file_data, - file_name=file_info['filename'], - mime=mimetypes.guess_type(path)[0] or "application/octet-stream", - key=f"save_rag_{result['rank']}" - ) - else: - st.warning("No matching files found for your query.") - - # Show all files with direct download options - st.markdown("### All Discovered Files") + custom_extensions = st.text_area( + "Custom File 
Extensions", + value=st.session_state.get('custom_extensions', default_extensions[st.session_state.mode]), + help="Comma-separated list of file extensions to look for" + ) + + # Update session state when extensions changed + if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions: + st.session_state.custom_extensions = custom_extensions + + search_col1, search_col2 = st.columns([4, 1]) + with search_col1: + search_button = st.button("๐Ÿ” Start Search", use_container_width=True) + with search_col2: + clear_button = st.button("๐Ÿงน Clear Results", use_container_width=True) + + # File results section + if st.session_state.files: + st.markdown("
Found Files
", unsafe_allow_html=True) + + # File filtering options + filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1]) + with filter_col1: + file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.") + with filter_col2: + sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"]) + with filter_col3: + show_only_pdfs = st.checkbox("PDFs Only", value=False) + + # Sort files based on selection + sorted_files = list(st.session_state.files) + if sort_option == "Name": + sorted_files.sort(key=lambda x: x['filename']) + elif sort_option == "Size (Largest)": + # Convert size strings to comparable values + def parse_size(size_str): + if 'Unknown' in size_str: + return 0 + try: + value = float(size_str.split(' ')[0]) + unit = size_str.split(' ')[1] + multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4} + return value * multipliers.get(unit, 0) + except: + return 0 + + sorted_files.sort(key=lambda x: parse_size(x['size']), reverse=True) + elif sort_option == "Size (Smallest)": + def parse_size(size_str): + if 'Unknown' in size_str: + return float('inf') + try: + value = float(size_str.split(' ')[0]) + unit = size_str.split(' ')[1] + multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4} + return value * multipliers.get(unit, 0) + except: + return float('inf') + + sorted_files.sort(key=lambda x: parse_size(x['size'])) - # Batch download options - col1, col2 = st.columns([1, 4]) - with col1: - if st.button("Select All", key="select_all_btn"): - st.session_state.selected_files = list(range(len(files))) - if st.button("Clear Selection", key="clear_selection_btn"): - st.session_state.selected_files = [] - - # Batch download settings - if 'selected_files' in st.session_state and st.session_state.selected_files: - batch_col1, batch_col2, batch_col3, batch_col4 = st.columns(4) - with batch_col1: - download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input") - with batch_col2: - create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox") - with batch_col3: - delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox") - with batch_col4: - upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox") - - if st.button("Download Selected", key="download_batch_btn"): - if not os.path.exists(download_dir): - os.makedirs(download_dir) - - async def download_files(): - downloaded_paths = [] - progress_bar = st.progress(0) - status_text = st.empty() + # File list with selection + file_container = st.container() + with file_container: + selected_files = [] + displayed_files = [] + + for i, file in enumerate(sorted_files): + # Apply filters + if file_filter and file_filter.lower() not in file['filename'].lower(): + continue + if show_only_pdfs and not file['filename'].lower().endswith('.pdf'): + continue - async with DownloadManager( - use_proxy=use_proxy, - proxy=proxy, - use_stealth=use_stealth - ) as dm: - for i, idx in enumerate(st.session_state.selected_files): - progress = (i + 1) / len(st.session_state.selected_files) - file_info = files[idx] - status_text.text(f"Downloading {file_info['filename']}... 
({i+1}/{len(st.session_state.selected_files)})") - progress_bar.progress(progress) - - path = await dm.download_file(file_info, download_dir, url) - if path: - downloaded_paths.append(path) + displayed_files.append(i) + with st.container(): + col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1]) + with col1: + selected = st.checkbox("", key=f"select_{i}", value=True) + if selected: + selected_files.append(i) + with col2: + file_icon = "๐Ÿ“„" + if file['filename'].lower().endswith('.pdf'): + file_icon = "๐Ÿ“" + elif file['filename'].lower().endswith(('.doc', '.docx')): + file_icon = "๐Ÿ“‹" + elif file['filename'].lower().endswith(('.xls', '.xlsx')): + file_icon = "๐Ÿ“Š" + elif file['filename'].lower().endswith(('.ppt', '.pptx')): + file_icon = "๐Ÿ–ผ๏ธ" + elif file['filename'].lower().endswith(('.jpg', '.png', '.gif')): + file_icon = "๐Ÿ–ผ๏ธ" + elif file['filename'].lower().endswith(('.mp3', '.wav')): + file_icon = "๐Ÿ”Š" + elif file['filename'].lower().endswith(('.mp4', '.avi', '.mov')): + file_icon = "๐ŸŽฌ" - status_text.empty() - progress_bar.empty() - return downloaded_paths - - with st.spinner("Downloading files..."): - downloaded = asyncio.run(download_files()) - - if downloaded: - st.success(f"Successfully downloaded {len(downloaded)} files") + st.markdown(f"**{file_icon} {file['filename']}**") + st.markdown(f"{file['url'][:60]}...", unsafe_allow_html=True) + with col3: + st.markdown(f"**Size:** {file['size']}") + with col4: + st.button("Preview", key=f"preview_{i}") - if create_zip: - zip_path = create_zip_file(downloaded, download_dir) - st.success(f"Created ZIP file: {zip_path}") - - # Provide download link for the zip file - with open(zip_path, "rb") as f: - zip_data = f.read() - - st.download_button( - label="Download ZIP", - data=zip_data, - file_name=os.path.basename(zip_path), - mime="application/zip", - key="download_zip_btn" - ) - - # Upload to Google Drive if requested - if upload_to_drive and st.session_state.google_creds: - drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds) - folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}") - drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id) - if not isinstance(drive_id, str) or not drive_id.startswith("Error"): - st.success(f"Uploaded to Google Drive. File ID: {drive_id}") - else: - st.error(drive_id) - - # Delete original files if requested - if delete_after: - for path in downloaded: - try: - os.remove(path) - except Exception as e: - st.warning(f"Could not delete {path}: {e}") - st.info("Deleted original files after ZIP creation") - - # Individual file display with direct download buttons - for i, file in enumerate(files): - col1, col2, col3 = st.columns([3, 1, 1]) + st.divider() + + if not displayed_files: + st.info("No files match your current filters. Try adjusting your search criteria.") + + # Download options + if selected_files: + col1, col2 = st.columns(2) with col1: - filename = file['filename'] - size = file['size'] - meta = file.get('metadata', {}) - file_info = f"{filename} ({size})" - if meta and 'Pages' in meta: - file_info += f" - {meta.get('Pages', '')} pages" - st.markdown(f"**{i+1}. 
{file_info}**") - + download_dir = st.text_input("Download Directory", value="downloads") with col2: - # Add direct download button for each file - if st.button(f"Download", key=f"direct_dl_{i}"): - with st.spinner(f"Downloading {filename}..."): - async def download_single_file(): - async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm: - path = await dm.download_file(file, download_dir, url) - return path - - downloaded_path = asyncio.run(download_single_file()) - if downloaded_path: - with open(downloaded_path, "rb") as f: - file_data = f.read() - - st.download_button( - label=f"Save {filename}", - data=file_data, - file_name=filename, - mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream", - key=f"save_file_{i}" - ) + download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True) - with col3: - # Add to selection for batch download - if i in st.session_state.selected_files: - if st.button("Unselect", key=f"unselect_{i}"): - st.session_state.selected_files.remove(i) + download_col1, download_col2, download_col3 = st.columns([3, 1, 1]) + with download_col1: + download_button = st.button("โฌ‡๏ธ Download Selected Files", use_container_width=True) + with download_col2: + google_drive_button = st.button("๐Ÿ“ค Upload to Drive", + use_container_width=True, + disabled=not st.session_state.google_credentials) + with download_col3: + select_all = st.button("Select All Files", use_container_width=True) + + # Handle select all button + if select_all: + for i in displayed_files: + st.session_state[f"select_{i}"] = True + st.rerun() + + # Download progress/results + if st.session_state.download_complete: + st.success(f"โœ… Downloaded {len(st.session_state.downloaded_paths)} files successfully!") + download_links = [] + for path in st.session_state.downloaded_paths: + with open(path, "rb") as f: + file_content = f.read() + file_name = os.path.basename(path) + download_links.append((file_name, file_content)) + + if len(download_links) > 0: + if download_option == "ZIP Archive": + # Create ZIP archive for download + zip_path = create_zip_file(st.session_state.downloaded_paths, download_dir) + with open(zip_path, "rb") as f: + zip_content = f.read() + st.download_button("๐Ÿ“ฆ Download ZIP Archive", + zip_content, + file_name=os.path.basename(zip_path), + mime="application/zip") else: - if st.button("Select", key=f"select_{i}"): - st.session_state.selected_files.append(i) - - elif mode == "Bing Search": - st.header("Bing Search Mode") - query = st.text_input("Enter search query", key="search_query_input") - num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider") + # Show individual file download links + st.markdown("
Download Files
", unsafe_allow_html=True) + + # Create a grid of download buttons + cols = st.columns(3) + for idx, (name, content) in enumerate(download_links): + mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream' + with cols[idx % 3]: + st.download_button( + f"๐Ÿ“„ {name}", + content, + file_name=name, + mime=mime_type, + key=f"dl_{name}", + use_container_width=True + ) + + # Tab 2: Local File Search + with tabs[1]: + st.markdown("
Search Downloaded Files
", unsafe_allow_html=True) + st.write("Upload files to search through their content with AI-powered semantic search.") + + # File upload + uploaded_files = st.file_uploader("Upload documents for search", + accept_multiple_files=True, + type=['pdf', 'docx', 'txt', 'csv', 'json']) - if st.button("Search", key="search_btn"): - if query: - async def run_search(): - async with DownloadManager( - use_proxy=use_proxy, - proxy=proxy, - query=query, - num_results=num_results, - use_stealth=use_stealth - ) as dm: + if uploaded_files: + # Build search index on upload + col1, col2 = st.columns([4, 1]) + with col1: + use_transformer = st.checkbox("Use AI Transformer Model", value=HAVE_TRANSFORMERS, + help="Uses advanced AI for more accurate semantic search (if available)") + with col2: + if st.button("Build Search Index", use_container_width=True): + with st.spinner("Processing files and building search index..."): + files_added = 0 + for uploaded_file in uploaded_files: + file_info = { + 'filename': uploaded_file.name, + 'url': f'local://{uploaded_file.name}', + 'size': humanize_file_size(uploaded_file.size) + } + success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info) + if success: + files_added += 1 + + if files_added > 0: + index_built = st.session_state.rag_search.build_index() + if index_built: + st.success(f"โœ… Successfully indexed {files_added} files!") + else: + st.error("Failed to build search index.") + else: + st.warning("No valid text could be extracted from the files.") + + # Search interface + st.markdown("
Search Files
", unsafe_allow_html=True) + + col1, col2 = st.columns([4, 1]) + with col1: + query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change") + with col2: + expand_query = st.checkbox("Auto-expand query", value=True, + help="Automatically add related terms to your search") + + col1, col2 = st.columns([4, 1]) + with col1: + if st.button("๐Ÿ” Search Documents", use_container_width=True): + if not query: + st.warning("Please enter a search query") + else: with st.spinner("Searching..."): - urls = await dm.search_bing() - if urls: - st.session_state.search_results = urls - st.success(f"Found {len(urls)} results!") - - # Create expanders for each result - for i, url in enumerate(urls, 1): - with st.expander(f"Result {i}: {url}", expanded=(i == 1)): - st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}", on_click=set_deep_search_url, args=(url,)) + results = st.session_state.rag_search.search(query, top_k=5, search_chunks=True) + + if results: + st.markdown(f"**Found {len(results)} relevant documents:**") + for i, result in enumerate(results): + with st.container(): + st.markdown(f"
", unsafe_allow_html=True) + st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})") + + if result.get('chunk_preview'): + st.markdown("**Matching content:**") + st.text(result['chunk_preview']) + + st.markdown("
", unsafe_allow_html=True) else: - st.warning("No search results found.") + st.info("No matching documents found. Try a different query.") + with col2: + num_results = st.number_input("Max results", min_value=1, max_value=20, value=5) + + # Quick search tips + with st.expander("Search Tips", expanded=False): + st.markdown(""" + ### Effective Search Tips - asyncio.run(run_search()) + - **Be specific** with your queries for more accurate results + - **Try different phrasings** if you don't get the results you expect + - Use **quotation marks** for exact phrase matching + - For **complex topics**, break down your search into multiple queries + - **Combine related terms** to improve recall + + The search engine uses advanced algorithms to understand the semantic meaning of your query, + not just keyword matching. + """) + + # Tab 3: Advanced Configuration + with tabs[2]: + st.markdown("
Advanced Settings
", unsafe_allow_html=True) - # Handle deep search - using on_click function to avoid state issues - if 'deep_search_url' in st.session_state and st.session_state.deep_search_url: - url = st.session_state.deep_search_url - st.info(f"Deep searching: {url}") - - # Set up custom extensions - custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] - valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)] - - # Reset RAG engine for new search - st.session_state.rag_indexed = False - st.session_state.rag_engine = None - - # Run the deep search - async def run_bing_deep_search(): - async with DownloadManager( - use_proxy=use_proxy, - proxy=proxy, - use_stealth=use_stealth - ) as dm: - files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout) - return files - - with st.spinner("Searching for files..."): - files = asyncio.run(run_bing_deep_search()) - - if files: - st.session_state.discovered_files = files - st.session_state.current_url = url - st.success(f"Found {len(files)} files!") - - # Show files with direct download options - download_dir = "./downloads" - os.makedirs(download_dir, exist_ok=True) - - # Individual file display with direct download buttons - for i, file in enumerate(files): - col1, col2, col3 = st.columns([3, 1, 1]) - with col1: - filename = file['filename'] - size = file['size'] - meta = file.get('metadata', {}) - file_info = f"{filename} ({size})" - if meta and 'Pages' in meta: - file_info += f" - {meta.get('Pages', '')} pages" - st.markdown(f"**{i+1}. {file_info}**") - - with col2: - # Add direct download button for each file - if st.button(f"Download", key=f"direct_dl_bing_{i}"): - with st.spinner(f"Downloading {filename}..."): - async def download_single_file(): - async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm: - path = await dm.download_file(file, download_dir, url) - return path - - downloaded_path = asyncio.run(download_single_file()) - if downloaded_path: - with open(downloaded_path, "rb") as f: - file_data = f.read() - - st.download_button( - label=f"Save {filename}", - data=file_data, - file_name=filename, - mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream", - key=f"save_bing_file_{i}" - ) - - with col3: - # Add to selection for batch download - if i in st.session_state.selected_files: - if st.button("Unselect", key=f"bing_unselect_{i}"): - st.session_state.selected_files.remove(i) - else: - if st.button("Select", key=f"bing_select_{i}"): - st.session_state.selected_files.append(i) + config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"]) + + # Browser Settings tab + with config_tabs[0]: + col1, col2 = st.columns(2) + with col1: + use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode, + help="Makes browser harder to detect as automated, but may be slower") - # Add RAG Search interface for Bing results - st.markdown("### Search Within Discovered Files") - search_query = st.text_input("Enter search terms", key="bing_rag_search_query") + handle_captchas = st.checkbox("Handle Captchas Automatically", value=False, + help="Attempt to solve simple captchas automatically") - if st.button("Search Files", key="bing_rag_search_btn") and search_query: - # Initialize RAG search engine - if not st.session_state.rag_indexed: - rag_search = EnhancedRAGSearch() - - with st.spinner("Indexing files for search..."): - # First download files to extract text - 
temp_dir = "./temp_downloads" - os.makedirs(temp_dir, exist_ok=True) - - async def download_for_indexing(): - downloaded = 0 - async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm: - for i, file_info in enumerate(files): - # Only process common text-based file formats - ext = os.path.splitext(file_info['filename'])[1].lower() - if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']: - path = await dm.download_file(file_info, temp_dir, url) - if path: - with open(path, 'rb') as f: - file_data = f.read() - - # Add to search index - if rag_search.add_file(file_data, file_info): - downloaded += 1 - - # Clean up - os.remove(path) - return downloaded - - indexed_count = asyncio.run(download_for_indexing()) - if indexed_count > 0: - rag_search.build_index() - st.session_state.rag_engine = rag_search - st.session_state.rag_indexed = True - st.success(f"Indexed {indexed_count} files for search") - else: - st.warning("Could not index any files. Try with more text-based documents.") - - # Perform the search - if st.session_state.rag_indexed: - search_results = st.session_state.rag_engine.search(search_query) - - if search_results: - st.write(f"Found {len(search_results)} relevant files:") - - for result in search_results: - file_info = result['file_info'] - score = result['score'] - match_type = result.get('match_type', 'document') - - with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"): - st.write(f"Size: {file_info['size']}") - st.write(f"Match type: {match_type}") - - # Show language if available - if 'language' in result: - st.write(f"Language: {result['language']}") - - # Show metadata if available - if 'metadata' in file_info and file_info['metadata']: - st.write("Metadata:") - for k, v in file_info['metadata'].items(): - if k != 'file_id': # Skip technical details - st.write(f"- {k}: {v}") - - # Show content preview for chunk matches - if 'chunk_preview' in result: - st.write("Content preview:") - st.text(result['chunk_preview']) - - # Add direct download button - if st.button(f"Download this file", key=f"bing_rag_dl_{result['rank']}"): - with st.spinner(f"Downloading {file_info['filename']}..."): - async def download_search_result(): - async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm: - path = await dm.download_file(file_info, download_dir, url) - return path - - path = asyncio.run(download_search_result()) - if path: - with open(path, "rb") as f: - file_data = f.read() - - st.download_button( - label=f"Save {file_info['filename']}", - data=file_data, - file_name=file_info['filename'], - mime=mimetypes.guess_type(path)[0] or "application/octet-stream", - key=f"save_bing_rag_{result['rank']}" - ) - else: - st.warning("No matching files found for your query.") - else: - st.warning("No files found.") + download_timeout = st.slider("Download Timeout (seconds)", + min_value=30, max_value=600, value=300, + help="Maximum time to wait for downloads to complete") + with col2: + user_agent = st.selectbox("User Agent", USER_AGENTS, index=0, + help="Browser identity to use when accessing websites") + + save_screenshots = st.checkbox("Save Browser Screenshots", value=False, + help="Save screenshots when errors occur for debugging") + + browser_lang = st.selectbox("Browser Language", + ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"], + index=0) - # Reset the deep search URL after processing - st.session_state.deep_search_url = None - - # Add a special section 
for direct Google Drive file download - st.markdown("---") - with st.expander("Download View-Only Google Drive Document", expanded=False): - st.write("Download protected/view-only Google Drive documents - just enter the file ID") - file_id = st.text_input("Google Drive File ID", - placeholder="Example: 139CTPrz7jOuJRW6pL6eupH-7B4fnNRku", - help="Enter the ID from the Google Drive URL (e.g., from 'drive.google.com/file/d/THIS_IS_THE_ID/view')") + if st.button("Update Browser Settings"): + st.session_state.stealth_mode = use_stealth + st.success("Browser settings updated!") + + # Dependency installation section + st.markdown("
Dependencies
", unsafe_allow_html=True) + if st.button("Install Playwright Dependencies"): + with st.spinner("Installing dependencies..."): + install_playwright_dependencies() - if st.button("Download Document") and file_id: - download_dir = "./downloads" - os.makedirs(download_dir, exist_ok=True) - output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf") - - with st.spinner("Downloading view-only document... (this may take a minute)"): - async def download_viewonly(): - async with DownloadManager(use_stealth=use_stealth) as dm: - file_info = { - 'url': f"https://drive.google.com/file/d/{file_id}/view", - 'filename': f"gdrive_{file_id}.pdf", - 'metadata': {'file_id': file_id, 'file_type': 'pdf', 'view_only': True} - } - result_path = await dm.force_download_viewonly(file_info, output_path) - return result_path + # Proxy Configuration tab + with config_tabs[1]: + proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy, + help="Route requests through a proxy server for anonymity or bypassing restrictions") + + if proxy_enabled: + proxy_col1, proxy_col2 = st.columns(2) + with proxy_col1: + proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"]) + proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1") + with proxy_col2: + proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080") + proxy_auth = st.text_input("Proxy Authentication (optional)", + placeholder="username:password", type="password") + + st.markdown("
Proxy Rotation
", unsafe_allow_html=True) + use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False, + help="Automatically rotate between multiple proxies for better anonymity") + + if use_proxy_rotation: + proxy_list = st.text_area("Proxy List (one per line)", + placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080") + rotation_interval = st.slider("Rotation Interval (requests)", + min_value=1, max_value=50, value=10, + help="How often to switch proxies") + + if st.button("Save Proxy Configuration"): + # Construct the proxy string + proxy_string = None + if proxy_enabled and proxy_host and proxy_port: + proxy_prefix = f"{proxy_type.lower()}://" + proxy_auth_str = f"{proxy_auth}@" if proxy_auth else "" + proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}" + + # Update session state + st.session_state.use_proxy = proxy_enabled + st.session_state.proxy_string = proxy_string + + # Configure proxy rotation if enabled + # Configure proxy rotation if enabled + if use_proxy_rotation and proxy_list: + PROXY_ROTATION_CONFIG["enabled"] = True + PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval + PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()] + + st.success("Proxy configuration updated!") + + # Download Options tab + with config_tabs[2]: + col1, col2 = st.columns(2) + with col1: + st.markdown("
Download Behavior
", unsafe_allow_html=True) + + skip_existing = st.checkbox("Skip Existing Files", value=True, + help="Don't download files that already exist locally") + + auto_rename = st.checkbox("Auto-Rename Duplicates", value=True, + help="Automatically rename files instead of overwriting") + + verify_downloads = st.checkbox("Verify Downloads", value=True, + help="Check file integrity after download") + + max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3, + help="Number of times to retry failed downloads") + + with col2: + st.markdown("
File Organization
", unsafe_allow_html=True) + + auto_organize = st.checkbox("Auto-Organize Files", value=True, + help="Automatically organize files by type") + + default_dir = st.text_input("Default Download Directory", value="downloads", + help="Default location to save downloaded files") + + org_by_domain = st.checkbox("Organize by Domain", value=False, + help="Create subdirectories based on source domains") + + org_by_type = st.checkbox("Organize by File Type", value=False, + help="Create subdirectories based on file types") + + if st.button("Save Download Settings"): + st.session_state.download_settings = { + "skip_existing": skip_existing, + "auto_rename": auto_rename, + "verify_downloads": verify_downloads, + "max_retries": max_retries, + "auto_organize": auto_organize, + "default_dir": default_dir, + "org_by_domain": org_by_domain, + "org_by_type": org_by_type + } + st.success("Download settings saved!") + + # System tab + with config_tabs[3]: + col1, col2 = st.columns(2) + with col1: + st.markdown("
Memory & Performance
", unsafe_allow_html=True) + + max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3, + help="Maximum number of simultaneous downloads") + + memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024, + help="Maximum memory to use for file processing") + + processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2, + help="Number of threads to use for file processing") + + with col2: + st.markdown("
Logs & Diagnostics
", unsafe_allow_html=True) + + log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1, + help="Detail level for application logs") - result = asyncio.run(download_viewonly()) + save_debug_info = st.checkbox("Save Debug Information", value=False, + help="Save detailed information about program execution") - if result: - st.success("Document downloaded successfully!") + log_dir = st.text_input("Log Directory", value="logs", + help="Directory to save log files") + + if st.button("Apply System Settings"): + st.session_state.system_settings = { + "max_concurrent": max_concurrent, + "memory_limit": memory_limit, + "processing_threads": processing_threads, + "log_level": log_level, + "save_debug_info": save_debug_info, + "log_dir": log_dir + } + # Update logging configuration + log_level_num = getattr(logging, log_level) + logging.getLogger().setLevel(log_level_num) + st.success("System settings applied!") + + # Reset application button + st.markdown("
Application Control
", unsafe_allow_html=True) + reset_col1, reset_col2 = st.columns([1, 3]) + with reset_col1: + if st.button("Reset Application", use_container_width=True): + for key in list(st.session_state.keys()): + if key != 'google_credentials': # Preserve Google auth + del st.session_state[key] + st.success("Application has been reset!") + st.rerun() + with reset_col2: + st.info("This will clear all search results, downloaded files, and reset settings to defaults.") + + # Tab 4: Help + with tabs[3]: + st.markdown("
Help & Documentation
", unsafe_allow_html=True) + + help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"]) + + with help_tabs[0]: + st.markdown(""" + ### Getting Started + + 1. **Enter a URL** on the Search & Download tab + 2. Select a **Search Method**: + - **Deep Search**: Thorough but slower + - **Quick Search**: Fast but may miss some files + - **Exam Site Mode**: Optimized for educational resource sites + 3. Click **Start Search** to find downloadable files + 4. Select files you want to download + 5. Click **Download Selected Files** + + #### Using Different Modes + + Select a mode from the sidebar to optimize the tool for different use cases: + + - **Standard Mode**: Balanced for general use + - **Education Mode**: Optimized for finding academic materials + - **Research Mode**: Better for research papers and datasets + - **Media Mode**: Enhanced for finding images, videos, and audio + + For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials. + """) + + with help_tabs[1]: + st.markdown(""" + ### Advanced Features + + - **Local File Search**: Upload files and search through their content using the enhanced RAG search + - **Custom Extensions**: Specify additional file types to look for beyond the default set + - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers + - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity + - **Google Drive Integration**: Upload downloaded files directly to your Google Drive + + #### Search Tips + + - For educational sites, include specific terms like "exam", "test", "paper" in the URL + - When using Local File Search, try different variations of your query for better results + - Use filtering and sorting options to find the most relevant files quickly + + #### File Organization + + You can configure automatic file organization in the Advanced Configuration tab: + + - **Organize by Domain**: Creates folders based on the source website + - **Organize by File Type**: Separates files into folders by their extension + - **Auto-Rename**: Prevents overwriting existing files with same names + """) + + with help_tabs[2]: + st.markdown(""" + ### Troubleshooting + + #### Common Issues + + - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions + - **Downloads failing**: Check if the site requires authentication or uses captchas + - **Slow performance**: Reduce search depth or disable stealth mode for faster results + - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings + + #### Captcha Issues + + Some websites use captchas to prevent automated access. If you encounter captchas: + + 1. Try using a different proxy + 2. Enable "Handle Captchas Automatically" for simple captchas + 3. For complex captchas, you may need to manually access the site first + + #### Proxy Problems + + If you're having issues with proxies: + + 1. Verify your proxy is working with an external tool + 2. Check that you've entered the correct format (http://host:port) + 3. Some websites may block known proxy IPs + + #### Memory Usage + + If the application is using too much memory: + + 1. Reduce the "Memory Limit" in System settings + 2. Process fewer files at once + 3. 
+
+    # Handle search and download actions
+    if search_button and url:
+        # Reset files and downloaded paths
+        st.session_state.files = []
+        st.session_state.downloaded_paths = []
+        st.session_state.download_complete = False
+
+        # Clear the preset URL if it was used
+        if 'preset_url' in st.session_state:
+            st.session_state.preset_url = ''
+
+        # Prepare custom extensions
+        custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()]
+
+        # Configure proxy from session state
+        proxy_string = st.session_state.proxy_string if st.session_state.use_proxy else None
+
+        # Set up proxy rotation if enabled
+        if 'use_proxy_rotation' in locals() and use_proxy_rotation and proxy_list:
+            PROXY_ROTATION_CONFIG["enabled"] = True
+            PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
+            PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]
+
+        # Configure search parameters based on method
+        sublink_limit = 5000 if search_method == "Deep Search" else 1000
+        search_depth = depth if search_method == "Deep Search" else 1
+        is_exam_site = search_method == "Exam Site Mode"
+
+        # Execute the search asynchronously
+        async def run_search():
+            async with DownloadManager(
+                use_proxy=st.session_state.use_proxy,
+                proxy=proxy_string,
+                use_stealth=st.session_state.stealth_mode,
+                proxy_rotation=PROXY_ROTATION_CONFIG["enabled"]
+            ) as manager:
+                # For exam sites, use specialized approach
+                if is_exam_site:
+                    st.session_state.keep_progress = True
+                    edu_links = await manager.get_edu_exam_links(url)
+                    all_files = []
-        # Provide download button
-        with open(result, "rb") as f:
-            file_bytes = f.read()
+                    progress_text = st.empty()
+                    progress_bar = st.progress(0)
+
+                    # Process each exam link
+                    for i, link in enumerate(edu_links):
+                        progress = (i+1) / max(1, len(edu_links))
+                        progress_text.text(f"Processing exam link {i+1}/{len(edu_links)}: {link}")
+                        progress_bar.progress(progress)
+
+                        files = await manager.extract_downloadable_files(link, custom_ext_list)
+                        all_files.extend(files)
+
+                    st.session_state.files = all_files
+                    progress_text.empty()
+                    progress_bar.empty()
+                    st.session_state.keep_progress = False
-        st.download_button(
-            label="Download PDF",
-            data=file_bytes,
-            file_name=f"gdrive_{file_id}.pdf",
-            mime="application/pdf"
-        )
                else:
-            st.error("Failed to download the document. Please check the file ID and try again.")
-
-    # Add footer with attribution
-    st.markdown('---')
-    st.markdown('Created by [Euler314](https://github.com/euler314)')
-
-# Helper function for Bing search deep search URL setting
-def set_deep_search_url(url):
-    st.session_state.deep_search_url = url
+                    # Use general search method
+                    files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout)
+                    st.session_state.files = files
+
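+        # asyncio.run() blocks until run_search() has finished; st.rerun() then re-executes the
+        # script so the results stored in st.session_state.files are rendered.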
+        # Run the search
+        asyncio.run(run_search())
+        st.rerun()
+
+    # Handle download button
+    if 'download_button' in locals() and download_button and selected_files:
+        # Create download directory
+        os.makedirs(download_dir, exist_ok=True)
+
+        # Reset download state
+        st.session_state.downloaded_paths = []
+        st.session_state.download_complete = False
+
+        # Get selected files
+        files_to_download = [st.session_state.files[i] for i in selected_files]
+
+        # Execute the download asynchronously
+        async def run_download():
+            async with DownloadManager(
+                use_proxy=st.session_state.use_proxy,
+                proxy=st.session_state.proxy_string,
+                use_stealth=st.session_state.stealth_mode
+            ) as manager:
+                download_progress = st.progress(0)
+                status_text = st.empty()
+
+                for i, file_info in enumerate(files_to_download):
+                    progress = i / len(files_to_download)
+                    status_text.text(f"Downloading {i+1}/{len(files_to_download)}: {file_info['filename']}")
+                    download_progress.progress(progress)
+
+                    downloaded_path = await manager.download_file(
+                        file_info,
+                        download_dir,
+                        get_domain(file_info['url'])
+                    )
+
+                    if downloaded_path:
+                        st.session_state.downloaded_paths.append(downloaded_path)
+
+                download_progress.progress(1.0)
+                status_text.text(f"Downloaded {len(st.session_state.downloaded_paths)}/{len(files_to_download)} files successfully!")
+                st.session_state.download_complete = True
+
+        # Run the download
+        asyncio.run(run_download())
+        st.rerun()
+
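+    # The Google Drive upload below runs only when OAuth credentials are present in session
+    # state and at least one file was downloaded in this session; uploads go into a named
+    # Drive folder that is created on first use.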
successfully!") + + # Handle clear button + if clear_button: + st.session_state.files = [] + st.session_state.downloaded_paths = [] + st.session_state.download_complete = False + if 'preset_url' in st.session_state: + st.session_state.preset_url = '' + st.rerun() if __name__ == "__main__": main() \ No newline at end of file