diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -38,12 +38,13 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import docx2txt
+
+# Try to import sentence-transformers for better embeddings
try:
- from langdetect import detect as detect_language
+ from sentence_transformers import SentenceTransformer
+ HAVE_TRANSFORMERS = True
except ImportError:
- # If langdetect is not available, we'll use a simple fallback
- def detect_language(text):
- return "en"
+ HAVE_TRANSFORMERS = False
# Try to download NLTK data if not already present
try:
@@ -54,6 +55,16 @@ except LookupError:
except:
pass
+try:
+    nltk.data.find('corpora/stopwords')
+    from nltk.corpus import stopwords
+    STOPWORDS = set(stopwords.words('english'))
+except LookupError:
+    try:
+        nltk.download('stopwords', quiet=True)
+        from nltk.corpus import stopwords
+        STOPWORDS = set(stopwords.words('english'))
+    except:
+        STOPWORDS = set(['the', 'and', 'a', 'in', 'to', 'of', 'is', 'it', 'that', 'for', 'with', 'as', 'on', 'by'])
+
# -------------------- Logging Setup --------------------
logging.basicConfig(
level=logging.INFO,
@@ -110,22 +121,40 @@ PROXY_ROTATION_CONFIG = {
"proxies": [] # Will be populated from the UI if needed
}
-# -------------------- Enhanced RAG Search Class --------------------
+# -------------------- Enhanced RAG Search with Small LLM --------------------
class EnhancedRAGSearch:
def __init__(self):
self.file_texts = []
self.chunks = [] # Document chunks for more targeted search
self.chunk_metadata = [] # Metadata for each chunk
self.file_metadata = []
- self.vectorizer = TfidfVectorizer(
- stop_words='english',
- ngram_range=(1, 2), # Use bigrams for better context
- max_features=10000, # Use more features for better representation
- min_df=2 # Minimum document frequency
- )
+ self.languages = []
+ self.model = None
+
+ # Try to load the sentence transformer model if available
+ if HAVE_TRANSFORMERS:
+ try:
+ # Use a small, efficient model
+ self.model = SentenceTransformer('all-MiniLM-L6-v2')
+ self.use_transformer = True
+ logger.info("Using sentence-transformers for RAG")
+ except Exception as e:
+ logger.warning(f"Error loading sentence-transformer: {e}")
+ self.use_transformer = False
+ else:
+ self.use_transformer = False
+
+ # Fallback to TF-IDF if transformers not available
+ if not self.use_transformer:
+ self.vectorizer = TfidfVectorizer(
+ stop_words='english',
+ ngram_range=(1, 2), # Use bigrams for better context
+ max_features=15000, # Use more features for better representation
+ min_df=1 # Include rare terms
+ )
+
self.vectors = None
self.chunk_vectors = None
- self.languages = []
def add_file(self, file_data, file_info):
"""Add a file to the search index with improved processing"""
@@ -139,7 +168,10 @@ class EnhancedRAGSearch:
# Try to detect language
try:
- lang = detect_language(text[:1000]) # Use just the first 1000 chars for speed
+ # Simple language detection based on stopwords
+ words = re.findall(r'\b\w+\b', text.lower())
+ english_stopwords_ratio = len([w for w in words[:100] if w in STOPWORDS]) / max(1, len(words[:100]))
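+            # Heuristic: treat the text as English if more than 20% of the first 100 words are English stopwords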
+ lang = 'en' if english_stopwords_ratio > 0.2 else 'unknown'
self.languages.append(lang)
except:
self.languages.append('en') # Default to English
@@ -242,12 +274,28 @@ class EnhancedRAGSearch:
return False
try:
- # Build document-level index
- self.vectors = self.vectorizer.fit_transform(self.file_texts)
-
- # Build chunk-level index if we have chunks
- if self.chunks:
- self.chunk_vectors = self.vectorizer.transform(self.chunks)
+ if self.use_transformer:
+ # Use sentence transformer models for embeddings
+ logger.info("Building document and chunk embeddings with transformer model...")
+ self.vectors = self.model.encode(self.file_texts, show_progress_bar=False)
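+                # encode() returns a dense NumPy array (one 384-dim vector per text for all-MiniLM-L6-v2),
+                # unlike the sparse matrix produced by the TF-IDF fallback below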
+
+ # Build chunk-level index if we have chunks
+ if self.chunks:
+ # Process in batches to avoid memory issues
+ batch_size = 32
+ chunk_vectors = []
+ for i in range(0, len(self.chunks), batch_size):
+ batch = self.chunks[i:i+batch_size]
+ batch_vectors = self.model.encode(batch, show_progress_bar=False)
+ chunk_vectors.append(batch_vectors)
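+                    # Stack the per-batch embeddings into one (num_chunks x embedding_dim) matrix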
+ self.chunk_vectors = np.vstack(chunk_vectors)
+ else:
+ # Build document-level index
+ self.vectors = self.vectorizer.fit_transform(self.file_texts)
+
+ # Build chunk-level index if we have chunks
+ if self.chunks:
+ self.chunk_vectors = self.vectorizer.transform(self.chunks)
return True
except Exception as e:
@@ -255,21 +303,58 @@ class EnhancedRAGSearch:
return False
def expand_query(self, query):
- """Add related terms to query for better recall"""
- # This is a simple implementation - could be enhanced with a proper synonym API
- expanded_terms = []
+ """Add related terms to query for better recall - mini LLM function"""
+ # Dictionary of related terms for common keywords
+ expansions = {
+ "exam": ["test", "assessment", "quiz", "paper", "exam paper", "past paper", "past exam"],
+ "test": ["exam", "quiz", "assessment", "paper"],
+ "document": ["file", "paper", "report", "doc", "documentation"],
+ "manual": ["guide", "instruction", "documentation", "handbook"],
+ "tutorial": ["guide", "instructions", "how-to", "lesson"],
+ "article": ["paper", "publication", "journal", "research"],
+ "research": ["study", "investigation", "paper", "analysis"],
+ "book": ["textbook", "publication", "volume", "edition"],
+ "thesis": ["dissertation", "paper", "research", "study"],
+ "report": ["document", "paper", "analysis", "summary"],
+ "assignment": ["homework", "task", "project", "work"],
+ "lecture": ["class", "presentation", "talk", "lesson"],
+ "notes": ["annotations", "summary", "outline", "study material"],
+ "syllabus": ["curriculum", "course outline", "program", "plan"],
+ "paper": ["document", "article", "publication", "exam", "test"],
+ "question": ["problem", "query", "exercise", "inquiry"],
+ "solution": ["answer", "resolution", "explanation", "result"],
+ "reference": ["source", "citation", "bibliography", "resource"],
+ "analysis": ["examination", "study", "evaluation", "assessment"],
+ "guide": ["manual", "instruction", "handbook", "tutorial"],
+ "worksheet": ["exercise", "activity", "handout", "practice"],
+ "review": ["evaluation", "assessment", "critique", "feedback"],
+ "material": ["resource", "content", "document", "information"],
+ "data": ["information", "statistics", "figures", "numbers"]
+ }
+
+ # Enhanced query expansion simulating a mini-LLM
+ query_words = re.findall(r'\b\w+\b', query.lower())
+ expanded_terms = set()
- # Add some common expansions for document search
- if "exam" in query.lower():
- expanded_terms.extend(["test", "assessment", "quiz", "paper"])
- elif "document" in query.lower():
- expanded_terms.extend(["file", "paper", "report"])
- elif "manual" in query.lower():
- expanded_terms.extend(["guide", "instruction", "documentation"])
+ # Directly add expansions from our dictionary
+ for word in query_words:
+ if word in expansions:
+ expanded_terms.update(expansions[word])
+ # Add common academic file formats if not already included
+ if any(term in query.lower() for term in ["file", "document", "download", "paper"]):
+ if not any(ext in query.lower() for ext in ["pdf", "docx", "ppt", "excel"]):
+ expanded_terms.update(["pdf", "docx", "pptx", "xlsx"])
+
+ # Add special academic terms when the query seems related to education
+ if any(term in query.lower() for term in ["course", "university", "college", "school", "class"]):
+ expanded_terms.update(["syllabus", "lecture", "notes", "textbook"])
+
# Return original query plus expanded terms
if expanded_terms:
- return f"{query} {' '.join(expanded_terms)}"
+ expanded_query = f"{query} {' '.join(expanded_terms)}"
+ logger.info(f"Expanded query: '{query}' -> '{expanded_query}'")
+ return expanded_query
return query
def search(self, query, top_k=5, search_chunks=True):
@@ -277,58 +362,114 @@ class EnhancedRAGSearch:
if self.vectors is None:
return []
+ # Simulate a small LLM by expanding the query with related terms
+ expanded_query = self.expand_query(query)
+
try:
- # Expand the query for better recall
- expanded_query = self.expand_query(query)
-
- # Transform the query
- query_vector = self.vectorizer.transform([expanded_query])
-
results = []
- # First search at document level for higher-level matches
- if self.vectors is not None:
- doc_similarities = cosine_similarity(query_vector, self.vectors).flatten()
- top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
-
- for i, idx in enumerate(top_doc_indices):
- if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results
- results.append({
- 'file_info': self.file_metadata[idx],
- 'score': float(doc_similarities[idx]),
- 'rank': i+1,
- 'match_type': 'document',
- 'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
- })
-
- # Then search at chunk level for more specific matches if enabled
- if search_chunks and self.chunk_vectors is not None:
- chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten()
- top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results
+ if self.use_transformer:
+ # Transform the query to embedding
+ query_vector = self.model.encode([expanded_query])[0]
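+                # encode() on a single-item list yields one 1-D vector; reshape(1, -1) below turns it into a row matrix for cosine_similarity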
- # Use a set to avoid duplicate file results
- seen_files = set(r['file_info']['url'] for r in results)
+ # First search at document level for higher-level matches
+ if self.vectors is not None:
+ # Compute similarities between query and documents
+ doc_similarities = cosine_similarity(
+ query_vector.reshape(1, -1),
+ self.vectors
+ ).flatten()
+
+ top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
+
+ for i, idx in enumerate(top_doc_indices):
+ if doc_similarities[idx] > 0.2: # Threshold to exclude irrelevant results
+ results.append({
+ 'file_info': self.file_metadata[idx],
+ 'score': float(doc_similarities[idx]),
+ 'rank': i+1,
+ 'match_type': 'document',
+ 'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
+ })
- for i, idx in enumerate(top_chunk_indices):
- if chunk_similarities[idx] > 0.15: # Higher threshold for chunks
- file_index = self.chunk_metadata[idx]['file_index']
- file_info = self.file_metadata[file_index]
-
- # Only add if we haven't already included this file
- if file_info['url'] not in seen_files:
- seen_files.add(file_info['url'])
+ # Then search at chunk level for more specific matches if enabled
+ if search_chunks and self.chunk_vectors is not None:
+ # Compute similarities between query and chunks
+ chunk_similarities = cosine_similarity(
+ query_vector.reshape(1, -1),
+ self.chunk_vectors
+ ).flatten()
+
+ top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results
+
+ # Use a set to avoid duplicate file results
+ seen_files = set(r['file_info']['url'] for r in results)
+
+ for i, idx in enumerate(top_chunk_indices):
+ if chunk_similarities[idx] > 0.25: # Higher threshold for chunks
+ file_index = self.chunk_metadata[idx]['file_index']
+ file_info = self.file_metadata[file_index]
+
+ # Only add if we haven't already included this file
+ if file_info['url'] not in seen_files:
+ seen_files.add(file_info['url'])
+ results.append({
+ 'file_info': file_info,
+ 'score': float(chunk_similarities[idx]),
+ 'rank': len(results) + 1,
+ 'match_type': 'chunk',
+ 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
+ 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
+ })
+
+ # Stop after we've found enough results
+ if len(results) >= top_k*1.5:
+ break
+ else:
+ # Fallback to TF-IDF if transformers not available
+ query_vector = self.vectorizer.transform([expanded_query])
+
+ # First search at document level
+ if self.vectors is not None:
+ doc_similarities = cosine_similarity(query_vector, self.vectors).flatten()
+ top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
+
+ for i, idx in enumerate(top_doc_indices):
+ if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results
results.append({
- 'file_info': file_info,
- 'score': float(chunk_similarities[idx]),
- 'rank': len(results) + 1,
- 'match_type': 'chunk',
- 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
- 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
+ 'file_info': self.file_metadata[idx],
+ 'score': float(doc_similarities[idx]),
+ 'rank': i+1,
+ 'match_type': 'document',
+ 'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
})
+
+ # Then search at chunk level if enabled
+ if search_chunks and self.chunk_vectors is not None:
+ chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten()
+ top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1]
+
+ # Avoid duplicates
+ seen_files = set(r['file_info']['url'] for r in results)
+
+ for i, idx in enumerate(top_chunk_indices):
+ if chunk_similarities[idx] > 0.15:
+ file_index = self.chunk_metadata[idx]['file_index']
+ file_info = self.file_metadata[file_index]
- # Stop after we've found enough results
- if len(results) >= top_k*1.5:
- break
+ if file_info['url'] not in seen_files:
+ seen_files.add(file_info['url'])
+ results.append({
+ 'file_info': file_info,
+ 'score': float(chunk_similarities[idx]),
+ 'rank': len(results) + 1,
+ 'match_type': 'chunk',
+ 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
+ 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
+ })
+
+ if len(results) >= top_k*1.5:
+ break
# Sort combined results by score
results.sort(key=lambda x: x['score'], reverse=True)
@@ -3658,614 +3799,1018 @@ class DownloadManager:
# -------------------- Main App --------------------
def main():
- st.title("Advanced File Downloader")
+    st.set_page_config(page_title="Advanced File Downloader", layout="wide", page_icon="📥")
- # Initialize playwright if needed
- if "playwright_installed" not in st.session_state:
- with st.spinner("Setting up browser automation. This may take a minute..."):
- install_playwright_dependencies()
- st.session_state.playwright_installed = True
+ # Custom CSS for better appearance
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
- if "initialized" not in st.session_state:
- st.session_state.initialized = True
- st.session_state.discovered_files = []
- st.session_state.current_url = None
- st.session_state.google_creds = None
- st.session_state.selected_files = []
- st.session_state.do_deep_search = False
- st.session_state.deep_search_url = None
- st.session_state.search_results = []
- # For RAG search
- st.session_state.rag_indexed = False
- st.session_state.rag_engine = None
-
+ # Initialize session state for storing files
+ if 'files' not in st.session_state:
+ st.session_state.files = []
+ if 'downloaded_paths' not in st.session_state:
+ st.session_state.downloaded_paths = []
+ if 'download_complete' not in st.session_state:
+ st.session_state.download_complete = False
+ if 'selected_tab' not in st.session_state:
+ st.session_state.selected_tab = 0
+ if 'rag_search' not in st.session_state:
+ st.session_state.rag_search = EnhancedRAGSearch()
+ if 'keep_progress' not in st.session_state:
+ st.session_state.keep_progress = False
+ if 'google_credentials' not in st.session_state:
+ st.session_state.google_credentials = None
+ if 'mode' not in st.session_state:
+ st.session_state.mode = "Standard"
+ if 'use_proxy' not in st.session_state:
+ st.session_state.use_proxy = False
+ if 'proxy_string' not in st.session_state:
+ st.session_state.proxy_string = None
+ if 'stealth_mode' not in st.session_state:
+ st.session_state.stealth_mode = True
+
+ # ============================
+ # SIDEBAR
+ # ============================
with st.sidebar:
- mode = st.radio("Select Mode", ["Manual URL", "Bing Search"], key="mode_select")
- with st.expander("Advanced Options", expanded=True):
- custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
- max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
- sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
- use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
- proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
- use_stealth = st.checkbox("Use Stealth Mode (harder to detect)", value=True, key="stealth_checkbox")
-
- with st.expander("Google Drive Integration", expanded=False):
- if st.button("Start Google Sign-In", key="google_signin_btn"):
+ st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=50)
+        st.markdown("", unsafe_allow_html=True)
+
+ # Mode Selection
+ st.markdown("", unsafe_allow_html=True)
+
+ # Quick Settings
+ st.markdown("", unsafe_allow_html=True)
+
+ # Google Drive Integration
+ st.markdown("", unsafe_allow_html=True)
+
+ # Preset buttons for common EDU sites
+ if st.session_state.mode == "Education Mode":
+ st.markdown("", unsafe_allow_html=True)
- # Proxy rotation settings
- st.write("**Proxy Rotation**")
- enable_rotation = st.checkbox("Enable Proxy Rotation", value=False, key="enable_rotation")
- if enable_rotation:
- PROXY_ROTATION_CONFIG["enabled"] = True
- proxy_list = st.text_area(
- "Proxy List (one per line)",
- placeholder="http://proxy1:port\nhttp://proxy2:port",
- key="proxy_list"
- )
- if proxy_list:
- PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.split("\n") if p.strip()]
- rotation_interval = st.slider(
- "Rotation Interval (# of requests)",
- min_value=1,
- max_value=50,
- value=10,
- key="rotation_interval"
- )
- PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
+ # Tool status
+ st.markdown("", unsafe_allow_html=True)
+
+ # App info
+ st.markdown("", unsafe_allow_html=True)
- if mode == "Manual URL":
- st.header("Manual URL Mode")
- url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
+ # ============================
+ # MAIN CONTENT AREA
+ # ============================
+
+ # Header section
+ col1, col2 = st.columns([5, 1])
+ with col1:
+        st.markdown("Advanced File Downloader", unsafe_allow_html=True)
+ with col2:
+ st.image("https://img.icons8.com/color/96/000000/download--v1.png", width=70)
+
+ mode_descriptions = {
+ "Standard": "A versatile tool for discovering and downloading files from any website.",
+ "Education Mode": "Optimized for educational resources, exams, and academic materials.",
+ "Research Mode": "Focused on research papers, datasets, and academic publications.",
+ "Media Mode": "Enhanced for finding and downloading images, videos, and audio files."
+ }
+
+    st.markdown(f"{mode_descriptions[st.session_state.mode]}", unsafe_allow_html=True)
+
+ # Main tabs
+ tabs = st.tabs(["Search & Download", "Local File Search", "Advanced Configuration", "Help"])
+
+ # Tab 1: Search & Download
+ with tabs[0]:
+ st.markdown("", unsafe_allow_html=True)
+
col1, col2 = st.columns([3, 1])
with col1:
- if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
- if url:
- custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
- valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
- if custom_ext_list != valid_ext_list:
- st.warning("Invalid extensions ignored. Use format like '.csv'.")
-
- # Reset RAG engine for new search
- st.session_state.rag_indexed = False
- st.session_state.rag_engine = None
-
- # Define a function to run the deep search
- async def run_deep_search():
- async with DownloadManager(
- use_proxy=use_proxy,
- proxy=proxy,
- use_stealth=use_stealth
- ) as dm:
- files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
- return files
-
- with st.spinner("Searching for files..."):
- files = asyncio.run(run_deep_search())
-
- if files:
- st.session_state.discovered_files = files
- st.session_state.current_url = url
- st.success(f"Found {len(files)} files!")
- else:
- st.warning("No files found.")
-
- if st.session_state.discovered_files:
- files = st.session_state.discovered_files
-
- # Display files with direct download buttons
- download_dir = "./downloads"
- os.makedirs(download_dir, exist_ok=True)
-
- # Add RAG Search interface
- st.markdown("### Search Within Discovered Files")
- search_query = st.text_input("Enter search terms", key="rag_search_query")
-
- if st.button("Search Files", key="rag_search_btn") and search_query:
- # Initialize RAG search engine
- if not st.session_state.rag_indexed:
- rag_search = EnhancedRAGSearch()
-
- with st.spinner("Indexing files for search..."):
- # First download files to extract text
- temp_dir = "./temp_downloads"
- os.makedirs(temp_dir, exist_ok=True)
-
- async def download_for_indexing():
- downloaded = 0
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
- for i, file_info in enumerate(files):
- # Only process common text-based file formats
- ext = os.path.splitext(file_info['filename'])[1].lower()
- if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']:
- path = await dm.download_file(file_info, temp_dir, url)
- if path:
- with open(path, 'rb') as f:
- file_data = f.read()
-
- # Add to search index
- if rag_search.add_file(file_data, file_info):
- downloaded += 1
-
- # Clean up
- os.remove(path)
- return downloaded
-
- indexed_count = asyncio.run(download_for_indexing())
- if indexed_count > 0:
- rag_search.build_index()
- st.session_state.rag_engine = rag_search
- st.session_state.rag_indexed = True
- st.success(f"Indexed {indexed_count} files for search")
- else:
- st.warning("Could not index any files. Try with more text-based documents.")
+ url = st.text_input("Enter a URL to search for downloadable files:",
+ placeholder="e.g., https://example.com/resources",
+ value=st.session_state.get('preset_url', ''))
+ with col2:
+ # Initialize search_method with either session state or default value
+ initial_search_method = st.session_state.get('search_method', "Deep Search")
+ search_method = st.selectbox("Search Method",
+ ["Deep Search", "Quick Search", "Exam Site Mode"],
+ index=["Deep Search", "Quick Search", "Exam Site Mode"].index(initial_search_method))
+ # Update session state when changed
+ if search_method != st.session_state.get('search_method'):
+ st.session_state.search_method = search_method
+
+ # Advanced options in an expander
+ with st.expander("Search Options", expanded=False):
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ depth = st.slider("Search Depth", min_value=1, max_value=5, value=2,
+ help="Higher values will search more links but take longer")
+ prioritize_pdfs = st.checkbox("Prioritize PDFs",
+ value=st.session_state.get('prioritize_pdfs', True),
+ help="Focus on finding PDF files first")
+ with col2:
+ timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=60)
+ follow_subdomains = st.checkbox("Follow Subdomains", value=True,
+ help="Include links from subdomains in the search")
+ with col3:
+ # Default extensions based on mode
+ default_extensions = {
+ "Standard": ".pdf,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.zip",
+ "Education Mode": ".pdf,.doc,.docx,.ppt,.pptx",
+ "Research Mode": ".pdf,.txt,.csv,.json,.xlsx",
+ "Media Mode": ".jpg,.png,.mp3,.mp4,.avi,.mov"
+ }
- # Perform the search
- if st.session_state.rag_indexed:
- search_results = st.session_state.rag_engine.search(search_query)
-
- if search_results:
- st.write(f"Found {len(search_results)} relevant files:")
-
- for result in search_results:
- file_info = result['file_info']
- score = result['score']
- match_type = result.get('match_type', 'document')
-
- with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
- st.write(f"Size: {file_info['size']}")
- st.write(f"Match type: {match_type}")
-
- # Show language if available
- if 'language' in result:
- st.write(f"Language: {result['language']}")
-
- # Show metadata if available
- if 'metadata' in file_info and file_info['metadata']:
- st.write("Metadata:")
- for k, v in file_info['metadata'].items():
- if k != 'file_id': # Skip technical details
- st.write(f"- {k}: {v}")
-
- # Show content preview for chunk matches
- if 'chunk_preview' in result:
- st.write("Content preview:")
- st.text(result['chunk_preview'])
-
- # Add direct download button
- if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
- with st.spinner(f"Downloading {file_info['filename']}..."):
- async def download_search_result():
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
- path = await dm.download_file(file_info, download_dir, url)
- return path
-
- path = asyncio.run(download_search_result())
- if path:
- with open(path, "rb") as f:
- file_data = f.read()
-
- st.download_button(
- label=f"Save {file_info['filename']}",
- data=file_data,
- file_name=file_info['filename'],
- mime=mimetypes.guess_type(path)[0] or "application/octet-stream",
- key=f"save_rag_{result['rank']}"
- )
- else:
- st.warning("No matching files found for your query.")
-
- # Show all files with direct download options
- st.markdown("### All Discovered Files")
+ custom_extensions = st.text_area(
+ "Custom File Extensions",
+ value=st.session_state.get('custom_extensions', default_extensions[st.session_state.mode]),
+ help="Comma-separated list of file extensions to look for"
+ )
+
+ # Update session state when extensions changed
+ if 'custom_extensions' not in st.session_state or custom_extensions != st.session_state.custom_extensions:
+ st.session_state.custom_extensions = custom_extensions
+
+ search_col1, search_col2 = st.columns([4, 1])
+ with search_col1:
+        search_button = st.button("🔍 Start Search", use_container_width=True)
+ with search_col2:
+        clear_button = st.button("🧹 Clear Results", use_container_width=True)
+
+ # File results section
+ if st.session_state.files:
+ st.markdown("", unsafe_allow_html=True)
+
+ # File filtering options
+ filter_col1, filter_col2, filter_col3 = st.columns([2, 2, 1])
+ with filter_col1:
+ file_filter = st.text_input("Filter files by name:", placeholder="e.g., exam, 2023, etc.")
+ with filter_col2:
+ sort_option = st.selectbox("Sort by:", ["Relevance", "Name", "Size (Largest)", "Size (Smallest)"])
+ with filter_col3:
+ show_only_pdfs = st.checkbox("PDFs Only", value=False)
+
+ # Sort files based on selection
+ sorted_files = list(st.session_state.files)
+ if sort_option == "Name":
+ sorted_files.sort(key=lambda x: x['filename'])
+ elif sort_option == "Size (Largest)":
+ # Convert size strings to comparable values
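+                # e.g. "2.5 MB" -> 2.5 * 1024**2 bytes; "Unknown" sizes sort as 0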
+ def parse_size(size_str):
+ if 'Unknown' in size_str:
+ return 0
+ try:
+ value = float(size_str.split(' ')[0])
+ unit = size_str.split(' ')[1]
+ multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
+ return value * multipliers.get(unit, 0)
+ except:
+ return 0
+
+ sorted_files.sort(key=lambda x: parse_size(x['size']), reverse=True)
+ elif sort_option == "Size (Smallest)":
+ def parse_size(size_str):
+ if 'Unknown' in size_str:
+ return float('inf')
+ try:
+ value = float(size_str.split(' ')[0])
+ unit = size_str.split(' ')[1]
+ multipliers = {'bytes': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
+ return value * multipliers.get(unit, 0)
+ except:
+ return float('inf')
+
+ sorted_files.sort(key=lambda x: parse_size(x['size']))
- # Batch download options
- col1, col2 = st.columns([1, 4])
- with col1:
- if st.button("Select All", key="select_all_btn"):
- st.session_state.selected_files = list(range(len(files)))
- if st.button("Clear Selection", key="clear_selection_btn"):
- st.session_state.selected_files = []
-
- # Batch download settings
- if 'selected_files' in st.session_state and st.session_state.selected_files:
- batch_col1, batch_col2, batch_col3, batch_col4 = st.columns(4)
- with batch_col1:
- download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
- with batch_col2:
- create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
- with batch_col3:
- delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
- with batch_col4:
- upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
-
- if st.button("Download Selected", key="download_batch_btn"):
- if not os.path.exists(download_dir):
- os.makedirs(download_dir)
-
- async def download_files():
- downloaded_paths = []
- progress_bar = st.progress(0)
- status_text = st.empty()
+ # File list with selection
+ file_container = st.container()
+ with file_container:
+ selected_files = []
+ displayed_files = []
+
+ for i, file in enumerate(sorted_files):
+ # Apply filters
+ if file_filter and file_filter.lower() not in file['filename'].lower():
+ continue
+ if show_only_pdfs and not file['filename'].lower().endswith('.pdf'):
+ continue
- async with DownloadManager(
- use_proxy=use_proxy,
- proxy=proxy,
- use_stealth=use_stealth
- ) as dm:
- for i, idx in enumerate(st.session_state.selected_files):
- progress = (i + 1) / len(st.session_state.selected_files)
- file_info = files[idx]
- status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(st.session_state.selected_files)})")
- progress_bar.progress(progress)
-
- path = await dm.download_file(file_info, download_dir, url)
- if path:
- downloaded_paths.append(path)
+ displayed_files.append(i)
+ with st.container():
+ col1, col2, col3, col4 = st.columns([0.5, 3, 1, 1])
+ with col1:
+ selected = st.checkbox("", key=f"select_{i}", value=True)
+ if selected:
+ selected_files.append(i)
+ with col2:
+                            file_icon = "📄"
+                            if file['filename'].lower().endswith('.pdf'):
+                                file_icon = "📕"
+                            elif file['filename'].lower().endswith(('.doc', '.docx')):
+                                file_icon = "📝"
+                            elif file['filename'].lower().endswith(('.xls', '.xlsx')):
+                                file_icon = "📊"
+                            elif file['filename'].lower().endswith(('.ppt', '.pptx')):
+                                file_icon = "🖼️"
+                            elif file['filename'].lower().endswith(('.jpg', '.png', '.gif')):
+                                file_icon = "🖼️"
+                            elif file['filename'].lower().endswith(('.mp3', '.wav')):
+                                file_icon = "🎵"
+                            elif file['filename'].lower().endswith(('.mp4', '.avi', '.mov')):
+                                file_icon = "🎬"
- status_text.empty()
- progress_bar.empty()
- return downloaded_paths
-
- with st.spinner("Downloading files..."):
- downloaded = asyncio.run(download_files())
-
- if downloaded:
- st.success(f"Successfully downloaded {len(downloaded)} files")
+ st.markdown(f"**{file_icon} {file['filename']}**")
+ st.markdown(f"{file['url'][:60]}...", unsafe_allow_html=True)
+ with col3:
+ st.markdown(f"**Size:** {file['size']}")
+ with col4:
+ st.button("Preview", key=f"preview_{i}")
- if create_zip:
- zip_path = create_zip_file(downloaded, download_dir)
- st.success(f"Created ZIP file: {zip_path}")
-
- # Provide download link for the zip file
- with open(zip_path, "rb") as f:
- zip_data = f.read()
-
- st.download_button(
- label="Download ZIP",
- data=zip_data,
- file_name=os.path.basename(zip_path),
- mime="application/zip",
- key="download_zip_btn"
- )
-
- # Upload to Google Drive if requested
- if upload_to_drive and st.session_state.google_creds:
- drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds)
- folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}")
- drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id)
- if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
- st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
- else:
- st.error(drive_id)
-
- # Delete original files if requested
- if delete_after:
- for path in downloaded:
- try:
- os.remove(path)
- except Exception as e:
- st.warning(f"Could not delete {path}: {e}")
- st.info("Deleted original files after ZIP creation")
-
- # Individual file display with direct download buttons
- for i, file in enumerate(files):
- col1, col2, col3 = st.columns([3, 1, 1])
+ st.divider()
+
+ if not displayed_files:
+ st.info("No files match your current filters. Try adjusting your search criteria.")
+
+ # Download options
+ if selected_files:
+ col1, col2 = st.columns(2)
with col1:
- filename = file['filename']
- size = file['size']
- meta = file.get('metadata', {})
- file_info = f"{filename} ({size})"
- if meta and 'Pages' in meta:
- file_info += f" - {meta.get('Pages', '')} pages"
- st.markdown(f"**{i+1}. {file_info}**")
-
+ download_dir = st.text_input("Download Directory", value="downloads")
with col2:
- # Add direct download button for each file
- if st.button(f"Download", key=f"direct_dl_{i}"):
- with st.spinner(f"Downloading {filename}..."):
- async def download_single_file():
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
- path = await dm.download_file(file, download_dir, url)
- return path
-
- downloaded_path = asyncio.run(download_single_file())
- if downloaded_path:
- with open(downloaded_path, "rb") as f:
- file_data = f.read()
-
- st.download_button(
- label=f"Save {filename}",
- data=file_data,
- file_name=filename,
- mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
- key=f"save_file_{i}"
- )
+ download_option = st.radio("Download as", ["Individual Files", "ZIP Archive"], horizontal=True)
- with col3:
- # Add to selection for batch download
- if i in st.session_state.selected_files:
- if st.button("Unselect", key=f"unselect_{i}"):
- st.session_state.selected_files.remove(i)
+ download_col1, download_col2, download_col3 = st.columns([3, 1, 1])
+ with download_col1:
+                    download_button = st.button("⬇️ Download Selected Files", use_container_width=True)
+ with download_col2:
+                    google_drive_button = st.button("📤 Upload to Drive",
+ use_container_width=True,
+ disabled=not st.session_state.google_credentials)
+ with download_col3:
+ select_all = st.button("Select All Files", use_container_width=True)
+
+ # Handle select all button
+ if select_all:
+ for i in displayed_files:
+ st.session_state[f"select_{i}"] = True
+ st.rerun()
+
+ # Download progress/results
+ if st.session_state.download_complete:
+                st.success(f"✅ Downloaded {len(st.session_state.downloaded_paths)} files successfully!")
+ download_links = []
+ for path in st.session_state.downloaded_paths:
+ with open(path, "rb") as f:
+ file_content = f.read()
+ file_name = os.path.basename(path)
+ download_links.append((file_name, file_content))
+
+ if len(download_links) > 0:
+ if download_option == "ZIP Archive":
+ # Create ZIP archive for download
+ zip_path = create_zip_file(st.session_state.downloaded_paths, download_dir)
+ with open(zip_path, "rb") as f:
+ zip_content = f.read()
+                        st.download_button("📦 Download ZIP Archive",
+ zip_content,
+ file_name=os.path.basename(zip_path),
+ mime="application/zip")
else:
- if st.button("Select", key=f"select_{i}"):
- st.session_state.selected_files.append(i)
-
- elif mode == "Bing Search":
- st.header("Bing Search Mode")
- query = st.text_input("Enter search query", key="search_query_input")
- num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
+ # Show individual file download links
+                        st.markdown("Download Files", unsafe_allow_html=True)
+
+ # Create a grid of download buttons
+ cols = st.columns(3)
+ for idx, (name, content) in enumerate(download_links):
+ mime_type = mimetypes.guess_type(name)[0] or 'application/octet-stream'
+ with cols[idx % 3]:
+ st.download_button(
+                                f"📄 {name}",
+ content,
+ file_name=name,
+ mime=mime_type,
+ key=f"dl_{name}",
+ use_container_width=True
+ )
+
+ # Tab 2: Local File Search
+ with tabs[1]:
+ st.markdown("", unsafe_allow_html=True)
+ st.write("Upload files to search through their content with AI-powered semantic search.")
+
+ # File upload
+ uploaded_files = st.file_uploader("Upload documents for search",
+ accept_multiple_files=True,
+ type=['pdf', 'docx', 'txt', 'csv', 'json'])
- if st.button("Search", key="search_btn"):
- if query:
- async def run_search():
- async with DownloadManager(
- use_proxy=use_proxy,
- proxy=proxy,
- query=query,
- num_results=num_results,
- use_stealth=use_stealth
- ) as dm:
+ if uploaded_files:
+ # Build search index on upload
+ col1, col2 = st.columns([4, 1])
+ with col1:
+ use_transformer = st.checkbox("Use AI Transformer Model", value=HAVE_TRANSFORMERS,
+ help="Uses advanced AI for more accurate semantic search (if available)")
+ with col2:
+ if st.button("Build Search Index", use_container_width=True):
+ with st.spinner("Processing files and building search index..."):
+ files_added = 0
+ for uploaded_file in uploaded_files:
+ file_info = {
+ 'filename': uploaded_file.name,
+ 'url': f'local://{uploaded_file.name}',
+ 'size': humanize_file_size(uploaded_file.size)
+ }
+ success = st.session_state.rag_search.add_file(uploaded_file.getvalue(), file_info)
+ if success:
+ files_added += 1
+
+ if files_added > 0:
+ index_built = st.session_state.rag_search.build_index()
+ if index_built:
+                                st.success(f"✅ Successfully indexed {files_added} files!")
+ else:
+ st.error("Failed to build search index.")
+ else:
+ st.warning("No valid text could be extracted from the files.")
+
+ # Search interface
+ st.markdown("", unsafe_allow_html=True)
+
+ col1, col2 = st.columns([4, 1])
+ with col1:
+ query = st.text_input("Enter search query:", placeholder="e.g., neural networks, climate change")
+ with col2:
+ expand_query = st.checkbox("Auto-expand query", value=True,
+ help="Automatically add related terms to your search")
+
+ col1, col2 = st.columns([4, 1])
+ with col1:
+            if st.button("🔍 Search Documents", use_container_width=True):
+ if not query:
+ st.warning("Please enter a search query")
+ else:
with st.spinner("Searching..."):
- urls = await dm.search_bing()
- if urls:
- st.session_state.search_results = urls
- st.success(f"Found {len(urls)} results!")
-
- # Create expanders for each result
- for i, url in enumerate(urls, 1):
- with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
- st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}", on_click=set_deep_search_url, args=(url,))
+ results = st.session_state.rag_search.search(query, top_k=5, search_chunks=True)
+
+ if results:
+ st.markdown(f"**Found {len(results)} relevant documents:**")
+ for i, result in enumerate(results):
+ with st.container():
+ st.markdown(f"", unsafe_allow_html=True)
+ st.markdown(f"**{i+1}. {result['file_info']['filename']}** (Score: {result['score']:.2f})")
+
+ if result.get('chunk_preview'):
+ st.markdown("**Matching content:**")
+ st.text(result['chunk_preview'])
+
+                                    st.markdown("", unsafe_allow_html=True)
else:
- st.warning("No search results found.")
+ st.info("No matching documents found. Try a different query.")
+ with col2:
+ num_results = st.number_input("Max results", min_value=1, max_value=20, value=5)
+
+ # Quick search tips
+ with st.expander("Search Tips", expanded=False):
+ st.markdown("""
+ ### Effective Search Tips
- asyncio.run(run_search())
+ - **Be specific** with your queries for more accurate results
+ - **Try different phrasings** if you don't get the results you expect
+ - Use **quotation marks** for exact phrase matching
+ - For **complex topics**, break down your search into multiple queries
+ - **Combine related terms** to improve recall
+
+ The search engine uses advanced algorithms to understand the semantic meaning of your query,
+ not just keyword matching.
+ """)
+
+ # Tab 3: Advanced Configuration
+ with tabs[2]:
+ st.markdown("", unsafe_allow_html=True)
- # Handle deep search - using on_click function to avoid state issues
- if 'deep_search_url' in st.session_state and st.session_state.deep_search_url:
- url = st.session_state.deep_search_url
- st.info(f"Deep searching: {url}")
-
- # Set up custom extensions
- custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
- valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
-
- # Reset RAG engine for new search
- st.session_state.rag_indexed = False
- st.session_state.rag_engine = None
-
- # Run the deep search
- async def run_bing_deep_search():
- async with DownloadManager(
- use_proxy=use_proxy,
- proxy=proxy,
- use_stealth=use_stealth
- ) as dm:
- files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
- return files
-
- with st.spinner("Searching for files..."):
- files = asyncio.run(run_bing_deep_search())
-
- if files:
- st.session_state.discovered_files = files
- st.session_state.current_url = url
- st.success(f"Found {len(files)} files!")
-
- # Show files with direct download options
- download_dir = "./downloads"
- os.makedirs(download_dir, exist_ok=True)
-
- # Individual file display with direct download buttons
- for i, file in enumerate(files):
- col1, col2, col3 = st.columns([3, 1, 1])
- with col1:
- filename = file['filename']
- size = file['size']
- meta = file.get('metadata', {})
- file_info = f"{filename} ({size})"
- if meta and 'Pages' in meta:
- file_info += f" - {meta.get('Pages', '')} pages"
- st.markdown(f"**{i+1}. {file_info}**")
-
- with col2:
- # Add direct download button for each file
- if st.button(f"Download", key=f"direct_dl_bing_{i}"):
- with st.spinner(f"Downloading {filename}..."):
- async def download_single_file():
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
- path = await dm.download_file(file, download_dir, url)
- return path
-
- downloaded_path = asyncio.run(download_single_file())
- if downloaded_path:
- with open(downloaded_path, "rb") as f:
- file_data = f.read()
-
- st.download_button(
- label=f"Save {filename}",
- data=file_data,
- file_name=filename,
- mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
- key=f"save_bing_file_{i}"
- )
-
- with col3:
- # Add to selection for batch download
- if i in st.session_state.selected_files:
- if st.button("Unselect", key=f"bing_unselect_{i}"):
- st.session_state.selected_files.remove(i)
- else:
- if st.button("Select", key=f"bing_select_{i}"):
- st.session_state.selected_files.append(i)
+ config_tabs = st.tabs(["Browser Settings", "Proxy Configuration", "Download Options", "System"])
+
+ # Browser Settings tab
+ with config_tabs[0]:
+ col1, col2 = st.columns(2)
+ with col1:
+ use_stealth = st.checkbox("Use Stealth Mode", value=st.session_state.stealth_mode,
+ help="Makes browser harder to detect as automated, but may be slower")
- # Add RAG Search interface for Bing results
- st.markdown("### Search Within Discovered Files")
- search_query = st.text_input("Enter search terms", key="bing_rag_search_query")
+ handle_captchas = st.checkbox("Handle Captchas Automatically", value=False,
+ help="Attempt to solve simple captchas automatically")
- if st.button("Search Files", key="bing_rag_search_btn") and search_query:
- # Initialize RAG search engine
- if not st.session_state.rag_indexed:
- rag_search = EnhancedRAGSearch()
-
- with st.spinner("Indexing files for search..."):
- # First download files to extract text
- temp_dir = "./temp_downloads"
- os.makedirs(temp_dir, exist_ok=True)
-
- async def download_for_indexing():
- downloaded = 0
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
- for i, file_info in enumerate(files):
- # Only process common text-based file formats
- ext = os.path.splitext(file_info['filename'])[1].lower()
- if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']:
- path = await dm.download_file(file_info, temp_dir, url)
- if path:
- with open(path, 'rb') as f:
- file_data = f.read()
-
- # Add to search index
- if rag_search.add_file(file_data, file_info):
- downloaded += 1
-
- # Clean up
- os.remove(path)
- return downloaded
-
- indexed_count = asyncio.run(download_for_indexing())
- if indexed_count > 0:
- rag_search.build_index()
- st.session_state.rag_engine = rag_search
- st.session_state.rag_indexed = True
- st.success(f"Indexed {indexed_count} files for search")
- else:
- st.warning("Could not index any files. Try with more text-based documents.")
-
- # Perform the search
- if st.session_state.rag_indexed:
- search_results = st.session_state.rag_engine.search(search_query)
-
- if search_results:
- st.write(f"Found {len(search_results)} relevant files:")
-
- for result in search_results:
- file_info = result['file_info']
- score = result['score']
- match_type = result.get('match_type', 'document')
-
- with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
- st.write(f"Size: {file_info['size']}")
- st.write(f"Match type: {match_type}")
-
- # Show language if available
- if 'language' in result:
- st.write(f"Language: {result['language']}")
-
- # Show metadata if available
- if 'metadata' in file_info and file_info['metadata']:
- st.write("Metadata:")
- for k, v in file_info['metadata'].items():
- if k != 'file_id': # Skip technical details
- st.write(f"- {k}: {v}")
-
- # Show content preview for chunk matches
- if 'chunk_preview' in result:
- st.write("Content preview:")
- st.text(result['chunk_preview'])
-
- # Add direct download button
- if st.button(f"Download this file", key=f"bing_rag_dl_{result['rank']}"):
- with st.spinner(f"Downloading {file_info['filename']}..."):
- async def download_search_result():
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
- path = await dm.download_file(file_info, download_dir, url)
- return path
-
- path = asyncio.run(download_search_result())
- if path:
- with open(path, "rb") as f:
- file_data = f.read()
-
- st.download_button(
- label=f"Save {file_info['filename']}",
- data=file_data,
- file_name=file_info['filename'],
- mime=mimetypes.guess_type(path)[0] or "application/octet-stream",
- key=f"save_bing_rag_{result['rank']}"
- )
- else:
- st.warning("No matching files found for your query.")
- else:
- st.warning("No files found.")
+ download_timeout = st.slider("Download Timeout (seconds)",
+ min_value=30, max_value=600, value=300,
+ help="Maximum time to wait for downloads to complete")
+ with col2:
+ user_agent = st.selectbox("User Agent", USER_AGENTS, index=0,
+ help="Browser identity to use when accessing websites")
+
+ save_screenshots = st.checkbox("Save Browser Screenshots", value=False,
+ help="Save screenshots when errors occur for debugging")
+
+ browser_lang = st.selectbox("Browser Language",
+ ["English (US)", "English (UK)", "Spanish", "French", "German", "Chinese"],
+ index=0)
- # Reset the deep search URL after processing
- st.session_state.deep_search_url = None
-
- # Add a special section for direct Google Drive file download
- st.markdown("---")
- with st.expander("Download View-Only Google Drive Document", expanded=False):
- st.write("Download protected/view-only Google Drive documents - just enter the file ID")
- file_id = st.text_input("Google Drive File ID",
- placeholder="Example: 139CTPrz7jOuJRW6pL6eupH-7B4fnNRku",
- help="Enter the ID from the Google Drive URL (e.g., from 'drive.google.com/file/d/THIS_IS_THE_ID/view')")
+ if st.button("Update Browser Settings"):
+ st.session_state.stealth_mode = use_stealth
+ st.success("Browser settings updated!")
+
+ # Dependency installation section
+ st.markdown("", unsafe_allow_html=True)
+ if st.button("Install Playwright Dependencies"):
+ with st.spinner("Installing dependencies..."):
+ install_playwright_dependencies()
- if st.button("Download Document") and file_id:
- download_dir = "./downloads"
- os.makedirs(download_dir, exist_ok=True)
- output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
-
- with st.spinner("Downloading view-only document... (this may take a minute)"):
- async def download_viewonly():
- async with DownloadManager(use_stealth=use_stealth) as dm:
- file_info = {
- 'url': f"https://drive.google.com/file/d/{file_id}/view",
- 'filename': f"gdrive_{file_id}.pdf",
- 'metadata': {'file_id': file_id, 'file_type': 'pdf', 'view_only': True}
- }
- result_path = await dm.force_download_viewonly(file_info, output_path)
- return result_path
+ # Proxy Configuration tab
+ with config_tabs[1]:
+ proxy_enabled = st.checkbox("Enable Proxy", value=st.session_state.use_proxy,
+ help="Route requests through a proxy server for anonymity or bypassing restrictions")
+
+ if proxy_enabled:
+ proxy_col1, proxy_col2 = st.columns(2)
+ with proxy_col1:
+ proxy_type = st.selectbox("Proxy Type", ["HTTP", "SOCKS5", "HTTPS"])
+ proxy_host = st.text_input("Proxy Host", placeholder="e.g., 127.0.0.1")
+ with proxy_col2:
+ proxy_port = st.text_input("Proxy Port", placeholder="e.g., 8080")
+ proxy_auth = st.text_input("Proxy Authentication (optional)",
+ placeholder="username:password", type="password")
+
+ st.markdown("", unsafe_allow_html=True)
+ use_proxy_rotation = st.checkbox("Enable Proxy Rotation", value=False,
+ help="Automatically rotate between multiple proxies for better anonymity")
+
+ if use_proxy_rotation:
+ proxy_list = st.text_area("Proxy List (one per line)",
+ placeholder="http://proxy1.example.com:8080\nhttp://proxy2.example.com:8080")
+ rotation_interval = st.slider("Rotation Interval (requests)",
+ min_value=1, max_value=50, value=10,
+ help="How often to switch proxies")
+
+ if st.button("Save Proxy Configuration"):
+ # Construct the proxy string
+ proxy_string = None
+ if proxy_enabled and proxy_host and proxy_port:
+ proxy_prefix = f"{proxy_type.lower()}://"
+ proxy_auth_str = f"{proxy_auth}@" if proxy_auth else ""
+ proxy_string = f"{proxy_prefix}{proxy_auth_str}{proxy_host}:{proxy_port}"
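+                        # e.g. "socks5://username:password@127.0.0.1:1080" with auth, or "http://127.0.0.1:8080" without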
+
+ # Update session state
+ st.session_state.use_proxy = proxy_enabled
+ st.session_state.proxy_string = proxy_string
+
+                    # Configure proxy rotation if enabled
+ if use_proxy_rotation and proxy_list:
+ PROXY_ROTATION_CONFIG["enabled"] = True
+ PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
+ PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]
+
+ st.success("Proxy configuration updated!")
+
+ # Download Options tab
+ with config_tabs[2]:
+ col1, col2 = st.columns(2)
+ with col1:
+ st.markdown("", unsafe_allow_html=True)
+
+ skip_existing = st.checkbox("Skip Existing Files", value=True,
+ help="Don't download files that already exist locally")
+
+ auto_rename = st.checkbox("Auto-Rename Duplicates", value=True,
+ help="Automatically rename files instead of overwriting")
+
+ verify_downloads = st.checkbox("Verify Downloads", value=True,
+ help="Check file integrity after download")
+
+ max_retries = st.slider("Max Retries", min_value=0, max_value=10, value=3,
+ help="Number of times to retry failed downloads")
+
+ with col2:
+ st.markdown("", unsafe_allow_html=True)
+
+ auto_organize = st.checkbox("Auto-Organize Files", value=True,
+ help="Automatically organize files by type")
+
+ default_dir = st.text_input("Default Download Directory", value="downloads",
+ help="Default location to save downloaded files")
+
+ org_by_domain = st.checkbox("Organize by Domain", value=False,
+ help="Create subdirectories based on source domains")
+
+ org_by_type = st.checkbox("Organize by File Type", value=False,
+ help="Create subdirectories based on file types")
+
+ if st.button("Save Download Settings"):
+ st.session_state.download_settings = {
+ "skip_existing": skip_existing,
+ "auto_rename": auto_rename,
+ "verify_downloads": verify_downloads,
+ "max_retries": max_retries,
+ "auto_organize": auto_organize,
+ "default_dir": default_dir,
+ "org_by_domain": org_by_domain,
+ "org_by_type": org_by_type
+ }
+ st.success("Download settings saved!")
+
+ # System tab
+ with config_tabs[3]:
+ col1, col2 = st.columns(2)
+ with col1:
+ st.markdown("", unsafe_allow_html=True)
+
+ max_concurrent = st.slider("Max Concurrent Downloads", min_value=1, max_value=10, value=3,
+ help="Maximum number of simultaneous downloads")
+
+ memory_limit = st.slider("Memory Limit (MB)", min_value=256, max_value=4096, value=1024,
+ help="Maximum memory to use for file processing")
+
+ processing_threads = st.slider("Processing Threads", min_value=1, max_value=8, value=2,
+ help="Number of threads to use for file processing")
+
+ with col2:
+ st.markdown("", unsafe_allow_html=True)
+
+ log_level = st.selectbox("Log Level", ["DEBUG", "INFO", "WARNING", "ERROR"], index=1,
+ help="Detail level for application logs")
- result = asyncio.run(download_viewonly())
+ save_debug_info = st.checkbox("Save Debug Information", value=False,
+ help="Save detailed information about program execution")
- if result:
- st.success("Document downloaded successfully!")
+ log_dir = st.text_input("Log Directory", value="logs",
+ help="Directory to save log files")
+
+ if st.button("Apply System Settings"):
+ st.session_state.system_settings = {
+ "max_concurrent": max_concurrent,
+ "memory_limit": memory_limit,
+ "processing_threads": processing_threads,
+ "log_level": log_level,
+ "save_debug_info": save_debug_info,
+ "log_dir": log_dir
+ }
+ # Update logging configuration
+ log_level_num = getattr(logging, log_level)
+ logging.getLogger().setLevel(log_level_num)
+ st.success("System settings applied!")
+
+ # Reset application button
+ st.markdown("", unsafe_allow_html=True)
+ reset_col1, reset_col2 = st.columns([1, 3])
+ with reset_col1:
+ if st.button("Reset Application", use_container_width=True):
+ for key in list(st.session_state.keys()):
+ if key != 'google_credentials': # Preserve Google auth
+ del st.session_state[key]
+ st.success("Application has been reset!")
+ st.rerun()
+ with reset_col2:
+ st.info("This will clear all search results, downloaded files, and reset settings to defaults.")
+
+ # Tab 4: Help
+ with tabs[3]:
+ st.markdown("", unsafe_allow_html=True)
+
+ help_tabs = st.tabs(["Quick Start", "Advanced Features", "Troubleshooting", "About"])
+
+ with help_tabs[0]:
+ st.markdown("""
+ ### Getting Started
+
+ 1. **Enter a URL** on the Search & Download tab
+ 2. Select a **Search Method**:
+ - **Deep Search**: Thorough but slower
+ - **Quick Search**: Fast but may miss some files
+ - **Exam Site Mode**: Optimized for educational resource sites
+ 3. Click **Start Search** to find downloadable files
+ 4. Select files you want to download
+ 5. Click **Download Selected Files**
+
+ #### Using Different Modes
+
+ Select a mode from the sidebar to optimize the tool for different use cases:
+
+ - **Standard Mode**: Balanced for general use
+ - **Education Mode**: Optimized for finding academic materials
+ - **Research Mode**: Better for research papers and datasets
+ - **Media Mode**: Enhanced for finding images, videos, and audio
+
+ For best results with educational materials, use the **Exam Site Mode** with websites that contain past exams, papers, or course materials.
+ """)
+
+ with help_tabs[1]:
+ st.markdown("""
+ ### Advanced Features
+
+ - **Local File Search**: Upload files and search through their content using the enhanced RAG search
+ - **Custom Extensions**: Specify additional file types to look for beyond the default set
+ - **Stealth Mode**: Makes the browser harder to detect as automated, useful for sites that block scrapers
+ - **Proxy Support**: Use proxies to access region-restricted content or improve anonymity
+ - **Google Drive Integration**: Upload downloaded files directly to your Google Drive
+
+ #### Search Tips
+
+ - For educational sites, include specific terms like "exam", "test", "paper" in the URL
+ - When using Local File Search, try different variations of your query for better results
+ - Use filtering and sorting options to find the most relevant files quickly
+
+ #### File Organization
+
+ You can configure automatic file organization in the Advanced Configuration tab:
+
+ - **Organize by Domain**: Creates folders based on the source website
+ - **Organize by File Type**: Separates files into folders by their extension
+ - **Auto-Rename**: Prevents overwriting existing files with same names
+ """)
+
+ with help_tabs[2]:
+ st.markdown("""
+ ### Troubleshooting
+
+ #### Common Issues
+
+ - **No files found**: Try using Deep Search with higher depth value, or add more specific file extensions
+ - **Downloads failing**: Check if the site requires authentication or uses captchas
+ - **Slow performance**: Reduce search depth or disable stealth mode for faster results
+ - **Browser errors**: Click "Install Playwright Dependencies" in Advanced Settings
+
+ #### Captcha Issues
+
+ Some websites use captchas to prevent automated access. If you encounter captchas:
+
+ 1. Try using a different proxy
+ 2. Enable "Handle Captchas Automatically" for simple captchas
+ 3. For complex captchas, you may need to manually access the site first
+
+ #### Proxy Problems
+
+ If you're having issues with proxies:
+
+ 1. Verify your proxy is working with an external tool
+ 2. Check that you've entered the correct format (http://host:port)
+ 3. Some websites may block known proxy IPs
+
+ #### Memory Usage
+
+ If the application is using too much memory:
+
+ 1. Reduce the "Memory Limit" in System settings
+ 2. Process fewer files at once
+ 3. Use lower search depth values
+ """)
+
+ with help_tabs[3]:
+ st.markdown("""
+ ### About This Tool
+
+ **Advanced File Downloader** is a sophisticated tool designed to discover and download files from websites with enhanced capabilities for educational resources.
+
+ #### Key Features
+
+ - **Smart Discovery**: Finds downloadable files even when they're not directly linked
+ - **Enhanced RAG Search**: Search through downloaded documents using advanced AI techniques
+ - **Educational Focus**: Specialized detection for exam papers and academic resources
+ - **Stealth Capabilities**: Avoids detection by anti-scraping measures
+
+ #### Technical Details
+
+ This tool uses:
+
+ - **Playwright**: For browser automation and stealth capabilities
+ - **Sentence Transformers**: For AI-powered semantic search
+ - **Streamlit**: For the user interface
+ - **Google Drive API**: For cloud integration
+
+ #### Credits
+
+ Created with Python, Streamlit, Playwright, and various AI libraries.
+
+ For issues or suggestions, please contact the developer.
+
+ Version 2.0 - March 2025
+ """)
+
+ # Handle search and download actions
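+    # These handlers run after all of the page's widgets have been created, so the button values below reflect this run's clicks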
+ if search_button and url:
+ # Reset files and downloaded paths
+ st.session_state.files = []
+ st.session_state.downloaded_paths = []
+ st.session_state.download_complete = False
+
+ # Clear the preset URL if it was used
+ if 'preset_url' in st.session_state:
+ st.session_state.preset_url = ''
+
+ # Prepare custom extensions
+ custom_ext_list = [ext.strip() for ext in custom_extensions.split(",") if ext.strip()]
+
+ # Configure proxy from session state
+ proxy_string = st.session_state.proxy_string if st.session_state.use_proxy else None
+
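+        # use_proxy_rotation and proxy_list are only defined when the proxy settings UI was rendered, hence the locals() check below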
+ # Set up proxy rotation if enabled
+ if 'use_proxy_rotation' in locals() and use_proxy_rotation and proxy_list:
+ PROXY_ROTATION_CONFIG["enabled"] = True
+ PROXY_ROTATION_CONFIG["rotation_interval"] = rotation_interval
+ PROXY_ROTATION_CONFIG["proxies"] = [p.strip() for p in proxy_list.splitlines() if p.strip()]
+
+ # Configure search parameters based on method
+ sublink_limit = 5000 if search_method == "Deep Search" else 1000
+ search_depth = depth if search_method == "Deep Search" else 1
+ is_exam_site = search_method == "Exam Site Mode"
+
+ # Execute the search asynchronously
+ async def run_search():
+ async with DownloadManager(
+ use_proxy=st.session_state.use_proxy,
+ proxy=proxy_string,
+ use_stealth=st.session_state.stealth_mode,
+ proxy_rotation=PROXY_ROTATION_CONFIG["enabled"]
+ ) as manager:
+ # For exam sites, use specialized approach
+ if is_exam_site:
+ st.session_state.keep_progress = True
+ edu_links = await manager.get_edu_exam_links(url)
+ all_files = []
- # Provide download button
- with open(result, "rb") as f:
- file_bytes = f.read()
+ progress_text = st.empty()
+ progress_bar = st.progress(0)
+
+ # Process each exam link
+ for i, link in enumerate(edu_links):
+ progress = (i+1) / max(1, len(edu_links))
+ progress_text.text(f"Processing exam link {i+1}/{len(edu_links)}: {link}")
+ progress_bar.progress(progress)
+
+ files = await manager.extract_downloadable_files(link, custom_ext_list)
+ all_files.extend(files)
+
+ st.session_state.files = all_files
+ progress_text.empty()
+ progress_bar.empty()
+ st.session_state.keep_progress = False
- st.download_button(
- label="Download PDF",
- data=file_bytes,
- file_name=f"gdrive_{file_id}.pdf",
- mime="application/pdf"
- )
else:
- st.error("Failed to download the document. Please check the file ID and try again.")
-
- # Add footer with attribution
- st.markdown('---')
- st.markdown('Created by [Euler314](https://github.com/euler314)')
-
-# Helper function for Bing search deep search URL setting
-def set_deep_search_url(url):
- st.session_state.deep_search_url = url
+ # Use general search method
+ files = await manager.deep_search(url, custom_ext_list, sublink_limit, timeout)
+ st.session_state.files = files
+
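+        # asyncio.run() spins up a fresh event loop for this search and closes it when the coroutine completes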
+ # Run the search
+ asyncio.run(run_search())
+ st.rerun()
+
+ # Handle download button
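+    # download_button and selected_files are only defined when the results table was rendered, hence the locals() guard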
+ if 'download_button' in locals() and download_button and selected_files:
+ # Create download directory
+ os.makedirs(download_dir, exist_ok=True)
+
+ # Reset download state
+ st.session_state.downloaded_paths = []
+ st.session_state.download_complete = False
+
+ # Get selected files
+ files_to_download = [st.session_state.files[i] for i in selected_files]
+
+ # Execute the download asynchronously
+ async def run_download():
+ async with DownloadManager(
+ use_proxy=st.session_state.use_proxy,
+ proxy=st.session_state.proxy_string,
+ use_stealth=st.session_state.stealth_mode
+ ) as manager:
+ download_progress = st.progress(0)
+ status_text = st.empty()
+
+ for i, file_info in enumerate(files_to_download):
+ progress = (i) / len(files_to_download)
+ status_text.text(f"Downloading {i+1}/{len(files_to_download)}: {file_info['filename']}")
+ download_progress.progress(progress)
+
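+                    # the source domain is passed so download_file can organize the file by site (assumed from the call signature)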
+ downloaded_path = await manager.download_file(
+ file_info,
+ download_dir,
+ get_domain(file_info['url'])
+ )
+
+ if downloaded_path:
+ st.session_state.downloaded_paths.append(downloaded_path)
+
+ download_progress.progress(1.0)
+ status_text.text(f"Downloaded {len(st.session_state.downloaded_paths)}/{len(files_to_download)} files successfully!")
+ st.session_state.download_complete = True
+
+ # Run the download
+ asyncio.run(run_download())
+ st.rerun()
+
+ # Handle Google Drive upload
+    if 'google_drive_button' in locals() and google_drive_button and st.session_state.google_credentials and st.session_state.downloaded_paths:
+        with st.spinner("Uploading to Google Drive..."):
+            drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_credentials)
+
+            # Create folder if it doesn't exist
+            folder_id = None
+            folder_name = st.session_state.drive_folder if 'drive_folder' in st.session_state else "File Downloader"
+
+            # Check if folder exists
+            query = f"name='{folder_name}' and mimeType='application/vnd.google-apps.folder' and trashed=false"
+            results = drive_service.files().list(q=query, spaces='drive', fields='files(id)').execute()
+            items = results.get('files', [])
+
+            if not items:
+                # Create folder
+                folder_id = create_drive_folder(drive_service, folder_name)
+            else:
+                folder_id = items[0]['id']
+
+            # Upload each file
+            upload_progress = st.progress(0)
+            status_text = st.empty()
+            uploaded_count = 0
+
+            for i, path in enumerate(st.session_state.downloaded_paths):
+                progress = i / len(st.session_state.downloaded_paths)
+                status_text.text(f"Uploading {i+1}/{len(st.session_state.downloaded_paths)}: {os.path.basename(path)}")
+                upload_progress.progress(progress)
+
+                # Treat any result that is not an "Error..." string as a successful upload
+                result = google_drive_upload(path, st.session_state.google_credentials, folder_id)
+                if isinstance(result, str) and not result.startswith("Error"):
+                    uploaded_count += 1
+
+            # Finalize the progress display once all uploads have been attempted
+            upload_progress.progress(1.0)
+            status_text.text(f"Uploaded {uploaded_count}/{len(st.session_state.downloaded_paths)} files to Google Drive folder '{folder_name}'")
+
+            st.success("Files uploaded to Google Drive successfully!")
+
+ # Handle clear button
+ if clear_button:
+ st.session_state.files = []
+ st.session_state.downloaded_paths = []
+ st.session_state.download_complete = False
+ if 'preset_url' in st.session_state:
+ st.session_state.preset_url = ''
+ st.rerun()
if __name__ == "__main__":
main()
\ No newline at end of file