Update app.py
app.py
CHANGED
@@ -7,7 +7,7 @@ import subprocess
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 import asyncio
 import logging
-from urllib.parse import urlparse, urljoin, unquote
 import re
 from pathlib import Path
 from io import BytesIO
@@ -32,13 +32,27 @@ import googleapiclient.discovery
 import google.auth.transport.requests
 import googleapiclient.http

-#
 import nltk
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import docx2txt
-

 # -------------------- Logging Setup --------------------
 logging.basicConfig(
@@ -96,37 +110,126 @@ PROXY_ROTATION_CONFIG = {
     "proxies": []  # Will be populated from the UI if needed
 }

-# -------------------- RAG Search Class --------------------
-class RAGSearch:
     def __init__(self):
         self.file_texts = []
         self.file_metadata = []
-        self.vectorizer = TfidfVectorizer(
         self.vectors = None

     def add_file(self, file_data, file_info):
-        """Add a file to the search index"""
-        file_ext = os.path.splitext(file_info['filename'])[1]
         text = self.extract_text(file_data, file_ext)
         if text:
             self.file_texts.append(text)
             self.file_metadata.append(file_info)
             return True
         return False

     def extract_text(self, file_data, file_ext):
-        """Extract text from different file types"""
         try:
             if file_ext.lower() == '.pdf':
                 reader = PyPDF2.PdfReader(BytesIO(file_data))
                 text = ""
                 for page in reader.pages:
-
                 return text
             elif file_ext.lower() in ['.docx', '.doc']:
                 return docx2txt.process(BytesIO(file_data))
-            elif file_ext.lower() in ['.txt', '.csv', '.json']:
                 return file_data.decode('utf-8', errors='ignore')
             else:
                 return ""
         except Exception as e:
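The add_file path above feeds raw file bytes through extract_text before indexing. The following standalone sketch shows that extraction step in isolation, using the same PyPDF2 and docx2txt calls the file relies on; the helper name and the sample-file usage at the end are illustrative only.

# Sketch of the text-extraction step used before indexing (assumed helper name).
from io import BytesIO
import PyPDF2
import docx2txt

def extract_text_sketch(file_data: bytes, file_ext: str) -> str:
    ext = file_ext.lower()
    if ext == '.pdf':
        reader = PyPDF2.PdfReader(BytesIO(file_data))
        # Pages with no extractable text return None, so guard with "or ''"
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    if ext in ('.docx', '.doc'):
        return docx2txt.process(BytesIO(file_data))
    if ext in ('.txt', '.csv', '.json'):
        return file_data.decode('utf-8', errors='ignore')
    return ""

# Example (placeholder path):
# with open("sample.pdf", "rb") as f:
#     print(extract_text_sketch(f.read(), ".pdf")[:200])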
@@ -134,35 +237,107 @@ class RAGSearch:
             return ""

     def build_index(self):
-        """Build
         if not self.file_texts:
             return False
         try:
             self.vectors = self.vectorizer.fit_transform(self.file_texts)
             return True
         except Exception as e:
             logger.error(f"Error building search index: {e}")
             return False

-    def
-        """
         if self.vectors is None:
             return []

         try:
-
-
-

             results = []
-
-
-
-
-
-
-
-
         except Exception as e:
             logger.error(f"Error during search: {e}")
             return []
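Both the old RAGSearch and the EnhancedRAGSearch shown later in this diff rank files by TF-IDF cosine similarity. A minimal self-contained sketch of that ranking idea, with a toy corpus and a relevance threshold that are illustrative rather than taken from app.py:

# TF-IDF ranking sketch: fit on documents, transform the query, rank by cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = [
    "Past exam papers for physics, 2021 and 2022 sessions.",
    "Administrative notes about the course schedule.",
    "Mathematics exam paper with worked solutions.",
]
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
vectors = vectorizer.fit_transform(docs)

query_vec = vectorizer.transform(["past exam papers"])
scores = cosine_similarity(query_vec, vectors).flatten()
for idx, score in sorted(enumerate(scores), key=lambda x: x[1], reverse=True):
    if score > 0.1:  # the same kind of low-score cut-off the search class applies
        print(f"{score:.2f}  {docs[idx]}")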
@@ -222,6 +397,90 @@ def detect_captcha(html_content):
     html_lower = html_content.lower()
     return any(pattern in html_lower for pattern in captcha_patterns)

 # -------------------- Google Drive Functions --------------------
 def get_google_auth_url():
     client_config = GOOGLE_OAUTH_CONFIG["web"]
@@ -314,6 +573,10 @@ class DownloadManager:
         self.request_count = 0
         self.captcha_detected = False
         self.download_timeout = 300  # 5 minutes timeout for downloads

     async def __aenter__(self):
         self.playwright = await async_playwright().start()
@@ -594,13 +857,51 @@ class DownloadManager:
         try:
             await self.rotate_proxy_if_needed()

-
-
-
-
-
-
-
         except Exception as e:
             logger.warning(f"Error getting file size: {e}")
             return "Unknown Size"
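The reworked get_file_size (shown in the new-side listing further down) probes script-style URLs with an HTTP Range request and reads the total size out of Content-Range. A minimal sketch of that trick, assuming the requests library; the URL in the comment is a placeholder.

# Ask for a single byte and recover the full size from the response headers.
import re
import requests

def probe_size(url):
    headers = {'Range': 'bytes=0-0'}
    with requests.get(url, headers=headers, stream=True, timeout=10) as r:
        match = re.search(r'bytes 0-0/(\d+)', r.headers.get('Content-Range', ''))
        if match:
            return int(match.group(1))
        length = r.headers.get('Content-Length')
        if length and int(length) > 1:  # a length of 1 is just our probe byte
            return int(length)
    return None

# probe_size("https://example.com/files/report.pdf")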
@@ -627,14 +928,53 @@ class DownloadManager:
             return {}

     async def extract_real_download_url(self, url):
         try:
-
-
-
-
-
-
-
         except Exception as e:
             logger.error(f"Error extracting real download URL: {e}")
             return url
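The new extract_real_download_url (further down in this diff) watches browser responses for attachment headers before trusting a URL. The diff does this with Playwright response listeners; the sketch below is a simplified requests-based illustration of the same header check, with a placeholder URL.

# Decide whether a URL resolves to a file rather than an HTML page.
import requests

def looks_like_file_response(url):
    with requests.get(url, stream=True, timeout=15, allow_redirects=True) as r:
        disposition = r.headers.get('Content-Disposition', '')
        content_type = r.headers.get('Content-Type', '')
        if 'attachment' in disposition or 'filename=' in disposition:
            return True
        # Anything that is not text/html (or other text/*) is treated as a download.
        return bool(content_type) and not content_type.startswith('text/')

# looks_like_file_response("https://example.com/download.php?id=42")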
@@ -702,13 +1042,17 @@ class DownloadManager:
                     if any(full_url.lower().endswith(ext) for ext in
                            ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                         links.add(full_url)
         except Exception as e:
             logger.warning(f"Request-based extraction failed: {e}")

         # Browser-based approach for more thorough extraction or if initial approach was inadequate
         try:
             # Check if we need to proceed with browser-based extraction
-            if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
                 logger.info("Using browser for enhanced link extraction")

                 # Rotate proxy if needed
@@ -800,6 +1144,27 @@ class DownloadManager:
                             ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                             links.add(href)

                 # Check for ASP.NET specific elements that might contain exam links
                 grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
                 for grid in grid_elements:
@@ -928,6 +1293,11 @@ class DownloadManager:
                 "/resource/", "/material/", "/notes/", "/subjectmaterial/"
             ]):
                 filtered_links.append(link)

         logger.info(f"Found {len(filtered_links)} potential exam document links")
         return filtered_links
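On the new side, this filter also keeps links that pass the module-level helpers is_download_link and normalize_download_url added by this commit (their definitions appear in the new-side listing further down). A short usage sketch, assuming the helpers are importable from app.py or copied into a scratch script; the example URLs are illustrative only.

# Usage sketch for the new helpers (import path assumed).
from app import is_download_link, normalize_download_url

candidate_urls = [
    "https://example.edu/files/past_paper_2021.pdf",
    "https://example.edu/scripts/download.php?file=notes.docx",
    "https://example.edu/page.aspx?Action=downloadfile&fname=exam%202020.pdf",
    "https://example.edu/about.html",
]
for raw_url in candidate_urls:
    if is_download_link(raw_url):
        print("download candidate:", normalize_download_url(raw_url))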
@@ -955,31 +1325,119 @@ class DownloadManager:
                 }
             }

             // Check for links in data attributes
-            const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link]');
             for (const el of elements) {
-                for (const attr of ['data-url', 'data-href', 'data-src', 'data-link']) {
                     const val = el.getAttribute(attr);
-                    if (val
-
                     }
                 }
             }

             // Look for URLs in inline event handlers
-            const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup]');
             for (const el of clickableElements) {
-                for (const attr of ['onclick', 'onmousedown', 'onmouseup']) {
                     const val = el.getAttribute(attr);
                     if (val) {
                         const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
                         for (let match of urlMatches) {
                             links.add(match.replace(/["']/g, ''));
                         }
                     }
                 }
             }

             return Array.from(links);
         }
     """)
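The page-side JavaScript above harvests URLs hidden in data-* attributes and inline event handlers. A rough server-side approximation of that idea with BeautifulSoup and a regex, for static HTML only (the real code runs inside the browser via page.evaluate and also sees dynamically generated markup):

# Scan data-* and on* attributes of every tag for embedded absolute URLs.
import re
from bs4 import BeautifulSoup

URL_RE = re.compile(r'https?://[^\s"\'<>]+')

def mine_hidden_urls(html):
    soup = BeautifulSoup(html, 'html.parser')
    found = set()
    for el in soup.find_all(True):
        for attr in ('data-url', 'data-href', 'data-src', 'data-link',
                     'onclick', 'onmousedown', 'onmouseup'):
            value = el.get(attr)
            if value:
                found.update(URL_RE.findall(value))
    return found

# mine_hidden_urls('<a data-url="https://example.com/f.pdf" onclick="go()">x</a>')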
@@ -1046,14 +1504,116 @@ class DownloadManager:
                     for link in shadow_links:
                         hidden_links.add(link)

         return hidden_links

     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
             # Rotate proxy if needed
             await self.rotate_proxy_if_needed()

             # Special handling for educational exam sites
             if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
                                                       ["exam", "test", "pastpaper", "eduexp"]):
@@ -1095,7 +1655,8 @@ class DownloadManager:
|
|
1095 |
'url': real_url,
|
1096 |
'filename': filename,
|
1097 |
'size': size_str,
|
1098 |
-
'metadata': meta
|
|
|
1099 |
})
|
1100 |
|
1101 |
# If we found exam files with the specialized method, return them
|
@@ -1156,7 +1717,8 @@ class DownloadManager:
|
|
1156 |
'url': real_url,
|
1157 |
'filename': filename,
|
1158 |
'size': await self.get_file_size(real_url),
|
1159 |
-
'metadata': {}
|
|
|
1160 |
})
|
1161 |
return found_files
|
1162 |
|
@@ -1177,7 +1739,7 @@ class DownloadManager:
|
|
1177 |
for a in soup.find_all('a', href=True):
|
1178 |
href = a['href'].strip()
|
1179 |
|
1180 |
-
if '.php' in href.lower() or 'download' in href.lower():
|
1181 |
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
|
1182 |
real_url = await self.extract_real_download_url(full_url)
|
1183 |
if real_url and real_url != full_url:
|
@@ -1185,7 +1747,8 @@ class DownloadManager:
|
|
1185 |
'url': real_url,
|
1186 |
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
|
1187 |
'size': await self.get_file_size(real_url),
|
1188 |
-
'metadata': {}
|
|
|
1189 |
})
|
1190 |
continue
|
1191 |
|
@@ -1199,7 +1762,8 @@ class DownloadManager:
|
|
1199 |
'url': file_url,
|
1200 |
'filename': os.path.basename(file_url.split('?')[0]),
|
1201 |
'size': size_str,
|
1202 |
-
'metadata': meta
|
|
|
1203 |
})
|
1204 |
|
1205 |
# Handle Google Drive links
|
@@ -1229,7 +1793,8 @@ class DownloadManager:
|
|
1229 |
'view_only': is_view_only,
|
1230 |
'file_type': file_type,
|
1231 |
'file_id': file_id
|
1232 |
-
}
|
|
|
1233 |
})
|
1234 |
|
1235 |
# Also check for files in other elements (iframe, embed, object, etc.)
|
@@ -1246,7 +1811,8 @@ class DownloadManager:
|
|
1246 |
'url': file_url,
|
1247 |
'filename': os.path.basename(file_url.split('?')[0]),
|
1248 |
'size': size_str,
|
1249 |
-
'metadata': meta
|
|
|
1250 |
})
|
1251 |
|
1252 |
# Check for file links in onclick attributes
|
@@ -1264,7 +1830,8 @@ class DownloadManager:
|
|
1264 |
'url': url_match,
|
1265 |
'filename': os.path.basename(url_match.split('?')[0]),
|
1266 |
'size': size_str,
|
1267 |
-
'metadata': meta
|
|
|
1268 |
})
|
1269 |
|
1270 |
# Also check for data-src and data-url attributes (common in lazy-loaded sites)
|
@@ -1279,7 +1846,8 @@ class DownloadManager:
|
|
1279 |
'url': file_url,
|
1280 |
'filename': os.path.basename(file_url.split('?')[0]),
|
1281 |
'size': await self.get_file_size(file_url),
|
1282 |
-
'metadata': {}
|
|
|
1283 |
})
|
1284 |
except:
|
1285 |
pass
|
@@ -1313,7 +1881,8 @@ class DownloadManager:
|
|
1313 |
'url': json_url,
|
1314 |
'filename': os.path.basename(json_url.split('?')[0]),
|
1315 |
'size': await self.get_file_size(json_url),
|
1316 |
-
'metadata': {}
|
|
|
1317 |
})
|
1318 |
except:
|
1319 |
pass
|
@@ -1364,7 +1933,8 @@ class DownloadManager:
|
|
1364 |
'url': href,
|
1365 |
'filename': os.path.basename(href.split('?')[0]),
|
1366 |
'size': await self.get_file_size(href),
|
1367 |
-
'metadata': {}
|
|
|
1368 |
})
|
1369 |
|
1370 |
# Check for hidden links that might be in JavaScript, iframes, or dynamic content
|
@@ -1375,7 +1945,8 @@ class DownloadManager:
|
|
1375 |
'url': link,
|
1376 |
'filename': os.path.basename(link.split('?')[0]),
|
1377 |
'size': await self.get_file_size(link),
|
1378 |
-
'metadata': {}
|
|
|
1379 |
})
|
1380 |
|
1381 |
# Deduplicate files by URL
|
@@ -1393,7 +1964,7 @@ class DownloadManager:
|
|
1393 |
return []
|
1394 |
|
1395 |
async def download_file(self, file_info, save_dir, referer):
|
1396 |
-
file_url = file_info['url']
|
1397 |
fname = file_info['filename']
|
1398 |
path = os.path.join(save_dir, fname)
|
1399 |
base, ext = os.path.splitext(fname)
|
@@ -1403,6 +1974,11 @@ class DownloadManager:
|
|
1403 |
counter += 1
|
1404 |
os.makedirs(save_dir, exist_ok=True)
|
1405 |
|
|
|
|
|
|
|
|
|
|
|
1406 |
try:
|
1407 |
# Special handling for Google Drive files
|
1408 |
if "drive.google.com" in file_url or "docs.google.com" in file_url:
|
@@ -1414,6 +1990,7 @@ class DownloadManager:
|
|
1414 |
logger.info(f"Attempting to download view-only file: {file_url}")
|
1415 |
result_path = await self.force_download_viewonly(file_info, path)
|
1416 |
if result_path:
|
|
|
1417 |
return result_path
|
1418 |
|
1419 |
# If that failed, try the regular download approach
|
@@ -1422,13 +1999,60 @@ class DownloadManager:
                 # Try regular download methods
                 success = await self.download_from_google_drive(file_url, path)
                 if success:
                     return path

                 # If all methods failed for Google Drive, try one last approach
                 logger.warning("All standard methods failed, attempting force download")
                 result_path = await self.force_download_viewonly(file_info, path)
                 return result_path if result_path else None

             # Rotate proxy if needed
             await self.rotate_proxy_if_needed()

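For the complex "Action=downloadfile"/"fname=" URLs, the new code (later in this diff) navigates with Playwright, waits for the "download" event and persists it with save_as. A minimal standalone sketch of that pattern, with placeholder URL and output path; real pages may need a button click instead of plain navigation.

# Playwright download-event sketch: navigate, catch the download, save it.
import asyncio
from playwright.async_api import async_playwright

async def fetch_with_browser(url, out_path):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page(accept_downloads=True)
        async with page.expect_download() as download_info:
            try:
                await page.goto(url, wait_until="networkidle")
            except Exception:
                # Navigation is often aborted once the download starts; that's fine.
                pass
        download = await download_info.value
        await download.save_as(out_path)
        await browser.close()

# asyncio.run(fetch_with_browser("https://example.com/download.php?id=1", "file.pdf"))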
@@ -1456,6 +2080,7 @@ class DownloadManager:
|
|
1456 |
|
1457 |
# Verify file was downloaded correctly
|
1458 |
if os.path.exists(path) and os.path.getsize(path) > 0:
|
|
|
1459 |
return path
|
1460 |
except Exception as e:
|
1461 |
logger.warning(f"Direct download failed: {e}, trying browser approach")
|
@@ -1475,7 +2100,9 @@ class DownloadManager:
|
|
1475 |
content = await response.body()
|
1476 |
with open(path, 'wb') as f:
|
1477 |
f.write(content)
|
1478 |
-
|
|
|
|
|
1479 |
else:
|
1480 |
logger.error(f"Download failed with status {response.status}: {file_url}")
|
1481 |
|
@@ -1502,6 +2129,7 @@ class DownloadManager:
|
|
1502 |
await download.save_as(path)
|
1503 |
|
1504 |
if os.path.exists(path) and os.path.getsize(path) > 0:
|
|
|
1505 |
return path
|
1506 |
except Exception as e:
|
1507 |
logger.error(f"Browser download manager approach failed: {e}")
|
@@ -2515,6 +3143,21 @@ class DownloadManager:
         try:
             logger.info(f"Fetching sublinks from: {url}")

             # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
             if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
                                                       ["exam", "test", "pastpaper", "eduexp"]):
@@ -2532,8 +3175,12 @@ class DownloadManager:
             await self.rotate_proxy_if_needed()

             # Standard sublink extraction for all sites
-
-
             # Get base URL for resolving relative links
             parsed_base = urlparse(url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
@@ -2732,8 +3379,46 @@ class DownloadManager:
                     if href and not href.startswith('javascript:'):
                         links.add(href)

             logger.info(f"Found {len(links)} sublinks")
-

         except Exception as e:
             logger.error(f"Error getting sublinks from {url}: {e}")
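The sublink pass above boils down to collecting every href, resolving relative paths against the page URL, and skipping javascript: pseudo-links. A condensed sketch of that step using requests and BeautifulSoup instead of the Playwright page, for brevity:

# Collect absolute sublinks from a page.
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

def get_sublinks_sketch(url):
    links = set()
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        if href and not href.startswith('javascript:'):
            links.add(urljoin(url, href))  # resolve relative links against the page URL
    return links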
@@ -2834,6 +3519,9 @@ class DownloadManager:
|
|
2834 |
file_count_text = st.empty()
|
2835 |
|
2836 |
try:
|
|
|
|
|
|
|
2837 |
progress_text.text("Analyzing main page...")
|
2838 |
# Special handling for ASP.NET pages
|
2839 |
is_aspnet = False
|
@@ -2848,6 +3536,25 @@ class DownloadManager:
         except Exception:
             pass

         # Extract files from main page
         main_files = await self.extract_downloadable_files(url, custom_ext_list)
         initial_count = len(main_files)
@@ -2873,9 +3580,50 @@ class DownloadManager:
                 progress_bar.progress(progress)

                 try:
-                    #
                     sub_timeout = timeout * 2 if is_aspnet else timeout

                     # Extract files from sublink
                     sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                     all_files.extend(sub_files)
@@ -2994,21 +3742,22 @@ def main():
             if custom_ext_list != valid_ext_list:
                 st.warning("Invalid extensions ignored. Use format like '.csv'.")

-
-
-
-
-
-
-
-
-
-
-

             with st.spinner("Searching for files..."):
-                files = run_deep_search(
-                    sublink_timeout, use_proxy, proxy, use_stealth)

             if files:
                 st.session_state.discovered_files = files
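The search button bridges Streamlit's synchronous callback into the async crawler through the run_deep_search wrapper. A minimal sketch of that bridging pattern, under assumptions: the deep_search coroutine and the simplified argument list below are hypothetical stand-ins for whatever DownloadManager method and signature app.py actually uses.

# Hypothetical sync wrapper around an async crawl, for use from a Streamlit callback.
import asyncio

async def _deep_search(url, custom_ext_list, max_sublinks, timeout):
    # deep_search is an assumed method name, not necessarily the one in app.py.
    async with DownloadManager() as dm:
        return await dm.deep_search(url, custom_ext_list, max_sublinks, timeout)

def run_deep_search_sketch(url, custom_ext_list, max_sublinks=100, timeout=30):
    # Streamlit handlers are synchronous, so enter the event loop here.
    return asyncio.run(_deep_search(url, custom_ext_list, max_sublinks, timeout))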
@@ -3031,7 +3780,7 @@ def main():
         if st.button("Search Files", key="rag_search_btn") and search_query:
             # Initialize RAG search engine
             if not st.session_state.rag_indexed:
-                rag_search =

                 with st.spinner("Indexing files for search..."):
                     # First download files to extract text
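The indexing flow behind this tab is: feed each downloaded file's bytes into the search engine, build the index, then query it. A sketch of that flow using the EnhancedRAGSearch class this commit adds (add_file, build_index and search are its real methods); the import path and the placeholder file contents are assumptions.

# End-to-end RAG indexing sketch (import path and sample data assumed).
from app import EnhancedRAGSearch

search_engine = EnhancedRAGSearch()
downloaded = [
    (b"Physics exam paper 2021: mechanics questions and solutions.",
     {'filename': 'physics_2021.txt', 'url': 'https://example.edu/physics_2021.txt', 'size': '1 KB'}),
    (b"Physics exam paper 2022: thermodynamics questions and solutions.",
     {'filename': 'physics_2022.txt', 'url': 'https://example.edu/physics_2022.txt', 'size': '1 KB'}),
]
for data, info in downloaded:
    search_engine.add_file(data, info)

if search_engine.build_index():
    for result in search_engine.search("physics exam", top_k=5):
        print(result['rank'], round(result['score'], 2), result['file_info']['filename'])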
@@ -3044,7 +3793,7 @@ def main():
|
|
3044 |
for i, file_info in enumerate(files):
|
3045 |
# Only process common text-based file formats
|
3046 |
ext = os.path.splitext(file_info['filename'])[1].lower()
|
3047 |
-
if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json']:
|
3048 |
path = await dm.download_file(file_info, temp_dir, url)
|
3049 |
if path:
|
3050 |
with open(path, 'rb') as f:
|
@@ -3077,14 +3826,28 @@ def main():
                 for result in search_results:
                     file_info = result['file_info']
                     score = result['score']
                     with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
                         st.write(f"Size: {file_info['size']}")
                         if 'metadata' in file_info and file_info['metadata']:
                             st.write("Metadata:")
                             for k, v in file_info['metadata'].items():
                                 if k != 'file_id':  # Skip technical details
                                     st.write(f"- {k}: {v}")

                         # Add direct download button
                         if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
                             with st.spinner(f"Downloading {file_info['filename']}..."):
@@ -3267,94 +4030,192 @@ def main():
             # Create expanders for each result
             for i, url in enumerate(urls, 1):
                 with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
-
-                    st.session_state.deep_search_url = url
-                    st.session_state.do_deep_search = True
                 else:
                     st.warning("No search results found.")

         asyncio.run(run_search())

-
-
-
-
-
-
-

-    #
-
-

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            for i, file in enumerate(files):
-                col1, col2, col3 = st.columns([3, 1, 1])
-                with col1:
-                    filename = file['filename']
-                    size = file['size']
-                    meta = file.get('metadata', {})
-                    file_info = f"{filename} ({size})"
-                    if meta and 'Pages' in meta:
-                        file_info += f" - {meta.get('Pages', '')} pages"
-                    st.markdown(f"**{i+1}. {file_info}**")
-
-                with col2:
-                    # Add direct download button for each file
-                    if st.button(f"Download", key=f"direct_dl_{i}"):
-                        with st.spinner(f"Downloading {filename}..."):
-                            async def download_single_file():
-                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
-                                    path = await dm.download_file(file, download_dir, url)
-                                    return path

-
-
-
-
-
-
-
-
-
-
-
-

-                with
-                    #
-
-
-
         else:
-
-
-
-            st.

     # Add a special section for direct Google Drive file download
     st.markdown("---")
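The removed block above rendered one row per discovered file, with a button that kicks off the async download. A pared-down sketch of that pattern, assuming app.py's DownloadManager is importable; the files list, download_dir and url arguments are placeholders.

# Per-file download rows in Streamlit, driving the async DownloadManager.
import asyncio
import streamlit as st
from app import DownloadManager  # import path assumed

def render_file_rows(files, download_dir, url, use_proxy=False, proxy=None, use_stealth=False):
    for i, file in enumerate(files):
        col1, col2 = st.columns([3, 1])
        with col1:
            st.markdown(f"**{i+1}. {file['filename']} ({file['size']})**")
        with col2:
            if st.button("Download", key=f"direct_dl_{i}"):
                async def download_one():
                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy,
                                               use_stealth=use_stealth) as dm:
                        return await dm.download_file(file, download_dir, url)
                path = asyncio.run(download_one())
                if path:
                    st.success(f"Saved to {path}")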
@@ -3400,7 +4261,11 @@ def main():

     # Add footer with attribution
     st.markdown('---')
-    st.markdown('Created by [Euler314](https://github.com/

 if __name__ == "__main__":
     main()
|
|
|
7 |
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
8 |
import asyncio
|
9 |
import logging
|
10 |
+
from urllib.parse import urlparse, urljoin, unquote, parse_qs, quote
|
11 |
import re
|
12 |
from pathlib import Path
|
13 |
from io import BytesIO
|
|
|
32 |
import google.auth.transport.requests
|
33 |
import googleapiclient.http
|
34 |
|
35 |
+
# Enhanced RAG search imports
|
36 |
import nltk
|
37 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
38 |
from sklearn.metrics.pairwise import cosine_similarity
|
39 |
import numpy as np
|
40 |
import docx2txt
|
41 |
+
try:
|
42 |
+
from langdetect import detect as detect_language
|
43 |
+
except ImportError:
|
44 |
+
# If langdetect is not available, we'll use a simple fallback
|
45 |
+
def detect_language(text):
|
46 |
+
return "en"
|
47 |
+
|
48 |
+
# Try to download NLTK data if not already present
|
49 |
+
try:
|
50 |
+
nltk.data.find('tokenizers/punkt')
|
51 |
+
except LookupError:
|
52 |
+
try:
|
53 |
+
nltk.download('punkt', quiet=True)
|
54 |
+
except:
|
55 |
+
pass
|
56 |
|
57 |
# -------------------- Logging Setup --------------------
|
58 |
logging.basicConfig(
|
|
|
110 |
"proxies": [] # Will be populated from the UI if needed
|
111 |
}
|
112 |
|
113 |
+
# -------------------- Enhanced RAG Search Class --------------------
|
114 |
+
class EnhancedRAGSearch:
|
115 |
def __init__(self):
|
116 |
self.file_texts = []
|
117 |
+
self.chunks = [] # Document chunks for more targeted search
|
118 |
+
self.chunk_metadata = [] # Metadata for each chunk
|
119 |
self.file_metadata = []
|
120 |
+
self.vectorizer = TfidfVectorizer(
|
121 |
+
stop_words='english',
|
122 |
+
ngram_range=(1, 2), # Use bigrams for better context
|
123 |
+
max_features=10000, # Use more features for better representation
|
124 |
+
min_df=2 # Minimum document frequency
|
125 |
+
)
|
126 |
self.vectors = None
|
127 |
+
self.chunk_vectors = None
|
128 |
+
self.languages = []
|
129 |
|
130 |
def add_file(self, file_data, file_info):
|
131 |
+
"""Add a file to the search index with improved processing"""
|
132 |
+
file_ext = os.path.splitext(file_info['filename'])[1].lower()
|
133 |
text = self.extract_text(file_data, file_ext)
|
134 |
+
|
135 |
if text:
|
136 |
+
# Store the whole document text
|
137 |
self.file_texts.append(text)
|
138 |
self.file_metadata.append(file_info)
|
139 |
+
|
140 |
+
# Try to detect language
|
141 |
+
try:
|
142 |
+
lang = detect_language(text[:1000]) # Use just the first 1000 chars for speed
|
143 |
+
self.languages.append(lang)
|
144 |
+
except:
|
145 |
+
self.languages.append('en') # Default to English
|
146 |
+
|
147 |
+
# Create chunks for more granular search
|
148 |
+
chunks = self.create_chunks(text)
|
149 |
+
for chunk in chunks:
|
150 |
+
self.chunks.append(chunk)
|
151 |
+
self.chunk_metadata.append({
|
152 |
+
'file_info': file_info,
|
153 |
+
'chunk_size': len(chunk),
|
154 |
+
'file_index': len(self.file_texts) - 1
|
155 |
+
})
|
156 |
+
|
157 |
return True
|
158 |
return False
|
159 |
|
160 |
+
def create_chunks(self, text, chunk_size=1000, overlap=200):
|
161 |
+
"""Split text into overlapping chunks for better search precision"""
|
162 |
+
# Try to use NLTK for sentence-aware chunking
|
163 |
+
try:
|
164 |
+
sentences = nltk.sent_tokenize(text)
|
165 |
+
chunks = []
|
166 |
+
current_chunk = ""
|
167 |
+
|
168 |
+
for sentence in sentences:
|
169 |
+
if len(current_chunk) + len(sentence) <= chunk_size:
|
170 |
+
current_chunk += sentence + " "
|
171 |
+
else:
|
172 |
+
# Add current chunk if it has content
|
173 |
+
if current_chunk:
|
174 |
+
chunks.append(current_chunk.strip())
|
175 |
+
|
176 |
+
# Start new chunk with overlap from previous chunk
|
177 |
+
if len(current_chunk) > overlap:
|
178 |
+
# Find the last space within the overlap region
|
179 |
+
overlap_text = current_chunk[-overlap:]
|
180 |
+
last_space = overlap_text.rfind(' ')
|
181 |
+
if last_space != -1:
|
182 |
+
current_chunk = current_chunk[-(overlap-last_space):] + sentence + " "
|
183 |
+
else:
|
184 |
+
current_chunk = sentence + " "
|
185 |
+
else:
|
186 |
+
current_chunk = sentence + " "
|
187 |
+
|
188 |
+
# Add the last chunk if it has content
|
189 |
+
if current_chunk:
|
190 |
+
chunks.append(current_chunk.strip())
|
191 |
+
|
192 |
+
return chunks
|
193 |
+
except:
|
194 |
+
# Fallback to simpler chunking approach
|
195 |
+
chunks = []
|
196 |
+
for i in range(0, len(text), chunk_size - overlap):
|
197 |
+
chunk = text[i:i + chunk_size]
|
198 |
+
if chunk:
|
199 |
+
chunks.append(chunk)
|
200 |
+
return chunks
|
201 |
+
|
202 |
def extract_text(self, file_data, file_ext):
|
203 |
+
"""Extract text from different file types with enhanced support"""
|
204 |
try:
|
205 |
if file_ext.lower() == '.pdf':
|
206 |
reader = PyPDF2.PdfReader(BytesIO(file_data))
|
207 |
text = ""
|
208 |
for page in reader.pages:
|
209 |
+
extracted = page.extract_text()
|
210 |
+
if extracted:
|
211 |
+
text += extracted + "\n"
|
212 |
+
# If text extraction fails, try to OCR (would need extra libraries)
|
213 |
return text
|
214 |
elif file_ext.lower() in ['.docx', '.doc']:
|
215 |
return docx2txt.process(BytesIO(file_data))
|
216 |
+
elif file_ext.lower() in ['.txt', '.csv', '.json', '.html', '.htm']:
|
217 |
+
# Handle both UTF-8 and other common encodings
|
218 |
+
try:
|
219 |
+
return file_data.decode('utf-8', errors='ignore')
|
220 |
+
except:
|
221 |
+
encodings = ['latin-1', 'iso-8859-1', 'windows-1252']
|
222 |
+
for enc in encodings:
|
223 |
+
try:
|
224 |
+
return file_data.decode(enc, errors='ignore')
|
225 |
+
except:
|
226 |
+
pass
|
227 |
+
# Last resort fallback
|
228 |
return file_data.decode('utf-8', errors='ignore')
|
229 |
+
elif file_ext.lower() in ['.pptx', '.ppt', '.xlsx', '.xls']:
|
230 |
+
# For these types, we would need additional libraries
|
231 |
+
# For now, return a placeholder message
|
232 |
+
return f"[Content of {file_ext} file - install additional libraries for full text extraction]"
|
233 |
else:
|
234 |
return ""
|
235 |
except Exception as e:
|
|
|
237 |
return ""
|
238 |
|
239 |
def build_index(self):
|
240 |
+
"""Build both document and chunk search indices"""
|
241 |
if not self.file_texts:
|
242 |
return False
|
243 |
+
|
244 |
try:
|
245 |
+
# Build document-level index
|
246 |
self.vectors = self.vectorizer.fit_transform(self.file_texts)
|
247 |
+
|
248 |
+
# Build chunk-level index if we have chunks
|
249 |
+
if self.chunks:
|
250 |
+
self.chunk_vectors = self.vectorizer.transform(self.chunks)
|
251 |
+
|
252 |
return True
|
253 |
except Exception as e:
|
254 |
logger.error(f"Error building search index: {e}")
|
255 |
return False
|
256 |
|
257 |
+
def expand_query(self, query):
|
258 |
+
"""Add related terms to query for better recall"""
|
259 |
+
# This is a simple implementation - could be enhanced with a proper synonym API
|
260 |
+
expanded_terms = []
|
261 |
+
|
262 |
+
# Add some common expansions for document search
|
263 |
+
if "exam" in query.lower():
|
264 |
+
expanded_terms.extend(["test", "assessment", "quiz", "paper"])
|
265 |
+
elif "document" in query.lower():
|
266 |
+
expanded_terms.extend(["file", "paper", "report"])
|
267 |
+
elif "manual" in query.lower():
|
268 |
+
expanded_terms.extend(["guide", "instruction", "documentation"])
|
269 |
+
|
270 |
+
# Return original query plus expanded terms
|
271 |
+
if expanded_terms:
|
272 |
+
return f"{query} {' '.join(expanded_terms)}"
|
273 |
+
return query
|
274 |
+
|
275 |
+
def search(self, query, top_k=5, search_chunks=True):
|
276 |
+
"""Enhanced search with both document and chunk-level search"""
|
277 |
if self.vectors is None:
|
278 |
return []
|
279 |
|
280 |
try:
|
281 |
+
# Expand the query for better recall
|
282 |
+
expanded_query = self.expand_query(query)
|
283 |
+
|
284 |
+
# Transform the query
|
285 |
+
query_vector = self.vectorizer.transform([expanded_query])
|
286 |
|
287 |
results = []
|
288 |
+
|
289 |
+
# First search at document level for higher-level matches
|
290 |
+
if self.vectors is not None:
|
291 |
+
doc_similarities = cosine_similarity(query_vector, self.vectors).flatten()
|
292 |
+
top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
|
293 |
+
|
294 |
+
for i, idx in enumerate(top_doc_indices):
|
295 |
+
if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results
|
296 |
+
results.append({
|
297 |
+
'file_info': self.file_metadata[idx],
|
298 |
+
'score': float(doc_similarities[idx]),
|
299 |
+
'rank': i+1,
|
300 |
+
'match_type': 'document',
|
301 |
+
'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
|
302 |
+
})
|
303 |
+
|
304 |
+
# Then search at chunk level for more specific matches if enabled
|
305 |
+
if search_chunks and self.chunk_vectors is not None:
|
306 |
+
chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten()
|
307 |
+
top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results
|
308 |
+
|
309 |
+
# Use a set to avoid duplicate file results
|
310 |
+
seen_files = set(r['file_info']['url'] for r in results)
|
311 |
+
|
312 |
+
for i, idx in enumerate(top_chunk_indices):
|
313 |
+
if chunk_similarities[idx] > 0.15: # Higher threshold for chunks
|
314 |
+
file_index = self.chunk_metadata[idx]['file_index']
|
315 |
+
file_info = self.file_metadata[file_index]
|
316 |
+
|
317 |
+
# Only add if we haven't already included this file
|
318 |
+
if file_info['url'] not in seen_files:
|
319 |
+
seen_files.add(file_info['url'])
|
320 |
+
results.append({
|
321 |
+
'file_info': file_info,
|
322 |
+
'score': float(chunk_similarities[idx]),
|
323 |
+
'rank': len(results) + 1,
|
324 |
+
'match_type': 'chunk',
|
325 |
+
'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
|
326 |
+
'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
|
327 |
+
})
|
328 |
+
|
329 |
+
# Stop after we've found enough results
|
330 |
+
if len(results) >= top_k*1.5:
|
331 |
+
break
|
332 |
+
|
333 |
+
# Sort combined results by score
|
334 |
+
results.sort(key=lambda x: x['score'], reverse=True)
|
335 |
+
|
336 |
+
# Re-rank and truncate
|
337 |
+
for i, result in enumerate(results[:top_k]):
|
338 |
+
result['rank'] = i+1
|
339 |
+
|
340 |
+
return results[:top_k]
|
341 |
except Exception as e:
|
342 |
logger.error(f"Error during search: {e}")
|
343 |
return []
|
|
|
397 |
html_lower = html_content.lower()
|
398 |
return any(pattern in html_lower for pattern in captcha_patterns)
|
399 |
|
400 |
+
def is_download_link(url):
|
401 |
+
"""Enhanced function to detect if a URL is likely a download link"""
|
402 |
+
# Check for obvious download indicators in URL
|
403 |
+
url_lower = url.lower()
|
404 |
+
|
405 |
+
# Check for common download-related terms in the URL
|
406 |
+
download_terms = [
|
407 |
+
'download', 'dl', 'get', 'file', 'attachment', 'export', 'view',
|
408 |
+
'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document'
|
409 |
+
]
|
410 |
+
if any(term in url_lower for term in download_terms):
|
411 |
+
return True
|
412 |
+
|
413 |
+
# Check for common download script patterns
|
414 |
+
script_patterns = [
|
415 |
+
'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php',
|
416 |
+
'download.aspx', 'getfile.aspx', 'file.aspx',
|
417 |
+
'downloadhandler', 'filehandler', 'filedownload',
|
418 |
+
'download.jsp', 'download.cgi', 'download.do',
|
419 |
+
'download-file', 'get-file',
|
420 |
+
'downloadfile', 'getfile', 'viewfile',
|
421 |
+
'Action=downloadfile', 'action=download', 'action=view',
|
422 |
+
'download?', 'file?', 'get?', 'view?'
|
423 |
+
]
|
424 |
+
if any(pattern in url_lower for pattern in script_patterns):
|
425 |
+
return True
|
426 |
+
|
427 |
+
# Check for common file extensions in the URL path or parameters
|
428 |
+
path = urlparse(url).path
|
429 |
+
common_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
|
430 |
+
'.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg',
|
431 |
+
'.png', '.gif', '.mp3', '.mp4', '.avi', '.mov']
|
432 |
+
|
433 |
+
if any(ext in path.lower() for ext in common_extensions):
|
434 |
+
return True
|
435 |
+
|
436 |
+
# Check for file ID or file parameters in URL
|
437 |
+
params = parse_qs(urlparse(url).query)
|
438 |
+
param_keys = params.keys()
|
439 |
+
file_param_indicators = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid']
|
440 |
+
if any(key.lower() in file_param_indicators for key in param_keys):
|
441 |
+
return True
|
442 |
+
|
443 |
+
# Check for complex encoding patterns like in the example URL
|
444 |
+
if 'Action=downloadfile' in url or 'fname=' in url:
|
445 |
+
return True
|
446 |
+
|
447 |
+
return False
|
448 |
+
|
449 |
+
def normalize_download_url(url):
|
450 |
+
"""Normalize download URLs to handle various formats and encodings"""
|
451 |
+
try:
|
452 |
+
# Handle common URL shorteners and redirections
|
453 |
+
parsed = urlparse(url)
|
454 |
+
|
455 |
+
# Handle phpMyAdmin-style encoded URLs
|
456 |
+
if 'Action=downloadfile' in url and 'file=' in url:
|
457 |
+
# Extract the encoded file parameter
|
458 |
+
params = parse_qs(parsed.query)
|
459 |
+
if 'file' in params:
|
460 |
+
# This is just a placeholder - in a real implementation,
|
461 |
+
# you would need to handle the specific encoding used
|
462 |
+
encoded_file = params['file'][0]
|
463 |
+
# Keep the URL as is for now, since we'll handle it during download
|
464 |
+
return url
|
465 |
+
|
466 |
+
# Handle URLs with fname parameter (like in the example)
|
467 |
+
if 'fname=' in url:
|
468 |
+
# Keep as is - we'll handle this specially during download
|
469 |
+
return url
|
470 |
+
|
471 |
+
# For other URLs, make sure they are properly quoted
|
472 |
+
path = parsed.path
|
473 |
+
# Only quote the path portion if needed
|
474 |
+
if '%' not in path and ' ' in path:
|
475 |
+
path = quote(path)
|
476 |
+
|
477 |
+
# Reconstruct the URL
|
478 |
+
normalized = parsed._replace(path=path).geturl()
|
479 |
+
return normalized
|
480 |
+
except Exception as e:
|
481 |
+
logger.error(f"Error normalizing URL {url}: {e}")
|
482 |
+
return url
|
483 |
+
|
484 |
# -------------------- Google Drive Functions --------------------
|
485 |
def get_google_auth_url():
|
486 |
client_config = GOOGLE_OAUTH_CONFIG["web"]
|
|
|
573 |
self.request_count = 0
|
574 |
self.captcha_detected = False
|
575 |
self.download_timeout = 300 # 5 minutes timeout for downloads
|
576 |
+
# Track visited URLs to avoid revisiting the same URL multiple times
|
577 |
+
self.visited_urls = set()
|
578 |
+
# Track successfully downloaded files to avoid redownloading
|
579 |
+
self.downloaded_files = set()
|
580 |
|
581 |
async def __aenter__(self):
|
582 |
self.playwright = await async_playwright().start()
|
|
|
857 |
try:
|
858 |
await self.rotate_proxy_if_needed()
|
859 |
|
860 |
+
# For complex download URLs, we need to be careful with HEAD requests
|
861 |
+
if '?' in url or 'Action=downloadfile' in url or 'fname=' in url:
|
862 |
+
# For these URLs, we'll try a more reliable approach using range headers
|
863 |
+
headers = {
|
864 |
+
'User-Agent': get_random_user_agent(),
|
865 |
+
'Range': 'bytes=0-0' # Just request the first byte to check headers
|
866 |
+
}
|
867 |
+
|
868 |
+
try:
|
869 |
+
with requests.get(url, headers=headers, stream=True, timeout=10) as r:
|
870 |
+
if 'Content-Range' in r.headers:
|
871 |
+
content_range = r.headers['Content-Range']
|
872 |
+
match = re.search(r'bytes 0-0/(\d+)', content_range)
|
873 |
+
if match:
|
874 |
+
size = int(match.group(1))
|
875 |
+
return sizeof_fmt(size)
|
876 |
+
|
877 |
+
if 'Content-Length' in r.headers:
|
878 |
+
size = int(r.headers['Content-Length'])
|
879 |
+
# If size is 1, it's likely just our single requested byte
|
880 |
+
if size > 1:
|
881 |
+
return sizeof_fmt(size)
|
882 |
+
except Exception as e:
|
883 |
+
logger.warning(f"Error getting file size with Range request: {e}")
|
884 |
+
|
885 |
+
# Fallback to browser approach
|
886 |
+
try:
|
887 |
+
async with self.context.new_page() as page:
|
888 |
+
response = await page.request.head(url, timeout=15000)
|
889 |
+
length = response.headers.get('Content-Length', None)
|
890 |
+
if length:
|
891 |
+
return sizeof_fmt(int(length))
|
892 |
+
except Exception as e:
|
893 |
+
logger.warning(f"Error getting file size with browser: {e}")
|
894 |
+
|
895 |
+
return "Unknown Size"
|
896 |
+
else:
|
897 |
+
# Standard approach for normal URLs
|
898 |
+
async with self.context.new_page() as page:
|
899 |
+
response = await page.request.head(url, timeout=15000)
|
900 |
+
length = response.headers.get('Content-Length', None)
|
901 |
+
if length:
|
902 |
+
return sizeof_fmt(int(length))
|
903 |
+
else:
|
904 |
+
return "Unknown Size"
|
905 |
except Exception as e:
|
906 |
logger.warning(f"Error getting file size: {e}")
|
907 |
return "Unknown Size"
|
|
|
928 |
return {}
|
929 |
|
930 |
async def extract_real_download_url(self, url):
|
931 |
+
"""Enhanced method to extract real download URL, handling complex URLs"""
|
932 |
try:
|
933 |
+
# Check if this is a complex download URL that needs special handling
|
934 |
+
if 'Action=downloadfile' in url or 'fname=' in url:
|
935 |
+
logger.info(f"Complex download URL detected: {url}")
|
936 |
+
|
937 |
+
# For these special cases, we'll use the browser to navigate and intercept redirects
|
938 |
+
await self.rotate_proxy_if_needed()
|
939 |
+
|
940 |
+
async with self.context.new_page() as page:
|
941 |
+
# Set up request interception to capture redirects
|
942 |
+
await page.route('**', lambda route: route.continue_())
|
943 |
+
|
944 |
+
# Listen for all responses
|
945 |
+
responses = []
|
946 |
+
page.on('response', lambda response: responses.append(response))
|
947 |
+
|
948 |
+
try:
|
949 |
+
# Go to the URL
|
950 |
+
await page.goto(url, wait_until='networkidle', timeout=30000)
|
951 |
+
|
952 |
+
# Check all responses for potential downloads
|
953 |
+
for response in responses:
|
954 |
+
# Look for content-disposition headers indicating a download
|
955 |
+
content_disposition = response.headers.get('Content-Disposition', '')
|
956 |
+
if 'attachment' in content_disposition or 'filename=' in content_disposition:
|
957 |
+
return response.url
|
958 |
+
|
959 |
+
# Look for content-type headers indicating a file
|
960 |
+
content_type = response.headers.get('Content-Type', '')
|
961 |
+
if content_type and content_type != 'text/html' and not content_type.startswith('text/'):
|
962 |
+
return response.url
|
963 |
+
|
964 |
+
# If no clear download was detected, return the final URL
|
965 |
+
return page.url
|
966 |
+
except Exception as e:
|
967 |
+
logger.warning(f"Error extracting real download URL: {e}")
|
968 |
+
return url
|
969 |
+
else:
|
970 |
+
# Standard approach for normal URLs
|
971 |
+
await self.rotate_proxy_if_needed()
|
972 |
+
|
973 |
+
async with self.context.new_page() as page:
|
974 |
+
response = await page.goto(url, wait_until='networkidle', timeout=30000)
|
975 |
+
if response and response.headers.get('location'):
|
976 |
+
return response.headers['location']
|
977 |
+
return page.url
|
978 |
except Exception as e:
|
979 |
logger.error(f"Error extracting real download URL: {e}")
|
980 |
return url
|
|
|
1042 |
if any(full_url.lower().endswith(ext) for ext in
|
1043 |
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
1044 |
links.add(full_url)
|
1045 |
+
|
1046 |
+
# Check for download script parameters
|
1047 |
+
if "Action=downloadfile" in url or "fname=" in url:
|
1048 |
+
links.add(url) # Add the URL itself as it's a download link
|
1049 |
except Exception as e:
|
1050 |
logger.warning(f"Request-based extraction failed: {e}")
|
1051 |
|
1052 |
# Browser-based approach for more thorough extraction or if initial approach was inadequate
|
1053 |
try:
|
1054 |
# Check if we need to proceed with browser-based extraction
|
1055 |
+
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url:
|
1056 |
logger.info("Using browser for enhanced link extraction")
|
1057 |
|
1058 |
# Rotate proxy if needed
|
|
|
1144 |
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
1145 |
links.add(href)
|
1146 |
|
1147 |
+
# Check for download links in the page
|
1148 |
+
download_links = await self.page.evaluate("""
|
1149 |
+
() => {
|
1150 |
+
// Find all links that might be download links
|
1151 |
+
const links = Array.from(document.querySelectorAll('a[href]'));
|
1152 |
+
return links
|
1153 |
+
.filter(a => {
|
1154 |
+
const href = a.href.toLowerCase();
|
1155 |
+
return href.includes('download') ||
|
1156 |
+
href.includes('getfile') ||
|
1157 |
+
href.includes('view.php') ||
|
1158 |
+
href.includes('action=downloadfile') ||
|
1159 |
+
href.includes('fname=');
|
1160 |
+
})
|
1161 |
+
.map(a => a.href);
|
1162 |
+
}
|
1163 |
+
""")
|
1164 |
+
|
1165 |
+
for dl_link in download_links:
|
1166 |
+
links.add(dl_link)
|
1167 |
+
|
1168 |
# Check for ASP.NET specific elements that might contain exam links
|
1169 |
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
|
1170 |
for grid in grid_elements:
|
|
|
1293 |
"/resource/", "/material/", "/notes/", "/subjectmaterial/"
|
1294 |
]):
|
1295 |
filtered_links.append(link)
|
1296 |
+
continue
|
1297 |
+
|
1298 |
+
# Check for download links (these may not have obvious extensions)
|
1299 |
+
if is_download_link(link):
|
1300 |
+
filtered_links.append(link)
|
1301 |
|
1302 |
logger.info(f"Found {len(filtered_links)} potential exam document links")
|
1303 |
return filtered_links
|
|
|
1325 |
}
|
1326 |
}
|
1327 |
|
1328 |
+
// Look for download-related variables in scripts
|
1329 |
+
for (const script of scripts) {
|
1330 |
+
const content = script.textContent || '';
|
1331 |
+
// Look for common patterns for file URLs in JavaScript
|
1332 |
+
if (content.includes('downloadURL') || content.includes('fileURL') ||
|
1333 |
+
content.includes('pdfURL') || content.includes('documentURL')) {
|
1334 |
+
|
1335 |
+
// Extract potential URLs
|
1336 |
+
const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || [];
|
1337 |
+
for (let match of potentialUrls) {
|
1338 |
+
const url = match.replace(/["']/g, '');
|
1339 |
+
// Try to resolve relative URLs
|
1340 |
+
if (url.startsWith('/') || !url.includes('://')) {
|
1341 |
+
if (url.startsWith('/')) {
|
1342 |
+
links.add(window.location.origin + url);
|
1343 |
+
} else {
|
1344 |
+
// Handle relative paths more carefully
|
1345 |
+
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
|
1346 |
+
links.add(base + url);
|
1347 |
+
}
|
1348 |
+
} else if (url.startsWith('http')) {
|
1349 |
+
links.add(url);
|
1350 |
+
}
|
1351 |
+
}
|
1352 |
+
}
|
1353 |
+
}
|
1354 |
+
|
1355 |
// Check for links in data attributes
|
1356 |
+
const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]');
|
1357 |
for (const el of elements) {
|
1358 |
+
for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) {
|
1359 |
const val = el.getAttribute(attr);
|
1360 |
+
if (val) {
|
1361 |
+
// Try to resolve relative URLs
|
1362 |
+
if (val.startsWith('/')) {
|
1363 |
+
links.add(window.location.origin + val);
|
1364 |
+
} else if (val.startsWith('http')) {
|
1365 |
+
links.add(val);
|
1366 |
+
} else if (!val.startsWith('javascript:') && !val.startsWith('#')) {
|
1367 |
+
// Handle relative paths
|
1368 |
+
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
|
1369 |
+
links.add(base + val);
|
1370 |
+
}
|
1371 |
}
|
1372 |
}
|
1373 |
}
|
1374 |
|
1375 |
// Look for URLs in inline event handlers
|
1376 |
+
const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]');
|
1377 |
for (const el of clickableElements) {
|
1378 |
+
for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) {
|
1379 |
const val = el.getAttribute(attr);
|
1380 |
if (val) {
|
1381 |
+
// Check for JavaScript URLs with window.location
|
1382 |
+
if (val.includes('window.location') || val.includes('document.location')) {
|
1383 |
+
const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/);
|
1384 |
+
if (urlMatch && urlMatch[1]) {
|
1385 |
+
const url = urlMatch[1];
|
1386 |
+
if (url.startsWith('/')) {
|
1387 |
+
links.add(window.location.origin + url);
|
1388 |
+
} else if (url.startsWith('http')) {
|
1389 |
+
links.add(url);
|
1390 |
+
} else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
|
1391 |
+
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
|
1392 |
+
links.add(base + url);
|
1393 |
+
}
|
1394 |
+
}
|
1395 |
+
}
|
1396 |
+
|
1397 |
+
// Check for direct URLs in attributes
|
1398 |
const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
|
1399 |
for (let match of urlMatches) {
|
1400 |
links.add(match.replace(/["']/g, ''));
|
1401 |
}
|
1402 |
+
|
1403 |
+
// Check for download.php and similar patterns
|
1404 |
+
if (val.includes('download.php') || val.includes('getfile.php') ||
|
1405 |
+
val.includes('Action=downloadfile') || val.includes('viewfile.php')) {
|
1406 |
+
|
1407 |
+
// Handle both onclick handlers and direct hrefs
|
1408 |
+
let url = '';
|
1409 |
+
if (attr === 'href') {
|
1410 |
+
url = val;
|
1411 |
+
} else {
|
1412 |
+
// Extract URL from JavaScript
|
1413 |
+
const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i);
|
1414 |
+
if (jsUrlMatch) {
|
1415 |
+
url = jsUrlMatch[1];
|
1416 |
+
}
|
1417 |
+
}
|
1418 |
+
|
1419 |
+
// Resolve URL if needed
|
1420 |
+
if (url) {
|
1421 |
+
if (url.startsWith('/')) {
|
1422 |
+
links.add(window.location.origin + url);
|
1423 |
+
} else if (url.startsWith('http')) {
|
1424 |
+
links.add(url);
|
1425 |
+
} else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
|
1426 |
+
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
|
1427 |
+
links.add(base + url);
|
1428 |
+
}
|
1429 |
+
}
|
1430 |
+
}
|
1431 |
}
|
1432 |
}
|
1433 |
}
|
1434 |
|
1435 |
+
// Find PHP/ASP file download links
|
1436 |
+
const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]');
|
1437 |
+
for (const link of fileLinks) {
|
1438 |
+
links.add(link.href);
|
1439 |
+
}
|
1440 |
+
|
1441 |
return Array.from(links);
|
1442 |
}
|
1443 |
""")
|
|
|
1504 |
for link in shadow_links:
|
1505 |
hidden_links.add(link)
|
1506 |
|
1507 |
+
# Look for download links in forms
|
1508 |
+
form_links = await page.evaluate("""
|
1509 |
+
() => {
|
1510 |
+
const links = new Set();
|
1511 |
+
|
1512 |
+
// Check for form actions that might be download endpoints
|
1513 |
+
const forms = document.querySelectorAll('form');
|
1514 |
+
for (const form of forms) {
|
1515 |
+
const action = form.action || '';
|
1516 |
+
if (action && (
|
1517 |
+
action.includes('download') ||
|
1518 |
+
action.includes('getfile') ||
|
1519 |
+
action.includes('viewfile') ||
|
1520 |
+
action.includes('Action=downloadfile')
|
1521 |
+
)) {
|
1522 |
+
// Collect input values that might be needed for the download
|
1523 |
+
const inputs = {};
|
1524 |
+
const formInputs = form.querySelectorAll('input[name]');
|
1525 |
+
for (const input of formInputs) {
|
1526 |
+
inputs[input.name] = input.value;
|
1527 |
+
}
|
1528 |
+
|
1529 |
+
// Store both the form action and any important inputs
|
1530 |
+
links.add(action);
|
1531 |
+
}
|
1532 |
+
}
|
1533 |
+
|
1534 |
+
return Array.from(links);
|
1535 |
+
}
|
1536 |
+
""")
|
1537 |
+
|
1538 |
+
for link in form_links:
|
1539 |
+
hidden_links.add(link)
|
1540 |
+
|
1541 |
return hidden_links
|
1542 |
|
1543 |
async def extract_downloadable_files(self, url, custom_ext_list):
|
1544 |
found_files = []
|
1545 |
try:
|
1546 |
+
# Normalize the URL to handle special cases
|
1547 |
+
normalized_url = normalize_download_url(url)
|
1548 |
+
|
1549 |
+
# Skip if we've already visited this URL
|
1550 |
+
if normalized_url in self.visited_urls:
|
1551 |
+
logger.info(f"Skipping already visited URL: {normalized_url}")
|
1552 |
+
return []
|
1553 |
+
|
1554 |
+
# Mark this URL as visited
|
1555 |
+
self.visited_urls.add(normalized_url)
|
1556 |
+
|
1557 |
# Rotate proxy if needed
|
1558 |
await self.rotate_proxy_if_needed()
|
1559 |
|
1560 |
+
# First check if this is a direct download link (Action=downloadfile or fname parameter)
|
1561 |
+
if is_download_link(normalized_url):
|
1562 |
+
logger.info(f"Processing potential direct download link: {normalized_url}")
|
1563 |
+
|
1564 |
+
# Try to extract the real download URL if needed
|
1565 |
+
real_url = await self.extract_real_download_url(normalized_url)
|
1566 |
+
|
1567 |
+
# Determine filename - for complex URLs this can be tricky
|
1568 |
+
filename = os.path.basename(urlparse(real_url).path)
|
1569 |
+
|
1570 |
+
# Handle URL-encoded filenames
|
1571 |
+
if '%' in filename:
|
1572 |
+
try:
|
1573 |
+
filename = unquote(filename)
|
1574 |
+
except Exception:
|
1575 |
+
pass
|
1576 |
+
|
1577 |
+
# For URLs with download parameters, try to extract filename from query
|
1578 |
+
if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
|
1579 |
+
# Look for file parameter
|
1580 |
+
params = parse_qs(urlparse(normalized_url).query)
|
1581 |
+
|
1582 |
+
# Check common filename parameters
|
1583 |
+
for param in ['file', 'filename', 'name', 'fname', 'f']:
|
1584 |
+
if param in params and params[param]:
|
1585 |
+
potential_filename = params[param][0]
|
1586 |
+
if potential_filename and '/' not in potential_filename and '\\' not in potential_filename:
|
1587 |
+
filename = os.path.basename(potential_filename)
|
1588 |
+
break
|
1589 |
+
|
1590 |
+
# If still no valid filename, use domain-based fallback
|
1591 |
+
if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
|
1592 |
+
domain = get_domain(real_url)
|
1593 |
+
# Try to determine file type from content-type or extension hints in URL
|
1594 |
+
ext = '.pdf' # Default
|
1595 |
+
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
|
1596 |
+
if common_ext in normalized_url.lower():
|
1597 |
+
ext = common_ext
|
1598 |
+
break
|
1599 |
+
filename = f"file_from_{domain}{ext}"
|
1600 |
+
|
1601 |
+
# Get file size
|
1602 |
+
size_str = await self.get_file_size(real_url)
|
1603 |
+
|
1604 |
+
# Add to found files
|
1605 |
+
found_files.append({
|
1606 |
+
'url': real_url,
|
1607 |
+
'filename': filename,
|
1608 |
+
'size': size_str,
|
1609 |
+
'metadata': {},
|
1610 |
+
'download_url': normalized_url # Keep original URL for downloading
|
1611 |
+
})
|
1612 |
+
|
1613 |
+
# For direct download links, we can return early
|
1614 |
+
if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)):
|
1615 |
+
return found_files
|
1616 |
+
|
1617 |
# Special handling for educational exam sites
|
1618 |
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
|
1619 |
["exam", "test", "pastpaper", "eduexp"]):
|
|
|
1655 |
'url': real_url,
|
1656 |
'filename': filename,
|
1657 |
'size': size_str,
|
1658 |
+
'metadata': meta,
|
1659 |
+
'download_url': link # Store original link for downloading
|
1660 |
})
|
1661 |
|
1662 |
# If we found exam files with the specialized method, return them
|
|
|
1717 |
'url': real_url,
|
1718 |
'filename': filename,
|
1719 |
'size': await self.get_file_size(real_url),
|
1720 |
+
'metadata': {},
|
1721 |
+
'download_url': final_url # Keep original URL for downloading
|
1722 |
})
|
1723 |
return found_files
|
1724 |
|
|
|
1739 |
for a in soup.find_all('a', href=True):
|
1740 |
href = a['href'].strip()
|
1741 |
|
1742 |
+
if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower():
|
1743 |
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
|
1744 |
real_url = await self.extract_real_download_url(full_url)
|
1745 |
if real_url and real_url != full_url:
|
|
|
1747 |
'url': real_url,
|
1748 |
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
|
1749 |
'size': await self.get_file_size(real_url),
|
1750 |
+
'metadata': {},
|
1751 |
+
'download_url': full_url # Original URL for download
|
1752 |
})
|
1753 |
continue
|
1754 |
|
|
|
1762 |
'url': file_url,
|
1763 |
'filename': os.path.basename(file_url.split('?')[0]),
|
1764 |
'size': size_str,
|
1765 |
+
'metadata': meta,
|
1766 |
+
'download_url': file_url # Same as URL for direct links
|
1767 |
})
|
1768 |
|
1769 |
# Handle Google Drive links
|
|
|
1793 |
'view_only': is_view_only,
|
1794 |
'file_type': file_type,
|
1795 |
'file_id': file_id
|
1796 |
+
},
|
1797 |
+
'download_url': href # Same as URL for Google Drive
|
1798 |
})
|
1799 |
|
1800 |
# Also check for files in other elements (iframe, embed, object, etc.)
|
|
|
...
                    'url': file_url,
                    'filename': os.path.basename(file_url.split('?')[0]),
                    'size': size_str,
+                    'metadata': meta,
+                    'download_url': file_url
                })

        # Check for file links in onclick attributes
...
                        'url': url_match,
                        'filename': os.path.basename(url_match.split('?')[0]),
                        'size': size_str,
+                        'metadata': meta,
+                        'download_url': url_match
                    })

        # Also check for data-src and data-url attributes (common in lazy-loaded sites)
...
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
                        'size': await self.get_file_size(file_url),
+                        'metadata': {},
+                        'download_url': file_url
                    })
                except:
                    pass
...
                        'url': json_url,
                        'filename': os.path.basename(json_url.split('?')[0]),
                        'size': await self.get_file_size(json_url),
+                        'metadata': {},
+                        'download_url': json_url
                    })
                except:
                    pass
...
                    'url': href,
                    'filename': os.path.basename(href.split('?')[0]),
                    'size': await self.get_file_size(href),
+                    'metadata': {},
+                    'download_url': href
                })

        # Check for hidden links that might be in JavaScript, iframes, or dynamic content
...
                    'url': link,
                    'filename': os.path.basename(link.split('?')[0]),
                    'size': await self.get_file_size(link),
+                    'metadata': {},
+                    'download_url': link
                })

        # Deduplicate files by URL
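The deduplication step announced by the comment above is collapsed in this view. One plausible implementation, keyed on the download URL so the same file reached through different links is kept only once (a sketch, not necessarily the exact logic in app.py):

```python
def deduplicate_files(found_files):
    """Keep the first entry seen for each download URL."""
    seen = set()
    unique_files = []
    for info in found_files:
        key = info.get('download_url') or info['url']
        if key not in seen:
            seen.add(key)
            unique_files.append(info)
    return unique_files
```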
...
            return []

    async def download_file(self, file_info, save_dir, referer):
+        file_url = file_info.get('download_url', file_info['url'])  # Use download_url if available
        fname = file_info['filename']
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
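Between `base, ext = os.path.splitext(fname)` and the `counter += 1` that follows, the collision-handling loop is collapsed in this view. The usual pattern, and roughly what the surrounding lines imply, is to append a counter until the target path is free (a sketch under that assumption):

```python
import os

def unique_path(save_dir: str, fname: str) -> str:
    """Return save_dir/fname, appending _1, _2, ... if the name is already taken."""
    base, ext = os.path.splitext(fname)
    path = os.path.join(save_dir, fname)
    counter = 1
    while os.path.exists(path):
        path = os.path.join(save_dir, f"{base}_{counter}{ext}")
        counter += 1
    return path
```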
...
                counter += 1
        os.makedirs(save_dir, exist_ok=True)

+        # Check if we've already downloaded this file
+        if file_url in self.downloaded_files:
+            logger.info(f"File already downloaded: {file_url}")
+            return None
+
        try:
            # Special handling for Google Drive files
            if "drive.google.com" in file_url or "docs.google.com" in file_url:
...
                    logger.info(f"Attempting to download view-only file: {file_url}")
                    result_path = await self.force_download_viewonly(file_info, path)
                    if result_path:
+                        self.downloaded_files.add(file_url)
                        return result_path

                    # If that failed, try the regular download approach
...
                # Try regular download methods
                success = await self.download_from_google_drive(file_url, path)
                if success:
+                    self.downloaded_files.add(file_url)
                    return path

                # If all methods failed for Google Drive, try one last approach
                logger.warning("All standard methods failed, attempting force download")
                result_path = await self.force_download_viewonly(file_info, path)
+                if result_path:
+                    self.downloaded_files.add(file_url)
                return result_path if result_path else None

+            # Special handling for complex download URLs
+            if 'Action=downloadfile' in file_url or 'fname=' in file_url:
+                logger.info(f"Using browser download approach for complex URL: {file_url}")
+
+                # For these URLs, we'll need to navigate to the page and handle the download
+                await self.rotate_proxy_if_needed()
+
+                page = await self.context.new_page()
+                # Start listening for the download before navigating
+                download_promise = asyncio.ensure_future(page.wait_for_event("download"))
+                try:
+                    # Navigate to the URL (navigation may abort once a download starts)
+                    try:
+                        await page.goto(file_url, timeout=60000)
+                    except Exception:
+                        pass
+
+                    # Wait for the download to start
+                    try:
+                        download = await download_promise
+                        await download.save_as(path)
+
+                        if os.path.exists(path) and os.path.getsize(path) > 0:
+                            self.downloaded_files.add(file_url)
+                            return path
+                    except Exception as e:
+                        logger.error(f"Browser download failed: {e}")
+
+                        # If download didn't start automatically, try to find and click download buttons
+                        download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]')
+                        for button in download_buttons:
+                            try:
+                                await button.click()
+                                try:
+                                    download = await page.wait_for_event("download", timeout=10000)
+                                    await download.save_as(path)
+                                    if os.path.exists(path) and os.path.getsize(path) > 0:
+                                        self.downloaded_files.add(file_url)
+                                        return path
+                                except Exception:
+                                    pass
+                            except Exception:
+                                continue
+                finally:
+                    if not download_promise.done():
+                        download_promise.cancel()
+                    await page.close()
+
+                # If browser approach failed, try direct request as last resort
+                logger.info("Browser approach failed, trying direct request")
+
            # Rotate proxy if needed
            await self.rotate_proxy_if_needed()
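Playwright's Python API also exposes `page.expect_download()` as an async context manager, which registers the listener before the triggering action and removes the need to juggle a separate `wait_for_event` future. A condensed, self-contained sketch of the same capture-by-navigation idea using that API (function name, paths and timeouts here are illustrative, not part of app.py):

```python
import asyncio
import os
from playwright.async_api import async_playwright

async def capture_download(url, save_path):
    """Navigate to a URL that triggers a download and save the resulting file."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page(accept_downloads=True)
        try:
            async with page.expect_download(timeout=60000) as download_info:
                # Navigation that turns into a download may raise; the listener
                # registered by expect_download still receives the event.
                try:
                    await page.goto(url, timeout=60000)
                except Exception:
                    pass
            download = await download_info.value
            os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
            await download.save_as(save_path)
            return save_path if os.path.getsize(save_path) > 0 else None
        finally:
            await browser.close()

# Example: asyncio.run(capture_download("https://example.com/get?fname=paper.pdf", "./downloads/paper.pdf"))
```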
...
            # Verify file was downloaded correctly
            if os.path.exists(path) and os.path.getsize(path) > 0:
+                self.downloaded_files.add(file_url)
                return path
        except Exception as e:
            logger.warning(f"Direct download failed: {e}, trying browser approach")
...
                    content = await response.body()
                    with open(path, 'wb') as f:
                        f.write(content)
+                    if os.path.exists(path) and os.path.getsize(path) > 0:
+                        self.downloaded_files.add(file_url)
+                        return path
                else:
                    logger.error(f"Download failed with status {response.status}: {file_url}")
...
                await download.save_as(path)

                if os.path.exists(path) and os.path.getsize(path) > 0:
+                    self.downloaded_files.add(file_url)
                    return path
        except Exception as e:
            logger.error(f"Browser download manager approach failed: {e}")
...
        try:
            logger.info(f"Fetching sublinks from: {url}")

+            # Check if this is a direct download link
+            if is_download_link(url):
+                logger.info(f"URL appears to be a direct download link: {url}")
+                links.add(url)
+                return list(links)[:limit]
+
+            # Skip if we've already visited this URL
+            normalized_url = normalize_download_url(url)
+            if normalized_url in self.visited_urls:
+                logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}")
+                return list(links)[:limit]
+
+            # Add to visited URLs
+            self.visited_urls.add(normalized_url)
+
            # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
            if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
                                                       ["exam", "test", "pastpaper", "eduexp"]):
...
            await self.rotate_proxy_if_needed()

            # Standard sublink extraction for all sites
+            try:
+                await self.page.goto(url, timeout=30000, wait_until='networkidle')
+            except Exception as e:
+                logger.warning(f"Error navigating to URL for sublink extraction: {e}")
+                # Continue with what we have, we'll try to extract links anyway
+
            # Get base URL for resolving relative links
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
...
                    if href and not href.startswith('javascript:'):
                        links.add(href)

+            # Find all download links
+            download_links = await self.page.evaluate("""
+                () => {
+                    return Array.from(document.querySelectorAll('a[href]'))
+                        .filter(a => {
+                            const href = a.href.toLowerCase();
+                            return href.includes('download') ||
+                                   href.includes('file') ||
+                                   href.includes('get') ||
+                                   href.includes('view.php') ||
+                                   href.includes('action=') ||
+                                   href.includes('fname=');
+                        })
+                        .map(a => a.href);
+                }
+            """)
+
+            for download_link in download_links:
+                links.add(download_link)
+
+            # Also check for hidden links in JavaScript, iframes, or dynamic content
+            js_links = await self.discover_hidden_links(self.page)
+            for link in js_links:
+                links.add(link)
+
            logger.info(f"Found {len(links)} sublinks")
+
+            # Prioritize download links
+            prioritized_links = []
+            normal_links = []
+
+            for link in links:
+                if is_download_link(link):
+                    prioritized_links.append(link)
+                else:
+                    normal_links.append(link)
+
+            # Return prioritized links first, then normal links, up to the limit
+            result = prioritized_links + normal_links
+            return result[:limit]

        except Exception as e:
            logger.error(f"Error getting sublinks from {url}: {e}")
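`discover_hidden_links` is called above but defined outside this excerpt. A hedged sketch of the kind of scan it presumably performs, pulling URL-looking strings out of inline scripts, iframes, and lazy-load data attributes via `page.evaluate` (the selector set and regex are assumptions, not the actual implementation):

```python
async def discover_hidden_links(page):
    """Illustrative sketch: collect candidate URLs hidden in scripts, iframes and data-* attributes."""
    return await page.evaluate("""
        () => {
            const found = new Set();
            const urlPattern = /https?:\\/\\/[^\\s"'<>)]+/g;

            // URLs embedded in inline <script> bodies
            for (const script of document.querySelectorAll('script:not([src])')) {
                for (const match of script.textContent.matchAll(urlPattern)) {
                    found.add(match[0]);
                }
            }

            // iframe/embed sources and lazy-load data attributes
            for (const el of document.querySelectorAll('iframe[src], embed[src], [data-src], [data-url]')) {
                const value = el.getAttribute('src') || el.getAttribute('data-src') || el.getAttribute('data-url');
                if (value) found.add(value);
            }

            return Array.from(found);
        }
    """)
```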
...
            file_count_text = st.empty()

            try:
+                # Reset the visited URLs for a fresh deep search
+                self.visited_urls = set()
+
                progress_text.text("Analyzing main page...")
                # Special handling for ASP.NET pages
                is_aspnet = False
...
                except Exception:
                    pass

+                # Check if this URL is a direct download
+                if is_download_link(url):
+                    progress_text.text("URL appears to be a direct download. Analyzing...")
+
+                    # Try to extract file directly
+                    normalized_url = normalize_download_url(url)
+                    file_info = {
+                        'url': normalized_url,
+                        'download_url': normalized_url,
+                        'filename': os.path.basename(urlparse(normalized_url).path) or 'download',
+                        'size': 'Unknown Size',
+                        'metadata': {}
+                    }
+
+                    # Add to visited URLs
+                    self.visited_urls.add(normalized_url)
+                    progress_bar.progress(1.0)
+                    return [file_info]
+
                # Extract files from main page
                main_files = await self.extract_downloadable_files(url, custom_ext_list)
                initial_count = len(main_files)
...
                        progress_bar.progress(progress)

                        try:
+                            # Check if this is a direct download link
+                            if is_download_link(sublink):
+                                # For download links, just add the link directly
+                                normalized_url = normalize_download_url(sublink)
+
+                                # Skip if already visited
+                                if normalized_url in self.visited_urls:
+                                    continue
+
+                                # Mark as visited
+                                self.visited_urls.add(normalized_url)
+
+                                # Get file size if possible
+                                size_str = await self.get_file_size(normalized_url)
+
+                                # Get filename, with fallback to domain-based name
+                                filename = os.path.basename(urlparse(normalized_url).path)
+                                if not filename or filename == '/' or '?' in filename:
+                                    domain = get_domain(normalized_url)
+                                    ext = '.pdf'  # Default extension
+                                    for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']:
+                                        if common_ext in normalized_url.lower():
+                                            ext = common_ext
+                                            break
+                                    filename = f"file_from_{domain}{ext}"
+
+                                # Add file to results
+                                all_files.append({
+                                    'url': normalized_url,
+                                    'download_url': normalized_url,
+                                    'filename': filename,
+                                    'size': size_str,
+                                    'metadata': {}
+                                })
+                                file_count_text.text(f"Found {len(all_files)} total files")
+                                continue
+
+                            # For regular links, use a longer timeout for ASP.NET pages which can be slower
                            sub_timeout = timeout * 2 if is_aspnet else timeout

+                            # Skip already visited URLs
+                            if sublink in self.visited_urls:
+                                continue
+
                            # Extract files from sublink
                            sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                            all_files.extend(sub_files)
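The filename fallback above calls `get_domain`, another helper not shown in this excerpt. Something along these lines would satisfy the way it is used, turning the host into a filesystem-friendly token (sketch only):

```python
from urllib.parse import urlparse

def get_domain(url: str) -> str:
    """Return a filesystem-friendly version of the URL's host name."""
    netloc = urlparse(url).netloc or "unknown"
    # Drop the port and replace dots so the value can be embedded in a filename
    return netloc.split(':')[0].replace('.', '_')
```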
...
        if custom_ext_list != valid_ext_list:
            st.warning("Invalid extensions ignored. Use format like '.csv'.")

+        # Reset RAG engine for new search
+        st.session_state.rag_indexed = False
+        st.session_state.rag_engine = None
+
+        # Define a function to run the deep search
+        async def run_deep_search():
+            async with DownloadManager(
+                use_proxy=use_proxy,
+                proxy=proxy,
+                use_stealth=use_stealth
+            ) as dm:
+                files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
+                return files

        with st.spinner("Searching for files..."):
+            files = asyncio.run(run_deep_search())

        if files:
            st.session_state.discovered_files = files
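Each button handler above spins up its own event loop through `asyncio.run`, which works as long as Streamlit's script thread has no loop already running. If the call sites keep multiplying, a small wrapper keeps the pattern in one place and falls back to a helper thread when a loop is already active (a sketch, not part of app.py):

```python
import asyncio
import threading

def run_async(coro):
    """Run a coroutine to completion from synchronous Streamlit code."""
    try:
        return asyncio.run(coro)
    except RuntimeError:
        # asyncio.run refuses to start inside a running loop; push the work to a helper thread instead
        result = {}

        def _worker():
            result['value'] = asyncio.run(coro)

        thread = threading.Thread(target=_worker)
        thread.start()
        thread.join()
        return result.get('value')
```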
...
        if st.button("Search Files", key="rag_search_btn") and search_query:
            # Initialize RAG search engine
            if not st.session_state.rag_indexed:
+                rag_search = EnhancedRAGSearch()

                with st.spinner("Indexing files for search..."):
                    # First download files to extract text
...
                            for i, file_info in enumerate(files):
                                # Only process common text-based file formats
                                ext = os.path.splitext(file_info['filename'])[1].lower()
+                                if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']:
                                    path = await dm.download_file(file_info, temp_dir, url)
                                    if path:
                                        with open(path, 'rb') as f:
...
                    for result in search_results:
                        file_info = result['file_info']
                        score = result['score']
+                        match_type = result.get('match_type', 'document')
+
                        with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
                            st.write(f"Size: {file_info['size']}")
+                            st.write(f"Match type: {match_type}")
+
+                            # Show language if available
+                            if 'language' in result:
+                                st.write(f"Language: {result['language']}")
+
+                            # Show metadata if available
                            if 'metadata' in file_info and file_info['metadata']:
                                st.write("Metadata:")
                                for k, v in file_info['metadata'].items():
                                    if k != 'file_id':  # Skip technical details
                                        st.write(f"- {k}: {v}")

+                            # Show content preview for chunk matches
+                            if 'chunk_preview' in result:
+                                st.write("Content preview:")
+                                st.text(result['chunk_preview'])
+
                            # Add direct download button
                            if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
                                with st.spinner(f"Downloading {file_info['filename']}..."):
...
                    # Create expanders for each result
                    for i, url in enumerate(urls, 1):
                        with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
+                            st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}", on_click=set_deep_search_url, args=(url,))
                else:
                    st.warning("No search results found.")

        asyncio.run(run_search())
+    # Handle deep search - using on_click function to avoid state issues
+    if 'deep_search_url' in st.session_state and st.session_state.deep_search_url:
+        url = st.session_state.deep_search_url
+        st.info(f"Deep searching: {url}")
+
+        # Set up custom extensions
+        custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+        valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
+
+        # Reset RAG engine for new search
+        st.session_state.rag_indexed = False
+        st.session_state.rag_engine = None
+
+        # Run the deep search
+        async def run_bing_deep_search():
+            async with DownloadManager(
+                use_proxy=use_proxy,
+                proxy=proxy,
+                use_stealth=use_stealth
+            ) as dm:
+                files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
+                return files
+
+        with st.spinner("Searching for files..."):
+            files = asyncio.run(run_bing_deep_search())
+
+        if files:
+            st.session_state.discovered_files = files
+            st.session_state.current_url = url
+            st.success(f"Found {len(files)} files!")

+            # Show files with direct download options
+            download_dir = "./downloads"
+            os.makedirs(download_dir, exist_ok=True)

+            # Individual file display with direct download buttons
+            for i, file in enumerate(files):
+                col1, col2, col3 = st.columns([3, 1, 1])
+                with col1:
+                    filename = file['filename']
+                    size = file['size']
+                    meta = file.get('metadata', {})
+                    file_info = f"{filename} ({size})"
+                    if meta and 'Pages' in meta:
+                        file_info += f" - {meta.get('Pages', '')} pages"
+                    st.markdown(f"**{i+1}. {file_info}**")
+
+                with col2:
+                    # Add direct download button for each file
+                    if st.button(f"Download", key=f"direct_dl_bing_{i}"):
+                        with st.spinner(f"Downloading {filename}..."):
+                            async def download_single_file():
+                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                    path = await dm.download_file(file, download_dir, url)
+                                    return path
+
+                            downloaded_path = asyncio.run(download_single_file())
+                            if downloaded_path:
+                                with open(downloaded_path, "rb") as f:
+                                    file_data = f.read()

+                                st.download_button(
+                                    label=f"Save {filename}",
+                                    data=file_data,
+                                    file_name=filename,
+                                    mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
+                                    key=f"save_bing_file_{i}"
+                                )
+
+                with col3:
+                    # Add to selection for batch download
+                    if i in st.session_state.selected_files:
+                        if st.button("Unselect", key=f"bing_unselect_{i}"):
+                            st.session_state.selected_files.remove(i)
+                    else:
+                        if st.button("Select", key=f"bing_select_{i}"):
+                            st.session_state.selected_files.append(i)
+
+            # Add RAG Search interface for Bing results
+            st.markdown("### Search Within Discovered Files")
+            search_query = st.text_input("Enter search terms", key="bing_rag_search_query")
+
+            if st.button("Search Files", key="bing_rag_search_btn") and search_query:
+                # Initialize RAG search engine
+                if not st.session_state.rag_indexed:
+                    rag_search = EnhancedRAGSearch()

+                    with st.spinner("Indexing files for search..."):
+                        # First download files to extract text
+                        temp_dir = "./temp_downloads"
+                        os.makedirs(temp_dir, exist_ok=True)
+
+                        async def download_for_indexing():
+                            downloaded = 0
+                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                for i, file_info in enumerate(files):
+                                    # Only process common text-based file formats
+                                    ext = os.path.splitext(file_info['filename'])[1].lower()
+                                    if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']:
+                                        path = await dm.download_file(file_info, temp_dir, url)
+                                        if path:
+                                            with open(path, 'rb') as f:
+                                                file_data = f.read()
+
+                                            # Add to search index
+                                            if rag_search.add_file(file_data, file_info):
+                                                downloaded += 1
+
+                                            # Clean up
+                                            os.remove(path)
+                            return downloaded
+
+                        indexed_count = asyncio.run(download_for_indexing())
+                        if indexed_count > 0:
+                            rag_search.build_index()
+                            st.session_state.rag_engine = rag_search
+                            st.session_state.rag_indexed = True
+                            st.success(f"Indexed {indexed_count} files for search")
                        else:
+                            st.warning("Could not index any files. Try with more text-based documents.")
+
+                # Perform the search
+                if st.session_state.rag_indexed:
+                    search_results = st.session_state.rag_engine.search(search_query)
+
+                    if search_results:
+                        st.write(f"Found {len(search_results)} relevant files:")
+
+                        for result in search_results:
+                            file_info = result['file_info']
+                            score = result['score']
+                            match_type = result.get('match_type', 'document')
+
+                            with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
+                                st.write(f"Size: {file_info['size']}")
+                                st.write(f"Match type: {match_type}")
+
+                                # Show language if available
+                                if 'language' in result:
+                                    st.write(f"Language: {result['language']}")
+
+                                # Show metadata if available
+                                if 'metadata' in file_info and file_info['metadata']:
+                                    st.write("Metadata:")
+                                    for k, v in file_info['metadata'].items():
+                                        if k != 'file_id':  # Skip technical details
+                                            st.write(f"- {k}: {v}")
+
+                                # Show content preview for chunk matches
+                                if 'chunk_preview' in result:
+                                    st.write("Content preview:")
+                                    st.text(result['chunk_preview'])
+
+                                # Add direct download button
+                                if st.button(f"Download this file", key=f"bing_rag_dl_{result['rank']}"):
+                                    with st.spinner(f"Downloading {file_info['filename']}..."):
+                                        async def download_search_result():
+                                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                                path = await dm.download_file(file_info, download_dir, url)
+                                                return path
+
+                                        path = asyncio.run(download_search_result())
+                                        if path:
+                                            with open(path, "rb") as f:
+                                                file_data = f.read()
+
+                                            st.download_button(
+                                                label=f"Save {file_info['filename']}",
+                                                data=file_data,
+                                                file_name=file_info['filename'],
+                                                mime=mimetypes.guess_type(path)[0] or "application/octet-stream",
+                                                key=f"save_bing_rag_{result['rank']}"
+                                            )
+                    else:
+                        st.warning("No matching files found for your query.")
+        else:
+            st.warning("No files found.")
+
+        # Reset the deep search URL after processing
+        st.session_state.deep_search_url = None

    # Add a special section for direct Google Drive file download
    st.markdown("---")
...
    # Add footer with attribution
    st.markdown('---')
+    st.markdown('Created by [Euler314](https://github.com/euler314)')
+
+# Helper function for Bing search deep search URL setting
+def set_deep_search_url(url):
+    st.session_state.deep_search_url = url
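These branches read `st.session_state.rag_indexed`, `rag_engine`, `selected_files`, `discovered_files`, and `deep_search_url` before some of them are ever assigned, so an initialization block earlier in `main()` is assumed (it is not visible in this excerpt). A sketch of the kind of guard that keeps those reads from raising, with key names taken from the code above and defaults assumed:

```python
import streamlit as st

def init_session_state():
    """Give every session-state key used by the search UI a safe default."""
    defaults = {
        'discovered_files': [],
        'selected_files': [],
        'current_url': None,
        'deep_search_url': None,
        'rag_engine': None,
        'rag_indexed': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
```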
if __name__ == "__main__":
    main()