Update app.py
app.py
CHANGED
@@ -90,7 +90,7 @@ def load_models():

    # Load SentenceTransformer
    try:
-        semantic_model = SentenceTransformer('
    except Exception as e:
        st.error(f"Error loading SentenceTransformer: {e}")
        semantic_model = None
@@ -314,12 +314,14 @@ class DownloadManager:

        parsed_base = urlparse(final_url)
        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

        for a in soup.find_all('a', href=True):
            href = a['href'].strip()

            if '.php' in href.lower() or 'download' in href.lower():
-                full_url = href if href.startswith('http') else
                real_url = await self.extract_real_download_url(full_url)
                if real_url and real_url != full_url:
                    found_files.append({
@@ -331,7 +333,7 @@ class DownloadManager:
                continue

            if any(href.lower().endswith(ext) for ext in all_exts):
-                file_url = href if href.startswith('http') else
                size_str = await self.get_file_size(file_url)
                meta = {}
                if file_url.lower().endswith('.pdf'):
@@ -373,6 +375,41 @@ class DownloadManager:
                    }
                })

        seen_urls = set()
        unique_files = []
        for f in found_files:
@@ -882,67 +919,6 @@ class DownloadManager:
            logger.warning("Standard download methods failed")
            return False

-    async def get_google_drive_file_info(self, file_id):
-        """Get file type and view-only status from Google Drive"""
-        file_type = None
-        is_view_only = False
-
-        try:
-            async with self.context.new_page() as page:
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
-
-                # Check if view-only
-                view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
-                is_view_only = view_only_text is not None
-
-                # Check for Google Docs viewer
-                gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
-                gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
-                gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
-
-                if gdocs_viewer:
-                    file_type = 'docx'
-                elif gsheets_viewer:
-                    file_type = 'xlsx'
-                elif gslides_viewer:
-                    file_type = 'pptx'
-                else:
-                    # Check for PDF viewer
-                    pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
-                    if pdf_viewer:
-                        file_type = 'pdf'
-                    else:
-                        # Check for image viewer
-                        img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
-                        if img_viewer:
-                            # Get image type from src
-                            img_src = await img_viewer.get_attribute('src')
-                            if 'jpg' in img_src or 'jpeg' in img_src:
-                                file_type = 'jpg'
-                            elif 'png' in img_src:
-                                file_type = 'png'
-                            else:
-                                file_type = 'jpg'  # Default to jpg
-                        else:
-                            # Generic file type fallback
-                            file_type = 'pdf'  # Default to PDF
-
-                # If still no type, check filename
-                if not file_type:
-                    title_element = await page.query_selector('div[role="heading"]')
-                    if title_element:
-                        title = await title_element.text_content()
-                        if title:
-                            ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
-                            if ext_match:
-                                file_type = ext_match.group(1).lower()
-
-        except Exception as e:
-            logger.error(f"Error getting Google Drive file info: {e}")
-            file_type = 'pdf'  # Default to PDF if we can't determine
-
-        return file_type, is_view_only
-
    async def download_viewonly_pdf_with_js(self, file_id, save_path):
        """Download view-only PDF using JavaScript approach - improved version"""
        try:
@@ -954,245 +930,134 @@ class DownloadManager:
                view_url = f"https://drive.google.com/file/d/{file_id}/view"
                await page.goto(view_url, wait_until='networkidle', timeout=60000)

-                # Wait for rendering
                await page.wait_for_timeout(2000)

-                #
-                await page.evaluate("""
-                    async function injectLibraries() {
-                        // Add jsPDF
-                        return new Promise((resolve) => {
-                            const jspdfScript = document.createElement('script');
-                            jspdfScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
-                            jspdfScript.onload = () => resolve(true);
-                            document.head.appendChild(jspdfScript);
-                        });
-                    }
-                    return injectLibraries();
-                """)
-
-                # Wait for libraries to load
-                await page.wait_for_timeout(2000)
-
-                # Scroll through document to load all pages
                await page.evaluate("""
                    async function scrollThroughDocument() {
                        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                        const container = document.querySelector('.drive-viewer-paginated-scrollable');
                        if (!container) return false;

                        const scrollHeight = container.scrollHeight;
                        const viewportHeight = container.clientHeight;
-                        const scrollStep = viewportHeight / 2;

                            container.scrollTo(0, scrollPos);
-                            await delay(
                        }

                        // One final scroll to bottom to ensure everything is loaded
                        container.scrollTo(0, scrollHeight);
-                        await delay(

                        // Scroll back to top for PDF creation
                        container.scrollTo(0, 0);
-                        await delay(

                        return true;
                    }
                    return scrollThroughDocument();
                """)

-                #
-                await page.wait_for_timeout(2000)
-
-                # Use the improved PDF creation script that captures all pages
                pdf_base64 = await page.evaluate("""
                    async function createPDF() {
                        try {
-                            //
                            const { jsPDF } = window.jspdf;
-                            //
-                                img.src.startsWith('blob:') && img.width > 100 && img.height > 100
-                            );
-                            //
-                                if (i > 0) {
-                                    pdf.addPage();
-                                }
-
-                                // Create canvas and draw image
-                                const canvas = document.createElement('canvas');
-                                canvas.width = img.width;
-                                canvas.height = img.height;
-                                const ctx = canvas.getContext('2d');
-                                ctx.drawImage(img, 0, 0, img.width, img.height);
-
-                                // Add to PDF
-                                const imgData = canvas.toDataURL('image/jpeg', 0.95);
-
-                                // Calculate dimensions
-                                const pageWidth = pdf.internal.pageSize.getWidth();
-                                const pageHeight = pdf.internal.pageSize.getHeight();
-                                const imgRatio = img.height / img.width;
-
-                                let imgWidth = pageWidth - 10;
-                                let imgHeight = imgWidth * imgRatio;
-
-                                if (imgHeight > pageHeight - 10) {
-                                    imgHeight = pageHeight - 10;
-                                    imgWidth = imgHeight / imgRatio;
-                                }
-
-                                // Center on page
-                                const x = (pageWidth - imgWidth) / 2;
-                                const y = (pageHeight - imgHeight) / 2;
-
-                                pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
                            }
-                            } else {
-                                // Process each page
-                                const container = document.querySelector('.drive-viewer-paginated-scrollable');
-                                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
-
-                                    const pageImages = pages[i].querySelectorAll('img');
-                                    let targetImage = null;
-
-                                    for (const img of pageImages) {
-                                        if (img.src.startsWith('blob:') && img.width > 50 && img.height > 50) {
-                                            targetImage = img;
-                                            break;
-                                        }
-                                    }
-
-                                    if (!targetImage) {
-                                        // If no image found, try taking a screenshot of the page instead
-                                        const pageCanvas = document.createElement('canvas');
-                                        pageCanvas.width = pages[i].clientWidth;
-                                        pageCanvas.height = pages[i].clientHeight;
-                                        const ctx = pageCanvas.getContext('2d');
-
-                                        // Draw the page background
-                                        ctx.fillStyle = 'white';
-                                        ctx.fillRect(0, 0, pageCanvas.width, pageCanvas.height);
-
-                                        // Use html2canvas approach
-                                        try {
-                                            await delay(100);
-                                            // Just draw what we can see
-                                            const allElements = pages[i].querySelectorAll('*');
-                                            for (const el of allElements) {
-                                                if (el.tagName === 'IMG' && el.complete && el.src) {
-                                                    const rect = el.getBoundingClientRect();
-                                                    try {
-                                                        ctx.drawImage(el, rect.left, rect.top, rect.width, rect.height);
-                                                    } catch (e) {
-                                                        console.error('Draw error:', e);
-                                                    }
-                                                }
-                                            }
-                                        } catch (e) {
-                                            console.error('Canvas error:', e);
-                                        }
-
-                                        // Add the canvas to the PDF
-                                        const imgData = pageCanvas.toDataURL('image/jpeg', 0.95);
-
-                                        // Calculate dimensions
-                                        const pageWidth = pdf.internal.pageSize.getWidth();
-                                        const pageHeight = pdf.internal.pageSize.getHeight();
-                                        const imgRatio = pageCanvas.height / pageCanvas.width;
-
-                                        let imgWidth = pageWidth - 10;
-                                        let imgHeight = imgWidth * imgRatio;
-
-                                        if (imgHeight > pageHeight - 10) {
-                                            imgHeight = pageHeight - 10;
-                                            imgWidth = imgHeight / imgRatio;
-                                        }
-
-                                        // Center on page
-                                        const x = (pageWidth - imgWidth) / 2;
-                                        const y = (pageHeight - imgHeight) / 2;
-
-                                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
-                                    } else {
-                                        // Use the found image
-                                        const canvas = document.createElement('canvas');
-                                        canvas.width = targetImage.naturalWidth || targetImage.width;
-                                        canvas.height = targetImage.naturalHeight || targetImage.height;
-                                        const ctx = canvas.getContext('2d');
-
-                                        // Draw image to canvas
-                                        try {
-                                            ctx.drawImage(targetImage, 0, 0, canvas.width, canvas.height);
-                                        } catch (e) {
-                                            console.error('Error drawing image:', e);
-                                            continue;
-                                        }
-
-                                        // Add to PDF
-                                        const imgData = canvas.toDataURL('image/jpeg', 0.95);
-
-                                        // Calculate dimensions
-                                        const pageWidth = pdf.internal.pageSize.getWidth();
-                                        const pageHeight = pdf.internal.pageSize.getHeight();
-                                        const imgRatio = canvas.height / canvas.width;
-
-                                        let imgWidth = pageWidth - 10;
-                                        let imgHeight = imgWidth * imgRatio;
-
-                                        if (imgHeight > pageHeight - 10) {
-                                            imgHeight = pageHeight - 10;
-                                            imgWidth = imgHeight / imgRatio;
-                                        }
-
-                                        // Center on page
-                                        const x = (pageWidth - imgWidth) / 2;
-                                        const y = (pageHeight - imgHeight) / 2;
-
-                                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
-                                    }
                            }
                        }

                        // Return as base64
                        return pdf.output('datauristring');
                    } catch (e) {
-                        console.error(
                        return null;
                    }
                }
@@ -1200,7 +1065,6 @@ class DownloadManager:
                """)

                if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
-                    # If script method failed, try screenshot approach
                    logger.warning("PDF creation script failed, trying fallback method")
                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')

@@ -1222,7 +1086,7 @@ class DownloadManager:
                except Exception as e:
                    logger.error(f"Error saving PDF: {e}")
                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
-
        except Exception as e:
            logger.error(f"Error in view-only PDF download: {e}")
            # Try fallback method
@@ -1409,72 +1273,365 @@ class DownloadManager:
            logger.error(f"Error exporting Google Doc: {e}")
            return False

    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
        if not custom_ext_list:
            custom_ext_list = []
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
        try:
            progress_text.text("Analyzing main page...")
            main_files = await self.extract_downloadable_files(url, custom_ext_list)
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")
            progress_text.text("Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")
            if not sublinks:
                progress_bar.progress(1.0)
                return main_files
            all_files = main_files
            for i, sublink in enumerate(sublinks, 1):
                progress = i / total_links
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(progress)
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)
            final_count = len(unique_files)
            progress_text.text(f"Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)
            return unique_files
        except Exception as e:
            logger.error(f"Deep search error: {e}")
            progress_text.text(f"Error during deep search: {str(e)}")
            return []
        finally:
            await asyncio.sleep(2)
            if not st.session_state.get('keep_progress', False):
                progress_text.empty()
                progress_bar.empty()
-
-    async def get_sublinks(self, url, limit=10000):
-        try:
-            await self.page.goto(url, timeout=30000)
-            content = await self.page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-            parsed_base = urlparse(url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-            links = set()
-            for a in soup.find_all('a', href=True):
-                href = a['href'].strip()
-                if href.startswith('http'):
-                    links.add(href)
-                elif href.startswith('/'):
-                    links.add(f"{base_url}{href}")
-            return list(links)[:limit]
-        except Exception as e:
-            logger.error(f"Error getting sublinks: {e}")
-            return []
-
# Utility Functions for New Features
def extract_keywords(text, n=5):
    doc = nlp_model(text)

    # Load SentenceTransformer
    try:
+        semantic_model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat')
    except Exception as e:
        st.error(f"Error loading SentenceTransformer: {e}")
        semantic_model = None
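Note on this change: 'Qwen/Qwen1.5-0.5B-Chat' is a chat LLM checkpoint rather than a dedicated sentence-embedding model, so sentence-transformers will typically wrap it with mean pooling when loading it this way. The downstream calls are not part of this diff; the snippet below is only an illustrative sketch of how a loaded SentenceTransformer is usually used (query and candidate strings are made up, not from app.py).

# Hypothetical usage sketch -- not code from this commit.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat')  # model ID taken from the change above
query_vec = model.encode("annual report 2023", convert_to_tensor=True)
doc_vecs = model.encode(["Budget.pdf", "Meeting minutes.docx"], convert_to_tensor=True)
scores = util.cos_sim(query_vec, doc_vecs)  # cosine similarity between query and candidates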

        parsed_base = urlparse(final_url)
        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+        path_base = os.path.dirname(parsed_base.path)

+        # Process all anchor tags
        for a in soup.find_all('a', href=True):
            href = a['href'].strip()

            if '.php' in href.lower() or 'download' in href.lower():
+                full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                real_url = await self.extract_real_download_url(full_url)
                if real_url and real_url != full_url:
                    found_files.append({

                continue

            if any(href.lower().endswith(ext) for ext in all_exts):
+                file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                size_str = await self.get_file_size(file_url)
                meta = {}
                if file_url.lower().endswith('.pdf'):

                    }
                })

+        # Also check for files in other elements (iframe, embed, object, etc.)
+        other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
+        for elem in other_elements:
+            src = elem.get('src') or elem.get('data')
+            if src and any(src.lower().endswith(ext) for ext in all_exts):
+                file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
+                size_str = await self.get_file_size(file_url)
+                meta = {}
+                if file_url.lower().endswith('.pdf'):
+                    meta = await self.get_pdf_metadata(file_url)
+                found_files.append({
+                    'url': file_url,
+                    'filename': os.path.basename(file_url.split('?')[0]),
+                    'size': size_str,
+                    'metadata': meta
+                })
+
+        # Check for file links in onclick attributes
+        onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
+        for elem in onclick_elements:
+            onclick = await elem.get_attribute('onclick')
+            urls = re.findall(r'(https?://[^\'"]+)', onclick)
+            for url_match in urls:
+                if any(url_match.lower().endswith(ext) for ext in all_exts):
+                    size_str = await self.get_file_size(url_match)
+                    meta = {}
+                    if url_match.lower().endswith('.pdf'):
+                        meta = await self.get_pdf_metadata(url_match)
+                    found_files.append({
+                        'url': url_match,
+                        'filename': os.path.basename(url_match.split('?')[0]),
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
        seen_urls = set()
        unique_files = []
        for f in found_files:

            logger.warning("Standard download methods failed")
            return False

    async def download_viewonly_pdf_with_js(self, file_id, save_path):
        """Download view-only PDF using JavaScript approach - improved version"""
        try:

                view_url = f"https://drive.google.com/file/d/{file_id}/view"
                await page.goto(view_url, wait_until='networkidle', timeout=60000)

+                # Wait for initial rendering
                await page.wait_for_timeout(2000)

+                # CRITICAL: Scroll through entire document to ensure all content is cached
                await page.evaluate("""
                    async function scrollThroughDocument() {
                        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                        const container = document.querySelector('.drive-viewer-paginated-scrollable');
                        if (!container) return false;

+                        // Get total scroll height
                        const scrollHeight = container.scrollHeight;
                        const viewportHeight = container.clientHeight;

+                        // Scroll down in increments to load all pages
+                        for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += viewportHeight) {
                            container.scrollTo(0, scrollPos);
+                            await delay(800); // Wait for content to load
                        }

                        // One final scroll to bottom to ensure everything is loaded
                        container.scrollTo(0, scrollHeight);
+                        await delay(1500);

                        // Scroll back to top for PDF creation
                        container.scrollTo(0, 0);
+                        await delay(800);

                        return true;
                    }
                    return scrollThroughDocument();
                """)

+                # Use simplified script similar to the one provided
                pdf_base64 = await page.evaluate("""
                    async function createPDF() {
                        try {
+                            // Create jsPDF script element
+                            const loadJsPDF = () => new Promise((resolve, reject) => {
+                                let jspdf = document.createElement("script");
+                                jspdf.onload = () => resolve();
+                                jspdf.onerror = () => reject(new Error("Failed to load jsPDF"));
+                                jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
+                                document.body.appendChild(jspdf);
+                            });

+                            await loadJsPDF();
+
+                            // Create PDF
                            const { jsPDF } = window.jspdf;
+                            let pdf = new jsPDF();
+                            let elements = document.getElementsByTagName("img");
+                            let pageCount = 0;

+                            // First pass to find and sort all valid page images
+                            let pageImages = [];
+                            for (let i = 0; i < elements.length; i++) {
+                                let img = elements[i];
+                                // Only process blob images (these are the PDF pages)
+                                if (!/^blob:/.test(img.src)) continue;
+
+                                // Skip tiny images (usually icons, not content)
+                                if (img.width < 100 || img.height < 100) continue;
+
+                                pageImages.push(img);
+                            }
+
+                            // Sort images by their position if possible
+                            try {
+                                pageImages.sort((a, b) => {
+                                    const rectA = a.getBoundingClientRect();
+                                    const rectB = b.getBoundingClientRect();
+                                    return rectA.top - rectB.top;
+                                });
+                            } catch (e) {
+                                console.error("Error sorting images:", e);
+                            }

+                            // Process each image as a page
+                            for (let i = 0; i < pageImages.length; i++) {
+                                let img = pageImages[i];

+                                // Create canvas to draw the image
+                                let canvasElement = document.createElement('canvas');
+                                let con = canvasElement.getContext("2d");
+                                canvasElement.width = img.width;
+                                canvasElement.height = img.height;

+                                // Draw image to canvas
+                                con.drawImage(img, 0, 0, img.width, img.height);
+
+                                // Add image to PDF
+                                let imgData = canvasElement.toDataURL("image/jpeg", 0.95);

+                                // Add a new page for each page after the first
+                                if (pageCount > 0) {
+                                    pdf.addPage();
                                }

+                                // Calculate dimensions to fit the page
+                                const pageWidth = pdf.internal.pageSize.getWidth();
+                                const pageHeight = pdf.internal.pageSize.getHeight();
+                                const imgRatio = img.height / img.width;
+
+                                let imgWidth = pageWidth;
+                                let imgHeight = imgWidth * imgRatio;
+
+                                if (imgHeight > pageHeight) {
+                                    imgHeight = pageHeight;
+                                    imgWidth = imgHeight / imgRatio;
                                }
+
+                                // Center on page
+                                const x = (pageWidth - imgWidth) / 2;
+                                const y = (pageHeight - imgHeight) / 2;
+
+                                pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
+                                pageCount++;
+                            }
+
+                            if (pageCount === 0) {
+                                return null; // No pages found
                            }

                            // Return as base64
                            return pdf.output('datauristring');
                        } catch (e) {
+                            console.error("PDF creation error:", e);
                            return null;
                        }
                    }

                """)

                if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
                    logger.warning("PDF creation script failed, trying fallback method")
                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
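The code that decodes the returned data URI and writes the PDF to disk falls between these hunks and is not shown in the diff. A minimal sketch of how such a jsPDF data URI is typically saved from Python (not the file's actual code) would be:

# Sketch only -- the decode-and-save step is elided from this diff.
import base64

def save_pdf_datauri(pdf_base64: str, save_path: str) -> bool:
    if not pdf_base64 or not pdf_base64.startswith('data:application/pdf'):
        return False
    # Everything after the first comma is the base64 payload of the data URI.
    payload = pdf_base64.split(',', 1)[1]
    with open(save_path, 'wb') as f:
        f.write(base64.b64decode(payload))
    return True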

                except Exception as e:
                    logger.error(f"Error saving PDF: {e}")
                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
+
        except Exception as e:
            logger.error(f"Error in view-only PDF download: {e}")
            # Try fallback method

            logger.error(f"Error exporting Google Doc: {e}")
            return False

+    async def get_google_drive_file_info(self, file_id):
+        """Get file type and view-only status from Google Drive"""
+        file_type = None
+        is_view_only = False
+
+        try:
+            async with self.context.new_page() as page:
+                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
+
+                # Check if view-only
+                view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
+                is_view_only = view_only_text is not None
+
+                # Check for Google Docs viewer
+                gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
+                gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
+                gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
+
+                if gdocs_viewer:
+                    file_type = 'docx'
+                elif gsheets_viewer:
+                    file_type = 'xlsx'
+                elif gslides_viewer:
+                    file_type = 'pptx'
+                else:
+                    # Check for PDF viewer
+                    pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
+                    if pdf_viewer:
+                        file_type = 'pdf'
+                    else:
+                        # Check for image viewer
+                        img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
+                        if img_viewer:
+                            # Get image type from src
+                            img_src = await img_viewer.get_attribute('src')
+                            if 'jpg' in img_src or 'jpeg' in img_src:
+                                file_type = 'jpg'
+                            elif 'png' in img_src:
+                                file_type = 'png'
+                            else:
+                                file_type = 'jpg'  # Default to jpg
+                        else:
+                            # Generic file type fallback
+                            file_type = 'pdf'  # Default to PDF
+
+                # If still no type, check filename
+                if not file_type:
+                    title_element = await page.query_selector('div[role="heading"]')
+                    if title_element:
+                        title = await title_element.text_content()
+                        if title:
+                            ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
+                            if ext_match:
+                                file_type = ext_match.group(1).lower()
+
+        except Exception as e:
+            logger.error(f"Error getting Google Drive file info: {e}")
+            file_type = 'pdf'  # Default to PDF if we can't determine
+
+        return file_type, is_view_only
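How the caller consumes the (file_type, is_view_only) tuple is not part of this diff. A hypothetical dispatch helper, using only method names that do appear elsewhere in the commit, might look like this (a sketch under that assumption, not the app's actual flow):

# Hypothetical caller sketch -- dm stands for a DownloadManager instance.
import os

async def download_drive_file(dm, file_id, save_dir):
    file_type, is_view_only = await dm.get_google_drive_file_info(file_id)
    save_path = os.path.join(save_dir, f"{file_id}.{file_type or 'pdf'}")
    if is_view_only and file_type == 'pdf':
        ok = await dm.download_viewonly_pdf_with_js(file_id, save_path)   # browser-side jsPDF route
    elif is_view_only:
        ok = await dm.download_viewonly_with_screenshots(file_id, save_path, file_type)
    else:
        ok = False  # non-view-only files would use the standard download path (not shown here)
    return save_path if ok else None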
+
+    async def get_sublinks(self, url, limit=10000):
+        """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
+        links = set()
+        try:
+            logger.info(f"Fetching sublinks from: {url}")
+
+            # Go to page and wait for full load
+            await self.page.goto(url, timeout=30000, wait_until='networkidle')
+
+            # Get base URL for resolving relative links
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+            path_base = os.path.dirname(parsed_base.path)
+
+            # Check if page has ASP.NET elements which might need special handling
+            is_aspnet = await self.page.evaluate('''
+                () => {
+                    return document.querySelector('form#aspnetForm') !== null ||
+                           document.querySelector('input[name="__VIEWSTATE"]') !== null;
+                }
+            ''')
+
+            if is_aspnet:
+                logger.info("Detected ASP.NET page, using enhanced extraction method")
+
+                # Try to interact with ASP.NET controls that might reveal more links
+                # Look for dropdowns, buttons, and grid elements
+                dropdowns = await self.page.query_selector_all('select')
+                buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
+
+                # Try interacting with dropdowns first
+                for dropdown in dropdowns:
+                    try:
+                        # Get all options
+                        options = await self.page.evaluate('''
+                            (dropdown) => {
+                                return Array.from(dropdown.options).map(o => o.value);
+                            }
+                        ''', dropdown)
+
+                        # Try selecting each option
+                        for option in options:
+                            if option:
+                                await dropdown.select_option(value=option)
+                                await self.page.wait_for_timeout(1000)
+                                await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                                # Extract any new links that appeared
+                                await self.extract_all_link_types(links, base_url, path_base)
+                    except Exception as e:
+                        logger.warning(f"Error interacting with dropdown: {e}")
+
+                # Try clicking buttons (but avoid dangerous ones like "delete")
+                safe_buttons = []
+                for button in buttons:
+                    button_text = await button.text_content() or ""
+                    button_value = await button.get_attribute("value") or ""
+                    button_id = await button.get_attribute("id") or ""
+                    combined_text = (button_text + button_value + button_id).lower()
+
+                    # Skip potentially destructive buttons
+                    if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
+                        continue
+
+                    # Prioritize buttons that might show more content
+                    if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
+                        safe_buttons.append(button)
+
+                # Click the safe buttons
+                for button in safe_buttons[:5]:  # Limit to first 5 to avoid too many clicks
+                    try:
+                        await button.click()
+                        await self.page.wait_for_timeout(1000)
+                        await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                        # Extract any new links that appeared
+                        await self.extract_all_link_types(links, base_url, path_base)
+                    except Exception as e:
+                        logger.warning(f"Error clicking button: {e}")
+
+            # Extract links from the initial page state
+            await self.extract_all_link_types(links, base_url, path_base)
+
+            # Look specifically for links inside grid/table views which are common in ASP.NET applications
+            grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
+            for cell in grid_cells:
+                try:
+                    href = await cell.get_attribute('href')
+                    if href:
+                        full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+                        links.add(full_url)
+                except Exception as e:
+                    logger.warning(f"Error extracting grid link: {e}")
+
+            # Extract links from onclick attributes and javascript:__doPostBack calls
+            postback_links = await self.page.evaluate('''
+                () => {
+                    const results = [];
+                    // Find elements with onclick containing __doPostBack
+                    const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
+                    for (const el of elements) {
+                        // Extract the postback target
+                        const onclick = el.getAttribute('onclick') || '';
+                        const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
+                        if (match && match[1]) {
+                            // Get the visible text to use as description
+                            const text = el.innerText || el.textContent || 'Link';
+                            results.push({
+                                id: match[1],
+                                text: text.trim()
+                            });
+                        }
+                    }
+                    return results;
+                }
+            ''')
+
+            # Try interacting with some of the postback links
+            for postback in postback_links[:10]:  # Limit to first 10 to avoid too many interactions
+                try:
+                    logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
+                    await self.page.evaluate(f'''
+                        () => {{
+                            if (typeof __doPostBack === 'function') {{
+                                __doPostBack('{postback["id"]}', '');
+                            }}
+                        }}
+                    ''')
+                    await self.page.wait_for_timeout(1500)
+                    await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                    # Extract any new links that appeared
+                    await self.extract_all_link_types(links, base_url, path_base)
+                except Exception as e:
+                    logger.warning(f"Error with postback: {e}")
+
+            logger.info(f"Found {len(links)} sublinks")
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks from {url}: {e}")
+            return list(links)[:limit]  # Return what we have so far
+
+    async def extract_all_link_types(self, links_set, base_url, path_base):
+        """Extract all types of links from the current page"""
+        # Get all <a> tag links
+        a_links = await self.page.query_selector_all('a[href]')
+        for a in a_links:
+            try:
+                href = await a.get_attribute('href')
+                if href and not href.startswith('javascript:') and not href.startswith('#'):
+                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+                    links_set.add(full_url)
+            except Exception:
+                pass
+
+        # Get iframe sources
+        iframes = await self.page.query_selector_all('iframe[src]')
+        for iframe in iframes:
+            try:
+                src = await iframe.get_attribute('src')
+                if src and not src.startswith('javascript:') and not src.startswith('about:'):
+                    full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
+                    links_set.add(full_url)
+            except Exception:
+                pass
+
+        # Get links from onclick attributes that reference URLs
+        onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
+        for el in onclick_elements:
+            try:
+                onclick = await el.get_attribute('onclick')
+                urls = re.findall(r'(https?://[^\'"]+)', onclick)
+                for url in urls:
+                    links_set.add(url)
+            except Exception:
+                pass
+
+        # Look for URLs in data-* attributes
+        data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
+        for el in data_elements:
+            for attr in ['data-url', 'data-href', 'data-src']:
+                try:
+                    value = await el.get_attribute(attr)
+                    if value and not value.startswith('javascript:'):
+                        full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
+                        links_set.add(full_url)
+                except Exception:
+                    pass
+
+        # Look for special anchor links that might not have href attributes
+        special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
+        for anchor in special_anchors:
+            try:
+                href = await anchor.get_attribute('href')
+                if href and not href.startswith('javascript:') and not href.startswith('#'):
+                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+                    links_set.add(full_url)
+            except Exception:
+                pass
+
+    def resolve_relative_url(self, relative_url, base_url, path_base):
+        """Properly resolve relative URLs considering multiple formats"""
+        if relative_url.startswith('/'):
+            # Absolute path relative to domain
+            return f"{base_url}{relative_url}"
+        elif relative_url.startswith('./'):
+            # Explicit relative path
+            return f"{base_url}{path_base}/{relative_url[2:]}"
+        elif relative_url.startswith('../'):
+            # Parent directory
+            parent_path = '/'.join(path_base.split('/')[:-1])
+            return f"{base_url}{parent_path}/{relative_url[3:]}"
+        elif relative_url.startswith('http'):
+            return relative_url
+        else:
+            # Regular relative path
+            return f"{base_url}{path_base}/{relative_url}"
+
    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
        if not custom_ext_list:
            custom_ext_list = []
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
+
        try:
            progress_text.text("Analyzing main page...")
+            # Special handling for ASP.NET pages
+            is_aspnet = False
+            try:
+                await self.page.goto(url, timeout=30000, wait_until='networkidle')
+                is_aspnet = await self.page.evaluate('''
+                    () => {
+                        return document.querySelector('form#aspnetForm') !== null ||
+                               document.querySelector('input[name="__VIEWSTATE"]') !== null;
+                    }
+                ''')
+            except Exception:
+                pass
+
+            # Extract files from main page
            main_files = await self.extract_downloadable_files(url, custom_ext_list)
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")
+
+            # Get sublinks with enhanced method
            progress_text.text("Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")
+
            if not sublinks:
                progress_bar.progress(1.0)
                return main_files
+
+            # Process each sublink
            all_files = main_files
            for i, sublink in enumerate(sublinks, 1):
                progress = i / total_links
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(progress)
+
+                try:
+                    # Use a longer timeout for ASP.NET pages which can be slower
+                    sub_timeout = timeout * 2 if is_aspnet else timeout
+
+                    # Extract files from sublink with appropriate timeout
+                    async with async_timeout(sub_timeout):
+                        sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                        all_files.extend(sub_files)
+                        file_count_text.text(f"Found {len(all_files)} total files")
+                except Exception as e:
+                    logger.warning(f"Error processing sublink {sublink}: {e}")
+
+            # Deduplicate files
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)
+
            final_count = len(unique_files)
            progress_text.text(f"Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)
            return unique_files
+
        except Exception as e:
            logger.error(f"Deep search error: {e}")
            progress_text.text(f"Error during deep search: {str(e)}")
            return []
+
        finally:
            await asyncio.sleep(2)
            if not st.session_state.get('keep_progress', False):
                progress_text.empty()
                progress_bar.empty()
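For orientation, a minimal driver for the updated deep_search might look like the sketch below. It assumes DownloadManager is usable as an async context manager and that async_timeout refers to an asyncio-compatible timeout helper imported elsewhere in app.py; neither of those pieces is shown in this diff, and the arguments are illustrative only.

# Usage sketch under the stated assumptions -- not code from this commit.
import asyncio

async def run_deep_search(url: str):
    async with DownloadManager() as dm:          # assumed async context manager
        files = await dm.deep_search(
            url,
            custom_ext_list=['.pdf', '.docx'],   # illustrative extra extensions
            sublink_limit=100,
            timeout=60,
        )
    return files

# files = asyncio.run(run_deep_search("https://example.com/downloads"))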

# Utility Functions for New Features
def extract_keywords(text, n=5):
    doc = nlp_model(text)