euler314 committed
Commit 5a2226b (verified) · Parent(s): ea2ebb1

Update app.py

Files changed (1): app.py (+153, -331)
app.py CHANGED
@@ -217,7 +217,6 @@ class DownloadManager:
         self.browser = None
         self.context = None
         self.page = None
-        self.base_domains = set()  # Store base domains and their variations
 
     async def __aenter__(self):
         self.playwright = await async_playwright().start()
@@ -251,63 +250,6 @@ class DownloadManager:
         if self.playwright:
             await self.playwright.stop()
 
-    def get_base_domain(self, url):
-        """Extract base domain and add variations to self.base_domains"""
-        parsed = urlparse(url)
-        domain = parsed.netloc.split(':')[0]  # Remove port if present
-
-        # Add the main domain and possible variations
-        base_parts = domain.split('.')
-        if len(base_parts) > 2:
-            main_domain = '.'.join(base_parts[-2:])
-            self.base_domains.add(main_domain)
-            # Add variations like files.domain.com for domain.com
-            self.base_domains.add(domain)
-
-            # Handle www and non-www versions
-            if base_parts[0] == 'www':
-                self.base_domains.add('.'.join(base_parts[1:]))
-            else:
-                self.base_domains.add(f"www.{domain}")
-        else:
-            self.base_domains.add(domain)
-
-        return domain
-
-    def is_related_domain(self, url):
-        """Check if URL belongs to any of the known domain variations"""
-        parsed = urlparse(url)
-        domain = parsed.netloc.split(':')[0]
-
-        # Check if this domain or any of its parts match our base domains
-        parts = domain.split('.')
-        for i in range(len(parts) - 1):
-            check_domain = '.'.join(parts[i:])
-            if check_domain in self.base_domains:
-                return True
-        return False
-
-    async def get_real_url(self, url):
-        """Follow redirects and get the final URL"""
-        try:
-            async with self.context.new_page() as page:
-                response = await page.goto(url, wait_until='networkidle', timeout=30000)
-                final_url = page.url
-
-                # Check for meta refresh redirects
-                content = await page.content()
-                soup = BeautifulSoup(content, 'html.parser')
-                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
-                if meta_refresh:
-                    content = meta_refresh.get('content', '')
-                    if 'url=' in content.lower():
-                        final_url = content.split('url=')[-1].strip("'").strip('"')
-
-                return final_url, response.headers if response else {}
-        except Exception as e:
-            logger.error(f"Error getting real URL for {url}: {e}")
-            return url, {}
-
     async def get_file_size(self, url):
         try:
             async with self.context.new_page() as page:
@@ -338,32 +280,40 @@ class DownloadManager:
         except Exception:
             return {}
 
+    async def extract_real_download_url(self, url):
+        try:
+            async with self.context.new_page() as page:
+                response = await page.goto(url, wait_until='networkidle', timeout=30000)
+                if response and response.headers.get('location'):
+                    return response.headers['location']
+                return page.url
+        except Exception as e:
+            logger.error(f"Error extracting real download URL: {e}")
+            return url
+
     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
-            # Follow redirects and get the final URL
-            final_url, headers = await self.get_real_url(url)
-
-            # Add this domain to our known domains
-            self.get_base_domain(final_url)
-
-            # Check if the URL itself is a file
-            content_type = headers.get('content-type', '').lower()
-            if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
-                found_files.append({
-                    'url': final_url,
-                    'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
-                    'size': await self.get_file_size(final_url),
-                    'metadata': {}
-                })
-                return found_files
-
-            # Load the page
-            await self.page.goto(final_url, timeout=30000, wait_until='networkidle')
+            response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
+            if not response:
+                return []
+
+            final_url = self.page.url
+            if '.php' in final_url or 'download' in final_url:
+                real_url = await self.extract_real_download_url(final_url)
+                if real_url != final_url:
+                    found_files.append({
+                        'url': real_url,
+                        'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                        'size': await self.get_file_size(real_url),
+                        'metadata': {}
+                    })
+                return found_files
+
+            await self.page.wait_for_load_state('networkidle', timeout=30000)
             content = await self.page.content()
             soup = BeautifulSoup(content, 'html.parser')
 
-            # Define extensions to look for
             default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
                             '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
             all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
@@ -371,73 +321,70 @@ class DownloadManager:
             parsed_base = urlparse(final_url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
-            # Find all links including those in scripts and other elements
-            links = set()
-            # Regular links
             for a in soup.find_all('a', href=True):
-                links.add(a['href'])
-            # Script-embedded links
-            scripts = soup.find_all('script')
-            for script in scripts:
-                if script.string:
-                    urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
-                    links.update(urls)
-
-            for href in links:
-                href = href.strip()
-
-                # Skip empty or javascript links
-                if not href or href.startswith(('javascript:', '#', 'mailto:')):
-                    continue
+                href = a['href'].strip()
 
-                # Handle both direct file links and PHP/script downloads
-                if '.php' in href.lower() or 'download' in href.lower() or 'visit' in href.lower():
-                    try:
-                        # Convert to absolute URL if needed
-                        if not href.startswith(('http://', 'https://')):
-                            if href.startswith('/'):
-                                href = base_url + href
-                            else:
-                                href = base_url + '/' + href
-
-                        # Follow the link to get the real file
-                        real_url, real_headers = await self.get_real_url(href)
-
-                        # Check if it leads to a file
-                        content_type = real_headers.get('content-type', '').lower()
-                        if any(ext in content_type for ext in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
-                            found_files.append({
-                                'url': real_url,
-                                'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
-                                'size': await self.get_file_size(real_url),
-                                'metadata': {}
-                            })
-                    except Exception as e:
-                        logger.error(f"Error processing PHP/script link {href}: {e}")
+                # Handle PHP scripts and redirects
+                if '.php' in href.lower() or 'download' in href.lower():
+                    full_url = href if href.startswith('http') else (
+                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
+                    )
+                    real_url = await self.extract_real_download_url(full_url)
+                    if real_url and real_url != full_url:
+                        found_files.append({
+                            'url': real_url,
+                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                            'size': await self.get_file_size(real_url),
+                            'metadata': {}
+                        })
                     continue
-
+
                 # Handle direct file links
-                elif any(href.lower().endswith(ext) for ext in all_exts):
-                    # Convert to absolute URL if needed
-                    if not href.startswith(('http://', 'https://')):
-                        if href.startswith('/'):
-                            href = base_url + href
-                        else:
-                            href = base_url + '/' + href
+                if any(href.lower().endswith(ext) for ext in all_exts):
+                    file_url = href if href.startswith('http') else (
+                        f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
+                    )
 
-                    # Verify if it's from a related domain
-                    if self.is_related_domain(href):
-                        size_str = await self.get_file_size(href)
-                        meta = {}
-                        if href.lower().endswith('.pdf'):
-                            meta = await self.get_pdf_metadata(href)
-
-                        found_files.append({
-                            'url': href,
-                            'filename': os.path.basename(href.split('?')[0]),
-                            'size': size_str,
-                            'metadata': meta
-                        })
+                    size_str = await self.get_file_size(file_url)
+                    meta = {}
+                    if file_url.lower().endswith('.pdf'):
+                        meta = await self.get_pdf_metadata(file_url)
+
+                    found_files.append({
+                        'url': file_url,
+                        'filename': os.path.basename(file_url.split('?')[0]),
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
+                # Handle Google Drive links
+                elif ("drive.google.com" in href) or ("docs.google.com" in href):
+                    file_id = None
+                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+                        match = re.search(pattern, href)
+                        if match:
+                            file_id = match.group(1)
+                            break
+
+                    if file_id:
+                        direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+                        filename = file_id
+                        try:
+                            response = await self.page.request.head(direct_url, timeout=15000)
+                            cd = response.headers.get("Content-Disposition", "")
+                            if cd:
+                                mt = re.search(r'filename\*?="?([^";]+)', cd)
+                                if mt:
+                                    filename = mt.group(1).strip('"').strip()
+
+                            found_files.append({
+                                'url': direct_url,
+                                'filename': filename,
+                                'size': await self.get_file_size(direct_url),
+                                'metadata': {}
+                            })
+                        except Exception as e:
+                            logger.error(f"Error processing Google Drive link: {e}")
 
             # Make results unique based on URLs
             seen_urls = set()
@@ -448,10 +395,10 @@ class DownloadManager:
                     unique_files.append(f)
 
             return unique_files
-
         except Exception as e:
             logger.error(f"Error extracting files from {url}: {e}")
             return []
+
     async def download_file(self, file_info, save_dir, referer):
         file_url = file_info['url']
         fname = file_info['filename']
@@ -465,14 +412,11 @@ class DownloadManager:
         os.makedirs(save_dir, exist_ok=True)
 
         try:
-            # Get the real URL first
-            real_url, _ = await self.get_real_url(file_url)
-
-            if "drive.google.com" in real_url:
+            if "drive.google.com" in file_url:
                 import gdown
                 try:
                     st.write(f"Downloading from Google Drive: {fname}")
-                    output = gdown.download(real_url, path, quiet=False)
+                    output = gdown.download(file_url, path, quiet=False)
                     if output:
                         return path
                     return None
@@ -489,7 +433,7 @@ class DownloadManager:
                     'Referer': referer
                 }
 
-                response = await page.request.get(real_url, headers=headers, timeout=30000)
+                response = await page.request.get(file_url, headers=headers, timeout=30000)
 
                 if response.status == 200:
                     content = await response.body()
@@ -497,224 +441,102 @@ class DownloadManager:
                         f.write(content)
                     return path
                 else:
-                    logger.error(f"Download failed with status {response.status}: {real_url}")
+                    logger.error(f"Download failed with status {response.status}: {file_url}")
                     return None
 
         except Exception as e:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
 
-    async def get_sublinks(self, url, limit=10000):
-        try:
-            # Get the real URL first
-            real_url, _ = await self.get_real_url(url)
-            await self.page.goto(real_url, timeout=30000)
-
-            # Wait for dynamic content
-            await self.page.wait_for_load_state('networkidle')
-
-            content = await self.page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-
-            parsed_base = urlparse(real_url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-            current_path = os.path.dirname(parsed_base.path)
-
-            links = set()
-
-            # Find links from various sources
-            # 1. Regular links
-            for a in soup.find_all('a', href=True):
-                href = a['href'].strip()
-                if href and not href.startswith(('javascript:', '#', 'mailto:')):
-                    links.add(href)
-
-            # 2. Script-embedded links
-            scripts = soup.find_all('script')
-            for script in scripts:
-                if script.string:
-                    urls = re.findall(r'(?:href=|url=|link=|src=)["\']([^"\']+)["\']', script.string)
-                    links.update(urls)
-
-            # 3. Form actions
-            forms = soup.find_all('form', action=True)
-            for form in forms:
-                links.add(form['action'])
-
-            # Process and clean links
-            clean_links = set()
-            for href in links:
-                try:
-                    # Skip empty links
-                    if not href.strip():
-                        continue
-
-                    # Convert to absolute URL
-                    if href.startswith('http'):
-                        full_url = href
-                    elif href.startswith('//'):
-                        full_url = parsed_base.scheme + ':' + href
-                    elif href.startswith('/'):
-                        full_url = base_url + href
-                    else:
-                        # Handle relative paths
-                        if current_path and current_path != '/':
-                            full_url = base_url + current_path + '/' + href
-                        else:
-                            full_url = base_url + '/' + href
-
-                    # Clean the URL
-                    full_url = full_url.split('#')[0]  # Remove fragments
-
-                    # Only add if it's a related domain
-                    if self.is_related_domain(full_url):
-                        clean_links.add(full_url)
-
-                except Exception as e:
-                    logger.error(f"Error processing link {href}: {e}")
-                    continue
-
-            # Sort links for consistency
-            sorted_links = sorted(list(clean_links))
-            return sorted_links[:limit]
-
-        except Exception as e:
-            logger.error(f"Error getting sublinks: {e}")
-            return []
-
-    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100, timeout=30):
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000):
         if not custom_ext_list:
             custom_ext_list = []
-
+
         progress_text = st.empty()
         progress_bar = st.progress(0)
         file_count_text = st.empty()
-
-        try:
-            # Initialize base domains with the original URL
-            self.get_base_domain(url)
 
-            # First step: Get all sublinks
-            progress_text.text("Getting all sublinks from main page...")
-            sublinks = await self.get_sublinks(url, limit=sublink_limit)
+        try:
+            # Search main page
+            progress_text.text("Analyzing main page...")
+            main_files = await self.extract_downloadable_files(url, custom_ext_list)
+            initial_count = len(main_files)
+            file_count_text.text(f"Found {initial_count} files on main page")
+
+            # Get and search sublinks
+            progress_text.text("Getting sublinks...")
+            sublinks = await self.get_sublinks(url, sublink_limit)
             total_links = len(sublinks)
-
+
            progress_text.text(f"Found {total_links} sublinks to process")
-            if total_links == 0:
+
+            if not sublinks:
                 progress_bar.progress(1.0)
-                # If no sublinks, try direct file search
-                return await self.extract_downloadable_files(url, custom_ext_list)
-
-            # Process main page and sublinks
-            all_files = []
-
-            # Create semaphore for concurrent processing
-            sem = asyncio.Semaphore(10)
-
-            async def process_sublink(sublink, index):
-                async with sem:
-                    try:
-                        progress = (index) / total_links
-                        progress_text.text(f"Processing sublink {index}/{total_links}: {sublink}")
-                        progress_bar.progress(progress)
-
-                        async with async_timeout.timeout(timeout):
-                            # Get the final URL and headers for this sublink
-                            real_url, headers = await self.get_real_url(sublink)
-                            content_type = headers.get('content-type', '').lower()
-
-                            # If the sublink itself is a downloadable file, return it
-                            if any(x in content_type for x in ['pdf', 'zip', 'rar', 'mp3', 'mp4']):
-                                return [{
-                                    'url': real_url,
-                                    'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
-                                    'size': await self.get_file_size(real_url),
-                                    'metadata': {}
-                                }]
-
-                            # Otherwise, treat it as a webpage and search for file links
-                            await self.page.goto(real_url, timeout=30000, wait_until='networkidle')
-                            content = await self.page.content()
-                            soup = BeautifulSoup(content, 'html.parser')
-
-                            # Define default and custom file extensions
-                            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
-                                            '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif']
-                            custom_exts = [ext.strip().lower() for ext in custom_ext_list if ext.strip()]
-                            file_exts = set(default_exts + custom_exts)
-
-                            sublink_files = []
-                            # Iterate over all anchor tags found on the page
-                            for a in soup.find_all('a', href=True):
-                                href = a['href'].strip()
-                                if not href:
-                                    continue
-                                # Convert any relative URL to an absolute URL
-                                full_url = urljoin(real_url, href)
-                                if any(full_url.lower().endswith(ext) for ext in file_exts):
-                                    final_url, _ = await self.get_real_url(full_url)
-                                    file_info = {
-                                        'url': final_url,
-                                        'filename': os.path.basename(urlparse(final_url).path) or 'downloaded_file',
-                                        'size': await self.get_file_size(final_url),
-                                        'metadata': {}
-                                    }
-                                    if final_url.lower().endswith('.pdf'):
-                                        file_info['metadata'] = await self.get_pdf_metadata(final_url)
-                                    sublink_files.append(file_info)
-
-                            if sublink_files:
-                                logger.info(f"Found {len(sublink_files)} files at {real_url}")
-                                st.write(f"Found {len(sublink_files)} files at {real_url}")
-
-                            return sublink_files
-
-                    except asyncio.TimeoutError:
-                        logger.warning(f"Timeout processing sublink: {sublink}")
-                        return []
-                    except Exception as e:
-                        logger.error(f"Error processing sublink {sublink}: {e}")
-                        return []
-
-            # Process all sublinks concurrently
-            tasks = [process_sublink(sublink, i + 1) for i, sublink in enumerate(sublinks)]
-            sub_results = await asyncio.gather(*tasks)
-
-            # Combine all results
-            for sub_files in sub_results:
+                return main_files
+
+            # Process sublinks
+            all_files = main_files
+
+            for i, sublink in enumerate(sublinks, 1):
+                progress = i/total_links
+                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
+                progress_bar.progress(progress)
+
+                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                 all_files.extend(sub_files)
+
+                # Update count in real-time
                 file_count_text.text(f"Found {len(all_files)} total files")
-
-            # Remove duplicates based on URL
+
+            # Make results unique
             seen_urls = set()
             unique_files = []
+
            for f in all_files:
                 if f['url'] not in seen_urls:
                     seen_urls.add(f['url'])
                     unique_files.append(f)
-
+
             final_count = len(unique_files)
-            progress_text.text("Deep search complete!")
+            progress_text.text(f"Deep search complete!")
             file_count_text.text(f"Found {final_count} unique files")
             progress_bar.progress(1.0)
 
-            # Sort files by filename for consistency
-            unique_files.sort(key=lambda x: x['filename'].lower())
-
             return unique_files
-
+
         except Exception as e:
             logger.error(f"Deep search error: {e}")
             progress_text.text(f"Error during deep search: {str(e)}")
             return []
         finally:
+            # Clean up progress indicators after a delay
             await asyncio.sleep(2)
-            try:
+            if not st.session_state.get('keep_progress', False):
                 progress_text.empty()
                 progress_bar.empty()
-                file_count_text.empty()
-            except:
-                pass
+
+    async def get_sublinks(self, url, limit=10000):
+        try:
+            await self.page.goto(url, timeout=30000)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            links = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+                if href.startswith('http'):
+                    links.add(href)
+                elif href.startswith('/'):
+                    links.add(f"{base_url}{href}")
+
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks: {e}")
+            return []
 
 def main():
     if 'initialized' not in st.session_state:
@@ -741,8 +563,8 @@ def main():
         max_sublinks = st.number_input(
             "Maximum Sublinks to Process",
            min_value=1,
-            max_value=10000,
-            value=100,
+            max_value=100000,
+            value=10000,
            step=50,
            help="Maximum number of sublinks to process from the main page",
            key="max_sublinks_input"