Update app.py
app.py
CHANGED
@@ -37,7 +37,7 @@ from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 from sklearn.cluster import KMeans
 import numpy as np
-
+import base64
 # -------------------- Logging Setup --------------------
 logging.basicConfig(
     filename='advanced_download_log.txt',

@@ -388,12 +388,12 @@ class DownloadManager:
                 counter += 1
             os.makedirs(save_dir, exist_ok=True)
             try:
-                if "drive.google.com" in file_url:
-
-
-                    if
-
-
+                if "drive.google.com" in file_url or "docs.google.com" in file_url:
+                    # Use enhanced Google Drive downloader
+                    success = await self.download_from_google_drive(file_url, path)
+                    return path if success else None
+
+                # Original code for non-Google Drive downloads
                 async with self.context.new_page() as page:
                     headers = {
                         'Accept': '*/*',
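The new branch routes any Google-hosted link to the dedicated downloader before the generic Playwright path runs. A standalone look at which URLs the predicate catches (the sample URLs are invented for this sketch):

    # Sketch of the dispatch predicate from the hunk above; sample URLs are hypothetical.
    sample_urls = [
        "https://drive.google.com/file/d/FILE_ID/view",
        "https://docs.google.com/document/d/FILE_ID/edit",
        "https://example.com/report.pdf",
    ]
    for file_url in sample_urls:
        is_drive = "drive.google.com" in file_url or "docs.google.com" in file_url
        print(file_url, "->", "download_from_google_drive" if is_drive else "generic download path")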
@@ -413,6 +413,275 @@ class DownloadManager:
             logger.error(f"Error downloading {file_url}: {e}")
             return None
 
+    async def download_from_google_drive(self, url, save_path):
+        """Enhanced method to download from Google Drive with multiple fallback approaches"""
+        # Extract the file ID from different URL formats
+        file_id = None
+        url_patterns = [
+            r'drive\.google\.com/file/d/([^/]+)',
+            r'drive\.google\.com/open\?id=([^&]+)',
+            r'docs\.google\.com/\w+/d/([^/]+)',
+            r'id=([^&]+)',
+            r'drive\.google\.com/uc\?id=([^&]+)',
+        ]
+
+        for pattern in url_patterns:
+            match = re.search(pattern, url)
+            if match:
+                file_id = match.group(1)
+                break
+
+        if not file_id:
+            logger.error(f"Could not extract file ID from URL: {url}")
+            return False
+
+        # Approach 1: Try with gdown first (when it works)
+        try:
+            import gdown
+            output = gdown.download(url, save_path, quiet=False, fuzzy=True)
+            if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                logger.info(f"Successfully downloaded with gdown: {url}")
+                return True
+        except Exception as e:
+            logger.warning(f"gdown download failed: {e}")
+
+        # Approach 2: Use Playwright session with cookies
+        try:
+            async with self.context.new_page() as page:
+                # Visit the file viewing page to get cookies
+                view_url = f"https://drive.google.com/file/d/{file_id}/view"
+                await page.goto(view_url, wait_until='networkidle', timeout=60000)
+
+                # Check for view-only permissions
+                if await page.query_selector('text="the owner has not granted you permission to download this file"'):
+                    logger.warning("File has view-only permissions, attempting workaround")
+
+                    # Check if it's a PDF (we can use the JS method)
+                    is_pdf = await page.query_selector('embed[type="application/pdf"]') is not None
+                    if is_pdf:
+                        # Try JavaScript PDF capture approach for PDFs
+                        success = await self.download_viewonly_pdf_with_js(page, save_path)
+                        if success:
+                            return True
+
+                    # Try direct download attempt for view-only files
+                    cookies = await page.context.cookies()
+                    cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
+
+                    # Try download URL with custom headers and cookies
+                    download_url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm=t"
+                    await page.goto(download_url, wait_until='networkidle', timeout=60000)
+
+                    headers = {
+                        'User-Agent': get_random_user_agent(),
+                        'Cookie': cookie_str,
+                        'Accept': '*/*',
+                    }
+
+                    response = await page.request.get(download_url, headers=headers)
+                    if response.status == 200:
+                        content = await response.body()
+                        with open(save_path, 'wb') as f:
+                            f.write(content)
+                        return True
+
+                # Standard download flow for files with download permission
+                download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+                await page.goto(download_url, wait_until='networkidle', timeout=60000)
+
+                # Handle large files with confirmation
+                confirm_form = await page.query_selector('form#download-form')
+                if confirm_form:
+                    await confirm_form.evaluate('form => form.submit()')
+                    await page.wait_for_load_state('networkidle')
+
+                # Get cookies after confirmation
+                cookies = await page.context.cookies()
+                cookie_str = "; ".join([f"{c['name']}={c['value']}" for c in cookies])
+
+                # Get final download URL with confirmation token
+                download_url = f"https://drive.google.com/uc?export=download&id={file_id}&confirm=t"
+
+                response = await page.request.get(download_url, headers={'Cookie': cookie_str})
+                if response.status == 200:
+                    content = await response.body()
+                    with open(save_path, 'wb') as f:
+                        f.write(content)
+                    return True
+        except Exception as e:
+            logger.warning(f"Playwright download approach failed: {e}")
+
+        # Approach 3: Try with requests and session cookies
+        try:
+            import requests
+
+            session = requests.Session()
+            session.headers.update({'User-Agent': get_random_user_agent()})
+
+            # Get the initial page to obtain cookies
+            url = f"https://drive.google.com/uc?id={file_id}&export=download"
+            response = session.get(url, stream=True, timeout=30)
+
+            # Check for the download confirmation
+            confirmation_token = None
+            for k, v in response.cookies.items():
+                if k.startswith('download_warning'):
+                    confirmation_token = v
+                    break
+
+            # Use the confirmation token if found
+            if confirmation_token:
+                url = f"https://drive.google.com/uc?id={file_id}&export=download&confirm={confirmation_token}"
+
+            # Download the file
+            response = session.get(url, stream=True, timeout=60)
+            with open(save_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=1024*1024):
+                    if chunk:
+                        f.write(chunk)
+
+            if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                return True
+        except Exception as e:
+            logger.warning(f"Requests session download failed: {e}")
+
+        # All approaches failed
+        logger.error(f"All download attempts failed for: {url}")
+        return False
+
+    async def download_viewonly_pdf_with_js(self, page, save_path):
+        """Use JavaScript approach to download view-only PDFs from Google Drive"""
+        try:
+            logger.info("Attempting to download view-only PDF using JavaScript method")
+
+            # Scroll to ensure all pages are loaded
+            await page.evaluate("""
+                async function scrollToBottom() {
+                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
+                    const container = document.querySelector('.drive-viewer-paginated-scrollable');
+                    if (!container) return;
+
+                    const scrollHeight = container.scrollHeight;
+                    const viewportHeight = container.clientHeight;
+                    const scrollStep = viewportHeight / 2;
+
+                    for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
+                        container.scrollTo(0, scrollPos);
+                        await delay(500);
+                    }
+
+                    // Final scroll to ensure we reached the bottom
+                    container.scrollTo(0, scrollHeight);
+                    await delay(1000);
+                }
+
+                return scrollToBottom();
+            """)
+
+            # Wait for a moment to ensure all images are loaded
+            await page.wait_for_timeout(3000)
+
+            # Inject the jsPDF library
+            await page.evaluate("""
+                return new Promise((resolve, reject) => {
+                    const script = document.createElement('script');
+                    script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
+                    script.onload = () => resolve(true);
+                    script.onerror = () => reject(new Error('Failed to load jsPDF'));
+                    document.head.appendChild(script);
+                });
+            """)
+
+            # Wait for the library to load
+            await page.wait_for_timeout(1000)
+
+            # Execute the PDF creation script
+            pdf_data = await page.evaluate("""
+                return new Promise(async (resolve) => {
+                    // Make sure jsPDF is loaded
+                    if (typeof window.jspdf === 'undefined') {
+                        window.jspdf = window.jspdf || {};
+                    }
+
+                    // Use the jsPDF library
+                    const { jsPDF } = window.jspdf;
+                    const pdf = new jsPDF();
+
+                    const images = Array.from(document.querySelectorAll('img')).filter(img =>
+                        img.src.startsWith('blob:') && img.width > 100 && img.height > 100
+                    );
+
+                    if (images.length === 0) {
+                        resolve(null);
+                        return;
+                    }
+
+                    for (let i = 0; i < images.length; i++) {
+                        const img = images[i];
+
+                        // Create canvas and draw image
+                        const canvas = document.createElement('canvas');
+                        canvas.width = img.width;
+                        canvas.height = img.height;
+                        const ctx = canvas.getContext('2d');
+                        ctx.drawImage(img, 0, 0, img.width, img.height);
+
+                        // Add image to PDF
+                        const imgData = canvas.toDataURL('image/jpeg', 1.0);
+
+                        // Add a new page for each image except the first one
+                        if (i > 0) {
+                            pdf.addPage();
+                        }
+
+                        // Calculate dimensions to fit page
+                        const pageWidth = pdf.internal.pageSize.getWidth();
+                        const pageHeight = pdf.internal.pageSize.getHeight();
+                        const imgRatio = img.height / img.width;
+
+                        let imgWidth = pageWidth;
+                        let imgHeight = imgWidth * imgRatio;
+
+                        // If height exceeds page, scale down
+                        if (imgHeight > pageHeight) {
+                            imgHeight = pageHeight;
+                            imgWidth = imgHeight / imgRatio;
+                        }
+
+                        // Center image on page
+                        const x = (pageWidth - imgWidth) / 2;
+                        const y = (pageHeight - imgHeight) / 2;
+
+                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
+                    }
+
+                    // Get the PDF as base64
+                    const pdfBase64 = pdf.output('datauristring');
+                    resolve(pdfBase64);
+                });
+            """)
+
+            if not pdf_data or not pdf_data.startswith('data:application/pdf;base64,'):
+                logger.warning("Failed to generate PDF with JavaScript method")
+                return False
+
+            # Extract the base64 data and save to file
+            base64_data = pdf_data.replace('data:application/pdf;base64,', '')
+            pdf_bytes = base64.b64decode(base64_data)
+
+            with open(save_path, 'wb') as f:
+                f.write(pdf_bytes)
+
+            if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                logger.info("Successfully downloaded view-only PDF using JavaScript method")
+                return True
+            else:
+                return False
+
+        except Exception as e:
+            logger.error(f"Error in JavaScript PDF download method: {e}")
+            return False
+
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
         if not custom_ext_list:
             custom_ext_list = []
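download_from_google_drive stops at the first pattern in url_patterns that matches, so the shape-specific Drive and Docs patterns are consulted before the generic id= fallback. A standalone sketch of that extraction loop, using the patterns verbatim from the commit (the sample links and IDs below are invented):

    import re

    # Patterns copied from download_from_google_drive; first match wins.
    url_patterns = [
        r'drive\.google\.com/file/d/([^/]+)',
        r'drive\.google\.com/open\?id=([^&]+)',
        r'docs\.google\.com/\w+/d/([^/]+)',
        r'id=([^&]+)',
        r'drive\.google\.com/uc\?id=([^&]+)',
    ]

    def extract_file_id(url):
        for pattern in url_patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    # Hypothetical links; the IDs are placeholders.
    print(extract_file_id("https://drive.google.com/file/d/FILE_ID/view?usp=sharing"))  # FILE_ID
    print(extract_file_id("https://docs.google.com/spreadsheets/d/FILE_ID/edit"))       # FILE_ID
    print(extract_file_id("https://example.com/no-drive-link"))                         # None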
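Each fallback in download_from_google_drive treats any non-empty file as success. That test can pass even when Drive serves an HTML interstitial (quota exceeded, permission denied) instead of the requested file. A supplementary sanity check on the leading bytes would catch that case for PDFs; the helpers below are a sketch, not part of the commit:

    # Sketch only: distinguish a real PDF from an HTML error page by magic bytes.
    def looks_like_pdf(path):
        with open(path, 'rb') as f:
            return f.read(5) == b'%PDF-'

    def looks_like_html_page(path):
        with open(path, 'rb') as f:
            head = f.read(512).lstrip().lower()
        return head.startswith(b'<!doctype html') or head.startswith(b'<html')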
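The JavaScript capture path returns the generated PDF as a base64 data URI, which is what the import base64 added in the first hunk is for. One caveat: some jsPDF builds emit extra parameters (for example a filename) between the media type and ;base64, so the strict data:application/pdf;base64, prefix check may be brittle across versions; that is an observation, not something the commit addresses. A minimal round trip of the strip-and-decode step, with a fake payload:

    import base64

    # Fake payload standing in for jsPDF output; real PDFs begin with %PDF-.
    pdf_bytes_in = b'%PDF-1.4 fake payload'
    data_uri = 'data:application/pdf;base64,' + base64.b64encode(pdf_bytes_in).decode('ascii')

    # Same strip-and-decode sequence as download_viewonly_pdf_with_js:
    base64_data = data_uri.replace('data:application/pdf;base64,', '')
    pdf_bytes_out = base64.b64decode(base64_data)
    assert pdf_bytes_out == pdf_bytes_in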