Spaces:

euler314
/

craw_web

Sleeping

App Files Community

euler314 committed on Feb 25

Commit

7dc96a3

verified ·

1 Parent(s): 1f8c849

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -76

app.py CHANGED Viewed

@@ -207,6 +207,7 @@ def google_drive_upload(zip_path: str, credentials):
     except Exception as e:
         return f"Error uploading to Drive: {str(e)}"
 # DownloadManager Class
 class DownloadManager:
     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
         self.use_proxy = use_proxy
@@ -243,29 +244,31 @@ class DownloadManager:
             'Referer': 'https://www.bing.com/'
         })
         return self
     async def search_bing(self):
         """Search Bing for ``self.query`` and return the result URLs.
         Loads the results page in ``self.page`` and collects the ``href`` of
         at most ``self.num_results`` links matching ``li.b_algo h2 a``.
         Returns:
             A list of URL strings, or an empty list if any step raises
             (the error is logged via ``logger``).
         """
         # Function-scope import so the fix does not depend on file-level imports.
         from urllib.parse import quote_plus
         urls = []
         try:
             # Percent-encode the query: raw spaces, '&', '#' or '+' would
             # otherwise alter or truncate the search string in the URL.
             encoded_query = quote_plus(str(self.query))
             search_url = f"https://www.bing.com/search?q={encoded_query}"
             await self.page.goto(search_url, timeout=30000)
             await self.page.wait_for_load_state('networkidle')
             # Extract search result links
             links = await self.page.query_selector_all("li.b_algo h2 a")
             for link in links[:self.num_results]:
                 href = await link.get_attribute('href')
                 if href:
                     urls.append(href)
             return urls
         except Exception as e:
             logger.error(f"Error searching Bing: {e}")
             return []
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        # Async context-manager exit: close the browser, then stop Playwright,
-        # skipping whichever attribute is falsy. The exception arguments are
-        # unused and nothing is returned, so exceptions are not suppressed.
-        if self.browser:
-            await self.browser.close()
-        if self.playwright:
-            await self.playwright.stop()
     async def get_file_size(self, url):
         try:
@@ -424,77 +427,110 @@ class DownloadManager:
         counter = 1
         while os.path.exists(path):
             path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-           	counter += 1
         os.makedirs(save_dir, exist_ok=True)
         try:
-           	if "drive.google.com" in file_url:
-           		import gdown
-           		try:
-           			st.write(f"Downloading from Google Drive: {fname}")
-           			# Determine file extension or use a default if none available
-           			if not ext or ext == "":
-       				# Try to determine file type from content-type header
-           				async with self.context.new_page() as page:
-           					response = await page.request.head(file_url, timeout=15000)
-           					content_type = response.headers.get('Content-Type', '')
-           					# Map content types to extensions
-           					extension_map = {
-           						'application/pdf': '.pdf',
-           						'image/jpeg': '.jpg',
-           						'image/png': '.png',
-           						'application/msword': '.doc',
-           						'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
-           						'application/zip': '.zip',
-           						'text/plain': '.txt',
-           						'application/vnd.ms-excel': '.xls',
-           						'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx'
-           					}
-           					# Get extension from content type or use .bin as fallback
-           					ext = extension_map.get(content_type.split(';')[0], '.bin')
-           					path = os.path.join(save_dir, f"{base}{ext}")
-           					# Handle name collisions
-           					counter = 1
-           					while os.path.exists(path):
-           						path = os.path.join(save_dir, f"{base}_{counter}{ext}")
-           						counter += 1
-           			output = gdown.download(file_url, path, quiet=False)
-           			if output:
-           				return path
-           			return None
-           		except Exception as e:
-           			logger.error(f"Google Drive download error: {e}")
-           			return None
-   	async with self.context.new_page() as page:
-   		st.write(f"Downloading: {fname}")
-   		headers = {
-   			'Accept': '*/*',
-   			'Accept-Encoding': 'gzip, deflate, br',
-   			'Referer': referer
-   		}
-   		response = await page.request.get(file_url, headers=headers, timeout=30000)
-   		if response.status == 200:
-   			content = await response.body()
-   			with open(path, 'wb') as f:
-   				f.write(content)
-   			return path
-   		else:
-   			logger.error(f"Download failed with status {response.status}: {file_url}")
-   			return None
-   except Exception as e:
-   	logger.error(f"Error downloading {file_url}: {e}")
-   	return None
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
         if not custom_ext_list:

     except Exception as e:
         return f"Error uploading to Drive: {str(e)}"
 # DownloadManager Class
+# DownloadManager Class
 class DownloadManager:
     def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
         self.use_proxy = use_proxy
             'Referer': 'https://www.bing.com/'
         })
         return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self.browser:
+            await self.browser.close()
+        if self.playwright:
+            await self.playwright.stop()
     async def search_bing(self):
         """Search Bing for ``self.query`` and return the result URLs.
         Loads the results page in ``self.page`` and collects the ``href`` of
         at most ``self.num_results`` links matching ``li.b_algo h2 a``.
         Returns:
             A list of URL strings, or an empty list if any step raises
             (the error is logged via ``logger``).
         """
         # Function-scope import so the fix does not depend on file-level imports.
         from urllib.parse import quote_plus
         urls = []
         try:
             # Percent-encode the query: raw spaces, '&', '#' or '+' would
             # otherwise alter or truncate the search string in the URL.
             encoded_query = quote_plus(str(self.query))
             search_url = f"https://www.bing.com/search?q={encoded_query}"
             await self.page.goto(search_url, timeout=30000)
             await self.page.wait_for_load_state('networkidle')
             # Extract search result links
             links = await self.page.query_selector_all("li.b_algo h2 a")
             for link in links[:self.num_results]:
                 href = await link.get_attribute('href')
                 if href:
                     urls.append(href)
             return urls
         except Exception as e:
             logger.error(f"Error searching Bing: {e}")
             return []
     async def get_file_size(self, url):
         try:
         counter = 1
         while os.path.exists(path):
             path = os.path.join(save_dir, f"{base}_{counter}{ext}")
+            counter += 1
         os.makedirs(save_dir, exist_ok=True)
         try:
+            if "drive.google.com" in file_url:
+                import gdown
+                try:
+                    st.write(f"Downloading from Google Drive: {fname}")
+                    # Determine file extension or use a default if none available
+                    if not ext or ext == "":
+                        # Try to determine file type from content-type header
+                        async with self.context.new_page() as page:
+                            response = await page.request.head(file_url, timeout=15000)
+                            content_type = response.headers.get('Content-Type', '')
+                            # Map content types to extensions
+                            extension_map = {
+                                'application/pdf': '.pdf',
+                                'image/jpeg': '.jpg',
+                                'image/png': '.png',
+                                'application/msword': '.doc',
+                                'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
+                                'application/zip': '.zip',
+                                'text/plain': '.txt',
+                                'application/vnd.ms-excel': '.xls',
+                                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
+                                'video/mp4': '.mp4',
+                                'audio/mpeg': '.mp3',
+                                'video/x-msvideo': '.avi',
+                                'video/x-matroska': '.mkv'
+                            }
+                            # Get extension from content type or use .bin as fallback
+                            ext = extension_map.get(content_type.split(';')[0], '.bin')
+                            path = os.path.join(save_dir, f"{base}{ext}")
+                            # Handle name collisions
+                            counter = 1
+                            while os.path.exists(path):
+                                path = os.path.join(save_dir, f"{base}_{counter}{ext}")
+                                counter += 1
+                    output = gdown.download(file_url, path, quiet=False)
+                    if output:
+                        return path
+                    return None
+                except Exception as e:
+                    logger.error(f"Google Drive download error: {e}")
+                    return None
+            async with self.context.new_page() as page:
+                st.write(f"Downloading: {fname}")
+                headers = {
+                    'Accept': '*/*',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Referer': referer
+                }
+                response = await page.request.get(file_url, headers=headers, timeout=30000)
+                if response.status == 200:
+                    content = await response.body()
+                    # Check if we need to add an extension based on content type
+                    if not ext or ext == "":
+                        content_type = response.headers.get('Content-Type', '')
+                        extension_map = {
+                            'application/pdf': '.pdf',
+                            'image/jpeg': '.jpg',
+                            'image/png': '.png',
+                            'application/msword': '.doc',
+                            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
+                            'application/zip': '.zip',
+                            'text/plain': '.txt',
+                            'application/vnd.ms-excel': '.xls',
+                            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
+                            'video/mp4': '.mp4',
+                            'audio/mpeg': '.mp3',
+                            'video/x-msvideo': '.avi',
+                            'video/x-matroska': '.mkv'
+                        }
+                        ext = extension_map.get(content_type.split(';')[0], '.bin')
+                        path = os.path.join(save_dir, f"{base}{ext}")
+                        # Handle name collisions again
+                        counter = 1
+                        while os.path.exists(path):
+                            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
+                            counter += 1
+                    with open(path, 'wb') as f:
+                        f.write(content)
+                    return path
+                else:
+                    logger.error(f"Download failed with status {response.status}: {file_url}")
+                    return None
+        except Exception as e:
+            logger.error(f"Error downloading {file_url}: {e}")
+            return None
     async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
         if not custom_ext_list: