Update app.py
Browse files
app.py
CHANGED
@@ -243,7 +243,24 @@ class DownloadManager:
|
|
243 |
'Referer': 'https://www.bing.com/'
|
244 |
})
|
245 |
return self
|
246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
248 |
if self.browser:
|
249 |
await self.browser.close()
|
@@ -406,47 +423,78 @@ class DownloadManager:
|
|
406 |
base, ext = os.path.splitext(fname)
|
407 |
counter = 1
|
408 |
while os.path.exists(path):
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
os.makedirs(save_dir, exist_ok=True)
|
413 |
-
|
414 |
try:
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
|
451 |
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
|
452 |
if not custom_ext_list:
|
|
|
243 |
'Referer': 'https://www.bing.com/'
|
244 |
})
|
245 |
return self
|
246 |
+
async def search_bing(self):
    """Run a Bing web search for ``self.query`` and collect result URLs.

    Navigates the managed Playwright page to the Bing results page, waits
    for the network to go idle, then scrapes the organic-result title links.

    Returns:
        list[str]: Up to ``self.num_results`` result hrefs, or an empty
        list if the search fails for any reason (best-effort semantics).
    """
    # Stdlib; imported locally so this method stands alone as a patch.
    from urllib.parse import quote_plus

    urls = []
    try:
        # Bug fix: the raw query must be percent-encoded — spaces and
        # reserved characters (&, #, ?, +) would otherwise corrupt the
        # request URL or be misparsed as extra query parameters.
        search_url = f"https://www.bing.com/search?q={quote_plus(self.query)}"
        await self.page.goto(search_url, timeout=30000)
        await self.page.wait_for_load_state('networkidle')

        # Organic Bing results are <li class="b_algo"> entries; the title
        # anchor inside the <h2> carries the destination href.
        links = await self.page.query_selector_all("li.b_algo h2 a")
        for link in links[:self.num_results]:
            href = await link.get_attribute('href')
            if href:
                urls.append(href)

        return urls
    except Exception as e:
        # Degrade to "no results" rather than crashing the caller; the
        # failure is still recorded for diagnosis.
        logger.error(f"Error searching Bing: {e}")
        return []
|
264 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Async context-manager exit: close the browser if one was launched.

    Exceptions from the managed block are not suppressed (implicit
    ``None`` return).
    """
    browser = self.browser
    if browser:
        await browser.close()
|
|
|
423 |
base, ext = os.path.splitext(fname)
|
424 |
counter = 1
|
425 |
while os.path.exists(path):
|
426 |
+
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
|
427 |
+
counter += 1
|
428 |
+
|
429 |
os.makedirs(save_dir, exist_ok=True)
|
430 |
+
|
431 |
try:
|
432 |
+
if "drive.google.com" in file_url:
|
433 |
+
import gdown
|
434 |
+
try:
|
435 |
+
st.write(f"Downloading from Google Drive: {fname}")
|
436 |
+
|
437 |
+
# Determine file extension or use a default if none available
|
438 |
+
if not ext or ext == "":
|
439 |
+
# Try to determine file type from content-type header
|
440 |
+
async with self.context.new_page() as page:
|
441 |
+
response = await page.request.head(file_url, timeout=15000)
|
442 |
+
content_type = response.headers.get('Content-Type', '')
|
443 |
+
|
444 |
+
# Map content types to extensions
|
445 |
+
extension_map = {
|
446 |
+
'application/pdf': '.pdf',
|
447 |
+
'image/jpeg': '.jpg',
|
448 |
+
'image/png': '.png',
|
449 |
+
'application/msword': '.doc',
|
450 |
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
451 |
+
'application/zip': '.zip',
|
452 |
+
'text/plain': '.txt',
|
453 |
+
'application/vnd.ms-excel': '.xls',
|
454 |
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx'
|
455 |
+
}
|
456 |
+
|
457 |
+
# Get extension from content type or use .bin as fallback
|
458 |
+
ext = extension_map.get(content_type.split(';')[0], '.bin')
|
459 |
+
path = os.path.join(save_dir, f"{base}{ext}")
|
460 |
+
|
461 |
+
# Handle name collisions
|
462 |
+
counter = 1
|
463 |
+
while os.path.exists(path):
|
464 |
+
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
|
465 |
+
counter += 1
|
466 |
+
|
467 |
+
output = gdown.download(file_url, path, quiet=False)
|
468 |
+
if output:
|
469 |
+
return path
|
470 |
+
return None
|
471 |
+
except Exception as e:
|
472 |
+
logger.error(f"Google Drive download error: {e}")
|
473 |
+
return None
|
474 |
+
|
475 |
+
async with self.context.new_page() as page:
|
476 |
+
st.write(f"Downloading: {fname}")
|
477 |
+
|
478 |
+
headers = {
|
479 |
+
'Accept': '*/*',
|
480 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
481 |
+
'Referer': referer
|
482 |
+
}
|
483 |
+
|
484 |
+
response = await page.request.get(file_url, headers=headers, timeout=30000)
|
485 |
+
|
486 |
+
if response.status == 200:
|
487 |
+
content = await response.body()
|
488 |
+
with open(path, 'wb') as f:
|
489 |
+
f.write(content)
|
490 |
+
return path
|
491 |
+
else:
|
492 |
+
logger.error(f"Download failed with status {response.status}: {file_url}")
|
493 |
+
return None
|
494 |
+
|
495 |
+
except Exception as e:
|
496 |
+
logger.error(f"Error downloading {file_url}: {e}")
|
497 |
+
return None
|
498 |
|
499 |
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
|
500 |
if not custom_ext_list:
|