Update app.py
Browse files
app.py
CHANGED
@@ -207,6 +207,7 @@ def google_drive_upload(zip_path: str, credentials):
|
|
207 |
except Exception as e:
|
208 |
return f"Error uploading to Drive: {str(e)}"
|
209 |
# DownloadManager Class
|
|
|
210 |
class DownloadManager:
|
211 |
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
|
212 |
self.use_proxy = use_proxy
|
@@ -243,29 +244,31 @@ class DownloadManager:
|
|
243 |
'Referer': 'https://www.bing.com/'
|
244 |
})
|
245 |
return self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
async def search_bing(self):
|
247 |
urls = []
|
248 |
try:
|
249 |
search_url = f"https://www.bing.com/search?q={self.query}"
|
250 |
await self.page.goto(search_url, timeout=30000)
|
251 |
await self.page.wait_for_load_state('networkidle')
|
252 |
-
|
253 |
# Extract search result links
|
254 |
links = await self.page.query_selector_all("li.b_algo h2 a")
|
255 |
for link in links[:self.num_results]:
|
256 |
href = await link.get_attribute('href')
|
257 |
if href:
|
258 |
urls.append(href)
|
259 |
-
|
260 |
return urls
|
261 |
except Exception as e:
|
262 |
logger.error(f"Error searching Bing: {e}")
|
263 |
return []
|
264 |
-
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
265 |
-
if self.browser:
|
266 |
-
await self.browser.close()
|
267 |
-
if self.playwright:
|
268 |
-
await self.playwright.stop()
|
269 |
|
270 |
async def get_file_size(self, url):
|
271 |
try:
|
@@ -424,77 +427,110 @@ class DownloadManager:
|
|
424 |
counter = 1
|
425 |
while os.path.exists(path):
|
426 |
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
|
427 |
-
|
428 |
-
|
429 |
os.makedirs(save_dir, exist_ok=True)
|
430 |
-
|
431 |
try:
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
498 |
|
499 |
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
|
500 |
if not custom_ext_list:
|
|
|
207 |
except Exception as e:
|
208 |
return f"Error uploading to Drive: {str(e)}"
|
209 |
# DownloadManager Class
|
210 |
+
# DownloadManager Class
|
211 |
class DownloadManager:
|
212 |
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
|
213 |
self.use_proxy = use_proxy
|
|
|
244 |
'Referer': 'https://www.bing.com/'
|
245 |
})
|
246 |
return self
|
247 |
+
|
248 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
249 |
+
if self.browser:
|
250 |
+
await self.browser.close()
|
251 |
+
if self.playwright:
|
252 |
+
await self.playwright.stop()
|
253 |
+
|
254 |
async def search_bing(self):
|
255 |
urls = []
|
256 |
try:
|
257 |
search_url = f"https://www.bing.com/search?q={self.query}"
|
258 |
await self.page.goto(search_url, timeout=30000)
|
259 |
await self.page.wait_for_load_state('networkidle')
|
260 |
+
|
261 |
# Extract search result links
|
262 |
links = await self.page.query_selector_all("li.b_algo h2 a")
|
263 |
for link in links[:self.num_results]:
|
264 |
href = await link.get_attribute('href')
|
265 |
if href:
|
266 |
urls.append(href)
|
267 |
+
|
268 |
return urls
|
269 |
except Exception as e:
|
270 |
logger.error(f"Error searching Bing: {e}")
|
271 |
return []
|
|
|
|
|
|
|
|
|
|
|
272 |
|
273 |
async def get_file_size(self, url):
|
274 |
try:
|
|
|
427 |
counter = 1
|
428 |
while os.path.exists(path):
|
429 |
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
|
430 |
+
counter += 1
|
431 |
+
|
432 |
os.makedirs(save_dir, exist_ok=True)
|
433 |
+
|
434 |
try:
|
435 |
+
if "drive.google.com" in file_url:
|
436 |
+
import gdown
|
437 |
+
try:
|
438 |
+
st.write(f"Downloading from Google Drive: {fname}")
|
439 |
+
|
440 |
+
# Determine file extension or use a default if none available
|
441 |
+
if not ext or ext == "":
|
442 |
+
# Try to determine file type from content-type header
|
443 |
+
async with self.context.new_page() as page:
|
444 |
+
response = await page.request.head(file_url, timeout=15000)
|
445 |
+
content_type = response.headers.get('Content-Type', '')
|
446 |
+
|
447 |
+
# Map content types to extensions
|
448 |
+
extension_map = {
|
449 |
+
'application/pdf': '.pdf',
|
450 |
+
'image/jpeg': '.jpg',
|
451 |
+
'image/png': '.png',
|
452 |
+
'application/msword': '.doc',
|
453 |
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
454 |
+
'application/zip': '.zip',
|
455 |
+
'text/plain': '.txt',
|
456 |
+
'application/vnd.ms-excel': '.xls',
|
457 |
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
|
458 |
+
'video/mp4': '.mp4',
|
459 |
+
'audio/mpeg': '.mp3',
|
460 |
+
'video/x-msvideo': '.avi',
|
461 |
+
'video/x-matroska': '.mkv'
|
462 |
+
}
|
463 |
+
|
464 |
+
# Get extension from content type or use .bin as fallback
|
465 |
+
ext = extension_map.get(content_type.split(';')[0], '.bin')
|
466 |
+
path = os.path.join(save_dir, f"{base}{ext}")
|
467 |
+
|
468 |
+
# Handle name collisions
|
469 |
+
counter = 1
|
470 |
+
while os.path.exists(path):
|
471 |
+
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
|
472 |
+
counter += 1
|
473 |
+
|
474 |
+
output = gdown.download(file_url, path, quiet=False)
|
475 |
+
if output:
|
476 |
+
return path
|
477 |
+
return None
|
478 |
+
except Exception as e:
|
479 |
+
logger.error(f"Google Drive download error: {e}")
|
480 |
+
return None
|
481 |
+
|
482 |
+
async with self.context.new_page() as page:
|
483 |
+
st.write(f"Downloading: {fname}")
|
484 |
+
|
485 |
+
headers = {
|
486 |
+
'Accept': '*/*',
|
487 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
488 |
+
'Referer': referer
|
489 |
+
}
|
490 |
+
|
491 |
+
response = await page.request.get(file_url, headers=headers, timeout=30000)
|
492 |
+
|
493 |
+
if response.status == 200:
|
494 |
+
content = await response.body()
|
495 |
+
|
496 |
+
# Check if we need to add an extension based on content type
|
497 |
+
if not ext or ext == "":
|
498 |
+
content_type = response.headers.get('Content-Type', '')
|
499 |
+
extension_map = {
|
500 |
+
'application/pdf': '.pdf',
|
501 |
+
'image/jpeg': '.jpg',
|
502 |
+
'image/png': '.png',
|
503 |
+
'application/msword': '.doc',
|
504 |
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
505 |
+
'application/zip': '.zip',
|
506 |
+
'text/plain': '.txt',
|
507 |
+
'application/vnd.ms-excel': '.xls',
|
508 |
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
|
509 |
+
'video/mp4': '.mp4',
|
510 |
+
'audio/mpeg': '.mp3',
|
511 |
+
'video/x-msvideo': '.avi',
|
512 |
+
'video/x-matroska': '.mkv'
|
513 |
+
}
|
514 |
+
|
515 |
+
ext = extension_map.get(content_type.split(';')[0], '.bin')
|
516 |
+
path = os.path.join(save_dir, f"{base}{ext}")
|
517 |
+
|
518 |
+
# Handle name collisions again
|
519 |
+
counter = 1
|
520 |
+
while os.path.exists(path):
|
521 |
+
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
|
522 |
+
counter += 1
|
523 |
+
|
524 |
+
with open(path, 'wb') as f:
|
525 |
+
f.write(content)
|
526 |
+
return path
|
527 |
+
else:
|
528 |
+
logger.error(f"Download failed with status {response.status}: {file_url}")
|
529 |
+
return None
|
530 |
+
|
531 |
+
except Exception as e:
|
532 |
+
logger.error(f"Error downloading {file_url}: {e}")
|
533 |
+
return None
|
534 |
|
535 |
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
|
536 |
if not custom_ext_list:
|