euler314 committed on
Commit
7dc96a3
·
verified ·
1 Parent(s): 1f8c849

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -76
app.py CHANGED
@@ -207,6 +207,7 @@ def google_drive_upload(zip_path: str, credentials):
207
  except Exception as e:
208
  return f"Error uploading to Drive: {str(e)}"
209
  # DownloadManager Class
 
210
  class DownloadManager:
211
  def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
212
  self.use_proxy = use_proxy
@@ -243,29 +244,31 @@ class DownloadManager:
243
  'Referer': 'https://www.bing.com/'
244
  })
245
  return self
 
 
 
 
 
 
 
246
  async def search_bing(self):
247
  urls = []
248
  try:
249
  search_url = f"https://www.bing.com/search?q={self.query}"
250
  await self.page.goto(search_url, timeout=30000)
251
  await self.page.wait_for_load_state('networkidle')
252
-
253
  # Extract search result links
254
  links = await self.page.query_selector_all("li.b_algo h2 a")
255
  for link in links[:self.num_results]:
256
  href = await link.get_attribute('href')
257
  if href:
258
  urls.append(href)
259
-
260
  return urls
261
  except Exception as e:
262
  logger.error(f"Error searching Bing: {e}")
263
  return []
264
- async def __aexit__(self, exc_type, exc_val, exc_tb):
265
- if self.browser:
266
- await self.browser.close()
267
- if self.playwright:
268
- await self.playwright.stop()
269
 
270
  async def get_file_size(self, url):
271
  try:
@@ -424,77 +427,110 @@ class DownloadManager:
424
  counter = 1
425
  while os.path.exists(path):
426
  path = os.path.join(save_dir, f"{base}_{counter}{ext}")
427
- counter += 1
428
-
429
  os.makedirs(save_dir, exist_ok=True)
430
-
431
  try:
432
- if "drive.google.com" in file_url:
433
- import gdown
434
- try:
435
- st.write(f"Downloading from Google Drive: {fname}")
436
-
437
- # Determine file extension or use a default if none available
438
- if not ext or ext == "":
439
- # Try to determine file type from content-type header
440
- async with self.context.new_page() as page:
441
- response = await page.request.head(file_url, timeout=15000)
442
- content_type = response.headers.get('Content-Type', '')
443
-
444
- # Map content types to extensions
445
- extension_map = {
446
- 'application/pdf': '.pdf',
447
- 'image/jpeg': '.jpg',
448
- 'image/png': '.png',
449
- 'application/msword': '.doc',
450
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
451
- 'application/zip': '.zip',
452
- 'text/plain': '.txt',
453
- 'application/vnd.ms-excel': '.xls',
454
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx'
455
- }
456
-
457
- # Get extension from content type or use .bin as fallback
458
- ext = extension_map.get(content_type.split(';')[0], '.bin')
459
- path = os.path.join(save_dir, f"{base}{ext}")
460
-
461
- # Handle name collisions
462
- counter = 1
463
- while os.path.exists(path):
464
- path = os.path.join(save_dir, f"{base}_{counter}{ext}")
465
- counter += 1
466
-
467
- output = gdown.download(file_url, path, quiet=False)
468
- if output:
469
- return path
470
- return None
471
- except Exception as e:
472
- logger.error(f"Google Drive download error: {e}")
473
- return None
474
-
475
- async with self.context.new_page() as page:
476
- st.write(f"Downloading: {fname}")
477
-
478
- headers = {
479
- 'Accept': '*/*',
480
- 'Accept-Encoding': 'gzip, deflate, br',
481
- 'Referer': referer
482
- }
483
-
484
- response = await page.request.get(file_url, headers=headers, timeout=30000)
485
-
486
- if response.status == 200:
487
- content = await response.body()
488
- with open(path, 'wb') as f:
489
- f.write(content)
490
- return path
491
- else:
492
- logger.error(f"Download failed with status {response.status}: {file_url}")
493
- return None
494
-
495
- except Exception as e:
496
- logger.error(f"Error downloading {file_url}: {e}")
497
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
500
  if not custom_ext_list:
 
207
  except Exception as e:
208
  return f"Error uploading to Drive: {str(e)}"
209
  # DownloadManager Class
210
+ # DownloadManager Class
211
  class DownloadManager:
212
  def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
213
  self.use_proxy = use_proxy
 
244
  'Referer': 'https://www.bing.com/'
245
  })
246
  return self
247
+
248
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
249
+ if self.browser:
250
+ await self.browser.close()
251
+ if self.playwright:
252
+ await self.playwright.stop()
253
+
254
  async def search_bing(self):
255
  urls = []
256
  try:
257
  search_url = f"https://www.bing.com/search?q={self.query}"
258
  await self.page.goto(search_url, timeout=30000)
259
  await self.page.wait_for_load_state('networkidle')
260
+
261
  # Extract search result links
262
  links = await self.page.query_selector_all("li.b_algo h2 a")
263
  for link in links[:self.num_results]:
264
  href = await link.get_attribute('href')
265
  if href:
266
  urls.append(href)
267
+
268
  return urls
269
  except Exception as e:
270
  logger.error(f"Error searching Bing: {e}")
271
  return []
 
 
 
 
 
272
 
273
  async def get_file_size(self, url):
274
  try:
 
427
  counter = 1
428
  while os.path.exists(path):
429
  path = os.path.join(save_dir, f"{base}_{counter}{ext}")
430
+ counter += 1
431
+
432
  os.makedirs(save_dir, exist_ok=True)
433
+
434
  try:
435
+ if "drive.google.com" in file_url:
436
+ import gdown
437
+ try:
438
+ st.write(f"Downloading from Google Drive: {fname}")
439
+
440
+ # Determine file extension or use a default if none available
441
+ if not ext or ext == "":
442
+ # Try to determine file type from content-type header
443
+ async with self.context.new_page() as page:
444
+ response = await page.request.head(file_url, timeout=15000)
445
+ content_type = response.headers.get('Content-Type', '')
446
+
447
+ # Map content types to extensions
448
+ extension_map = {
449
+ 'application/pdf': '.pdf',
450
+ 'image/jpeg': '.jpg',
451
+ 'image/png': '.png',
452
+ 'application/msword': '.doc',
453
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
454
+ 'application/zip': '.zip',
455
+ 'text/plain': '.txt',
456
+ 'application/vnd.ms-excel': '.xls',
457
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
458
+ 'video/mp4': '.mp4',
459
+ 'audio/mpeg': '.mp3',
460
+ 'video/x-msvideo': '.avi',
461
+ 'video/x-matroska': '.mkv'
462
+ }
463
+
464
+ # Get extension from content type or use .bin as fallback
465
+ ext = extension_map.get(content_type.split(';')[0], '.bin')
466
+ path = os.path.join(save_dir, f"{base}{ext}")
467
+
468
+ # Handle name collisions
469
+ counter = 1
470
+ while os.path.exists(path):
471
+ path = os.path.join(save_dir, f"{base}_{counter}{ext}")
472
+ counter += 1
473
+
474
+ output = gdown.download(file_url, path, quiet=False)
475
+ if output:
476
+ return path
477
+ return None
478
+ except Exception as e:
479
+ logger.error(f"Google Drive download error: {e}")
480
+ return None
481
+
482
+ async with self.context.new_page() as page:
483
+ st.write(f"Downloading: {fname}")
484
+
485
+ headers = {
486
+ 'Accept': '*/*',
487
+ 'Accept-Encoding': 'gzip, deflate, br',
488
+ 'Referer': referer
489
+ }
490
+
491
+ response = await page.request.get(file_url, headers=headers, timeout=30000)
492
+
493
+ if response.status == 200:
494
+ content = await response.body()
495
+
496
+ # Check if we need to add an extension based on content type
497
+ if not ext or ext == "":
498
+ content_type = response.headers.get('Content-Type', '')
499
+ extension_map = {
500
+ 'application/pdf': '.pdf',
501
+ 'image/jpeg': '.jpg',
502
+ 'image/png': '.png',
503
+ 'application/msword': '.doc',
504
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
505
+ 'application/zip': '.zip',
506
+ 'text/plain': '.txt',
507
+ 'application/vnd.ms-excel': '.xls',
508
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
509
+ 'video/mp4': '.mp4',
510
+ 'audio/mpeg': '.mp3',
511
+ 'video/x-msvideo': '.avi',
512
+ 'video/x-matroska': '.mkv'
513
+ }
514
+
515
+ ext = extension_map.get(content_type.split(';')[0], '.bin')
516
+ path = os.path.join(save_dir, f"{base}{ext}")
517
+
518
+ # Handle name collisions again
519
+ counter = 1
520
+ while os.path.exists(path):
521
+ path = os.path.join(save_dir, f"{base}_{counter}{ext}")
522
+ counter += 1
523
+
524
+ with open(path, 'wb') as f:
525
+ f.write(content)
526
+ return path
527
+ else:
528
+ logger.error(f"Download failed with status {response.status}: {file_url}")
529
+ return None
530
+
531
+ except Exception as e:
532
+ logger.error(f"Error downloading {file_url}: {e}")
533
+ return None
534
 
535
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
536
  if not custom_ext_list: