euler314 committed
Commit fb399cc · verified · 1 Parent(s): 9fa91d7

Update app.py

Files changed (1)
  1. app.py +365 -4
app.py CHANGED
@@ -193,10 +193,371 @@ def nlp_extract_entities(text: str):
 # ---------- AI-enhanced Query Preprocessing -------------
 def ai_preprocess_query(query: str) -> str:
     return query
-
-# Now I'll add the DownloadManager class...
-# ---------- Download Manager Class -------------
-[Previous DownloadManager class code here...]  # Keep all the existing code from the DownloadManager class
+class DownloadManager:
+    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
+        self.use_proxy = use_proxy
+        self.proxy = proxy
+        self.query = query
+        self.num_results = num_results
+        self.playwright = None
+        self.browser = None
+        self.context = None
+        self.page = None
+
+    async def __aenter__(self):
+        self.playwright = await async_playwright().start()
+        opts = {"headless": True}
+        if self.use_proxy and self.proxy:
+            opts["proxy"] = {"server": self.proxy}
+        self.browser = await self.playwright.chromium.launch(**opts)
+        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
+        self.page = await self.context.new_page()
+        await self.page.set_extra_http_headers({
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Referer': 'https://www.bing.com/'
+        })
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self.browser:
+            await self.browser.close()
+        if self.playwright:
+            await self.playwright.stop()
+
+    async def get_file_size(self, url):
+        try:
+            async with self.context.new_page() as page:
+                response = await page.request.head(url, timeout=15000)
+                length = response.headers.get('Content-Length', None)
+                if length:
+                    return sizeof_fmt(int(length))
+                else:
+                    return "Unknown Size"
+        except Exception:
+            return "Unknown Size"
+
+    async def get_pdf_metadata(self, url):
+        try:
+            async with self.context.new_page() as page:
+                resp = await page.request.get(url, timeout=15000)
+                if resp.ok:
+                    content = await resp.body()
+                    pdf = BytesIO(content)
+                    reader = PdfReader(pdf)
+                    return {
+                        'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
+                        'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
+                        'Pages': len(reader.pages),
+                    }
+                else:
+                    return {}
+        except Exception:
+            return {}
+
+    async def extract_real_download_url(self, url):
+        try:
+            async with self.context.new_page() as page:
+                response = await page.goto(url, wait_until='networkidle', timeout=30000)
+
+                # Check if the response is a redirect
+                if response and response.headers.get('location'):
+                    return response.headers['location']
+
+                # Check if response is a file
+                content_type = response.headers.get('content-type', '')
+                if 'text/html' not in content_type.lower():
+                    return url
+
+                # Look for meta refresh
+                content = await page.content()
+                soup = BeautifulSoup(content, 'html.parser')
+                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
+                if meta_refresh:
+                    content = meta_refresh.get('content', '')
+                    if 'url=' in content.lower():
+                        return content.split('url=')[-1].strip()
+
+                return page.url
+        except Exception as e:
+            logger.error(f"Error extracting real download URL: {e}")
+            return url
+
+    async def extract_downloadable_files(self, url, custom_ext_list):
+        found_files = []
+        try:
+            # First try to load the page
+            response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
+            if not response:
+                return []
+
+            final_url = self.page.url
+
+            # Handle redirects and download scripts
+            if '.php' in final_url or 'download' in final_url or 'get' in final_url:
+                real_url = await self.extract_real_download_url(final_url)
+                if real_url != final_url:
+                    content_type = (await self.page.request.head(real_url)).headers.get('content-type', '')
+                    if content_type and 'text/html' not in content_type.lower():
+                        found_files.append({
+                            'url': real_url,
+                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                            'size': await self.get_file_size(real_url),
+                            'metadata': {}
+                        })
+                        return found_files
+
+            await self.page.wait_for_load_state('networkidle', timeout=30000)
+            await human_like_interactions(self.page)
+
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Define extensions to look for
+            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv',
+                            '.png', '.jpg', '.jpeg', '.gif', '.xlsx', '.xls', '.ppt', '.pptx', '.txt']
+            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
+
+            # Parse base URL for relative links
+            parsed_base = urlparse(final_url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            # Find all links
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+
+                # Skip empty or javascript links
+                if not href or href.startswith('javascript:') or href == '#':
+                    continue
+
+                # Handle special cases (PHP scripts, download handlers)
+                if '.php' in href.lower() or 'download' in href.lower() or 'get' in href.lower():
+                    full_url = href if href.startswith('http') else urljoin(base_url, href)
+                    real_url = await self.extract_real_download_url(full_url)
+                    if real_url and real_url != full_url:
+                        size_str = await self.get_file_size(real_url)
+                        found_files.append({
+                            'url': real_url,
+                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                            'size': size_str,
+                            'metadata': {}
+                        })
+                    continue
+
+                # Handle direct file links
+                if any(href.lower().endswith(ext) for ext in all_exts):
+                    file_url = href if href.startswith('http') else urljoin(base_url, href)
+                    size_str = await self.get_file_size(file_url)
+                    meta = {}
+
+                    if file_url.lower().endswith('.pdf'):
+                        meta = await self.get_pdf_metadata(file_url)
+
+                    found_files.append({
+                        'url': file_url,
+                        'filename': os.path.basename(urlparse(file_url).path),
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
+                # Handle Google Drive links
+                elif any(x in href for x in ['drive.google.com', 'docs.google.com']):
+                    file_id = None
+                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+                        match = re.search(pattern, href)
+                        if match:
+                            file_id = match.group(1)
+                            break
+
+                    if file_id:
+                        direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+                        async with self.context.new_page() as page:
+                            try:
+                                response = await page.request.head(direct_url, timeout=15000)
+                                filename = file_id
+                                content_disposition = response.headers.get('content-disposition', '')
+                                if content_disposition:
+                                    filename_match = re.findall('filename="(.+?)"', content_disposition)
+                                    if filename_match:
+                                        filename = filename_match[0]
+
+                                found_files.append({
+                                    'url': direct_url,
+                                    'filename': filename,
+                                    'size': await self.get_file_size(direct_url),
+                                    'metadata': {}
+                                })
+                            except Exception as e:
+                                logger.error(f"Error processing Google Drive link: {e}")
+
+            # Make list unique based on URLs
+            seen_urls = set()
+            unique_files = []
+            for f in found_files:
+                if f['url'] not in seen_urls:
+                    seen_urls.add(f['url'])
+                    unique_files.append(f)
+
+            return unique_files
+
+        except Exception as e:
+            logger.error(f"Error extracting files from {url}: {e}")
+            return []
+
+    async def download_file(self, file_info, save_dir, referer):
+        file_url = file_info['url']
+        fname = file_info['filename']
+        path = os.path.join(save_dir, fname)
+
+        # Handle duplicate filenames
+        base, ext = os.path.splitext(fname)
+        counter = 1
+        while os.path.exists(path):
+            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
+            counter += 1
+
+        os.makedirs(save_dir, exist_ok=True)
+
+        try:
+            # Special handling for Google Drive
+            if 'drive.google.com' in file_url:
+                import gdown
+                try:
+                    st.write(f"Downloading from Google Drive: {fname}")
+                    output = gdown.download(file_url, path, quiet=False)
+                    if output:
+                        return path
+                    return None
+                except Exception as e:
+                    logger.error(f"Google Drive download error: {e}")
+                    return None
+
+            # Handle normal downloads
+            async with self.context.new_page() as page:
+                st.write(f"Downloading: {fname}")
+
+                headers = {
+                    'Accept': '*/*',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Referer': referer
+                }
+
+                response = await page.request.get(file_url, headers=headers, timeout=30000)
+
+                if response.status == 200:
+                    content = await response.body()
+                    with open(path, 'wb') as f:
+                        f.write(content)
+                    return path
+                else:
+                    logger.error(f"Download failed with status {response.status}: {file_url}")
+                    return None
+
+        except Exception as e:
+            logger.error(f"Error downloading {file_url}: {e}")
+            return None
+
+    async def search_bing(self):
+        if not self.query:
+            return [], []
+
+        search_query = self.query
+        if "filetype:pdf" not in search_query.lower():
+            search_query += " filetype:pdf"
+
+        search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
+
+        try:
+            await self.page.goto(search_url, timeout=30000)
+            await self.page.wait_for_selector('li.b_algo', timeout=30000)
+            await human_like_scroll(self.page)
+
+            results = []
+            elements = await self.page.query_selector_all('li.b_algo')
+
+            for element in elements:
+                link = await element.query_selector('h2 a')
+                if link:
+                    url = await link.get_attribute('href')
+                    if url:
+                        results.append(url)
+
+            return results[:self.num_results]
+
+        except Exception as e:
+            logger.error(f"Bing search error: {e}")
+            return []
+
+    async def get_sublinks(self, url, limit=100):
+        try:
+            await self.page.goto(url, timeout=30000)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            links = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+                if href.startswith('http'):
+                    links.add(href)
+                elif href.startswith('/'):
+                    links.add(f"{base_url}{href}")
+
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks: {e}")
+            return []
+
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
+        if not custom_ext_list:
+            custom_ext_list = []
+
+        progress_text = st.empty()
+        progress_bar = st.progress(0)
+
+        try:
+            # Search main page
+            progress_text.text("Analyzing main page...")
+            main_files = await self.extract_downloadable_files(url, custom_ext_list)
+
+            # Get and search sublinks
+            progress_text.text("Getting sublinks...")
+            sublinks = await self.get_sublinks(url, sublink_limit)
+
+            if not sublinks:
+                progress_bar.progress(1.0)
+                return main_files
+
+            # Process sublinks
+            all_files = main_files
+            total_links = len(sublinks)
+
+            for i, sublink in enumerate(sublinks, 1):
+                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
+                progress_bar.progress(i/total_links)
+
+                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                all_files.extend(sub_files)
+
+            # Make results unique
+            seen_urls = set()
+            unique_files = []
+            for f in all_files:
+                if f['url'] not in seen_urls:
+                    seen_urls.add(f['url'])
+                    unique_files.append(f)
+
+            progress_text.text(f"Found {len(unique_files)} unique files")
+            progress_bar.progress(1.0)
+
+            return unique_files
+
+        except Exception as e:
+            logger.error(f"Deep search error: {e}")
+            return []
 
 # ---------- Main Streamlit UI Implementation -------------
 def main():
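
For reference, the `DownloadManager` added in this commit manages its Playwright lifecycle through `__aenter__`/`__aexit__`, so callers are expected to drive it as an async context manager. The sketch below is a minimal usage illustration only: the `asyncio.run` wrapper, the example URL, and the `.csv` extension are assumptions, not part of this commit, and `deep_search` also writes Streamlit progress widgets, so in the app it runs from within the Streamlit script rather than standalone.

```python
import asyncio

async def run_example():
    # DownloadManager starts Playwright in __aenter__ and tears it down in __aexit__,
    # so it must be used inside an "async with" block.
    async with DownloadManager(use_proxy=False, num_results=5) as dm:
        files = await dm.deep_search(
            "https://example.com/reports",   # hypothetical target URL (assumption)
            custom_ext_list=[".csv"],        # extra extensions beyond the built-in defaults
            sublink_limit=50,
        )
        # deep_search returns de-duplicated dicts with 'url', 'filename', 'size', 'metadata'
        for f in files:
            print(f["filename"], f["size"], f["url"])

asyncio.run(run_example())
```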