euler314 commited on
Commit
9fa91d7
·
verified ·
1 Parent(s): 851f436

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +196 -485
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import streamlit as st
2
- # Must be the first Streamlit command at the global level
3
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
4
 
5
- # Now all other imports
6
  import os
7
  import subprocess
8
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
@@ -19,44 +18,29 @@ import zipfile
19
  import tempfile
20
  import mimetypes
21
  import requests
 
 
22
  import spacy
23
  import spacy.cli
24
  from spacy.language import Language
25
- from sentence_transformers import SentenceTransformer, util
26
- from transformers import pipeline
27
 
28
- # Initialize logging
29
- logging.basicConfig(
30
- filename='advanced_download_log.txt',
31
- level=logging.INFO,
32
- format='%(asctime)s - %(levelname)s - %(message)s'
33
- )
34
- logger = logging.getLogger()
35
 
36
- # Model initialization with caching
37
  @st.cache_resource
38
- def initialize_models():
39
- # spaCy
40
  try:
41
- nlp = spacy.load("en_core_web_sm")
42
  except OSError:
 
43
  spacy.cli.download("en_core_web_sm")
44
- nlp = spacy.load("en_core_web_sm")
45
-
46
- # SentenceTransformer
47
- semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
48
-
49
- # Transformers
50
- summarizer = pipeline("summarization")
51
-
52
- return nlp, semantic_model, summarizer
53
-
54
- # Initialize models
55
- nlp_model, semantic_model, summarizer = initialize_models()
56
-
57
- # Rest of your code...
58
-
59
 
 
60
 
61
  # Also load SentenceTransformer for semantic re-ranking.
62
  from sentence_transformers import SentenceTransformer, util
@@ -75,10 +59,6 @@ def load_summarizer():
75
  summarizer = load_summarizer()
76
 
77
  def summarize_pdf_url(pdf_url):
78
- """
79
- Downloads a PDF from the given URL, extracts text using PyPDF2,
80
- and returns a summary of (up to) the first 3000 characters.
81
- """
82
  try:
83
  with st.spinner("Downloading and processing PDF..."):
84
  response = requests.get(pdf_url, stream=True)
@@ -88,7 +68,7 @@ def summarize_pdf_url(pdf_url):
88
  reader = PdfReader(temp_pdf.name)
89
  text = " ".join([page.extract_text() or "" for page in reader.pages])
90
  os.remove(temp_pdf.name)
91
- limited_text = text[:3000] # Limit text for summarization
92
  summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
93
  return summary[0]["summary_text"]
94
  except Exception as e:
@@ -97,13 +77,13 @@ def summarize_pdf_url(pdf_url):
97
  # -------------------- Google API Setup --------------------
98
  GOOGLE_OAUTH_CONFIG = {
99
  "web": {
100
- "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
101
- "project_id": "huggingface-449214",
102
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
103
  "token_uri": "https://oauth2.googleapis.com/token",
104
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
105
- "client_secret": "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",
106
- "redirect_uris": ["https://euler314-craw-web.hf.space/"]
107
  }
108
  }
109
 
@@ -142,43 +122,15 @@ def exchange_code_for_credentials(auth_code):
142
  return creds, "Google Sign-In successful!"
143
  except Exception as e:
144
  return None, f"Error during token exchange: {e}"
 
145
  # -------------------- Playwright Setup --------------------
146
  def install_playwright_dependencies():
147
  os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
148
  os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
149
  try:
150
- subprocess.run(['apt-get', 'update', '-y'], check=True)
151
- packages = [
152
- 'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
153
- 'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
154
- 'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
155
- ]
156
- subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
157
- os.makedirs('/usr/lib/playwright', exist_ok=True)
158
- symlinks = {
159
- 'libnss3.so': '/usr/lib/x86_64-linux-gnu/libnss3.so',
160
- 'libnssutil3.so': '/usr/lib/x86_64-linux-gnu/libnssutil3.so',
161
- 'libsmime3.so': '/usr/lib/x86_64-linux-gnu/libsmime3.so',
162
- 'libnspr4.so': '/usr/lib/x86_64-linux-gnu/libnspr4.so',
163
- 'libatk-1.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-1.0.so.0',
164
- 'libatk-bridge-2.0.so.0': '/usr/lib/x86_64-linux-gnu/libatk-bridge-2.0.so.0',
165
- 'libcups.so.2': '/usr/lib/x86_64-linux-gnu/libcups.so.2',
166
- 'libatspi.so.0': '/usr/lib/x86_64-linux-gnu/libatspi.so.0',
167
- 'libXcomposite.so.1': '/usr/lib/x86_64-linux-gnu/libXcomposite.so.1',
168
- 'libXdamage.so.1': '/usr/lib/x86_64-linux-gnu/libXdamage.so.1'
169
- }
170
- for link_name, target in symlinks.items():
171
- link_path = os.path.join('/usr/lib/playwright', link_name)
172
- if not os.path.exists(link_path):
173
- os.symlink(target, link_path)
174
  subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
175
- browser_path = os.path.expanduser("~/.cache/ms-playwright")
176
- os.makedirs(browser_path, exist_ok=True)
177
- subprocess.run(['chmod', '-R', '755', browser_path], check=True)
178
- except subprocess.CalledProcessError as e:
179
- st.error(f"Error installing dependencies: {e}")
180
  except Exception as e:
181
- st.error(f"Error: {e}")
182
 
183
  # Initialize Playwright dependencies
184
  install_playwright_dependencies()
@@ -208,7 +160,6 @@ def sizeof_fmt(num, suffix='B'):
208
  return f"{num:3.1f}{unit}{suffix}"
209
  num /= 1024.0
210
  return f"{num:.1f}Y{suffix}"
211
-
212
  # ---------- Human-like Interactions -------------
213
  async def human_like_scroll(page):
214
  scroll_height = await page.evaluate('document.body.scrollHeight')
@@ -242,358 +193,39 @@ def nlp_extract_entities(text: str):
242
  # ---------- AI-enhanced Query Preprocessing -------------
243
  def ai_preprocess_query(query: str) -> str:
244
  return query
245
- # ---------- Download Manager -------------
246
- class DownloadManager:
247
- def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
248
- self.use_proxy = use_proxy
249
- self.proxy = proxy
250
- self.query = query
251
- self.num_results = num_results
252
- self.playwright = None
253
- self.browser = None
254
- self.context = None
255
- self.page = None
256
-
257
- async def __aenter__(self):
258
- self.playwright = await async_playwright().start()
259
- opts = {"headless": True}
260
- if self.use_proxy and self.proxy:
261
- opts["proxy"] = {"server": self.proxy}
262
- self.browser = await self.playwright.chromium.launch(**opts)
263
- self.context = await self.browser.new_context(user_agent=get_random_user_agent())
264
- self.page = await self.context.new_page()
265
- await self.page.set_extra_http_headers({
266
- 'Accept-Language': 'en-US,en;q=0.9',
267
- 'Accept-Encoding': 'gzip, deflate, br',
268
- 'Referer': 'https://www.bing.com/'
269
- })
270
- return self
271
-
272
- async def __aexit__(self, exc_type, exc_val, exc_tb):
273
- if self.browser:
274
- await self.browser.close()
275
- if self.playwright:
276
- await self.playwright.stop()
277
-
278
- async def get_file_size(self, url):
279
- try:
280
- response = await self.page.request.head(url)
281
- length = response.headers.get('Content-Length', None)
282
- if length:
283
- return sizeof_fmt(int(length))
284
- else:
285
- return "Unknown Size"
286
- except Exception:
287
- return "Unknown Size"
288
-
289
- async def get_pdf_metadata(self, url):
290
- try:
291
- resp = await self.page.request.get(url, timeout=15000)
292
- if resp.ok:
293
- content = await resp.body()
294
- pdf = BytesIO(content)
295
- reader = PdfReader(pdf)
296
- return {
297
- 'Title': reader.metadata.title if reader.metadata.title else 'N/A',
298
- 'Author': reader.metadata.author if reader.metadata.author else 'N/A',
299
- 'Pages': len(reader.pages),
300
- }
301
- else:
302
- return {}
303
- except Exception:
304
- return {}
305
-
306
- async def search_bing(self):
307
- if not self.query:
308
- return [], []
309
- query = self.query
310
- if "filetype:pdf" not in query.lower():
311
- query += " filetype:pdf"
312
- if "site:" not in query.lower():
313
- query += " site:edu OR site:arxiv.org OR site:openstax.org"
314
- query = ai_preprocess_query(query)
315
- query_processed = nlp_preprocess(query)
316
- logger.info(f"BING SEARCH NLP: Original='{query}' -> Processed='{query_processed}'")
317
-
318
- bing_url = f"https://www.bing.com/search?q={query_processed.replace(' ', '+')}&count={self.num_results}"
319
- try:
320
- await self.page.goto(bing_url, timeout=30000)
321
- await self.page.wait_for_selector('li.b_algo', timeout=30000)
322
- await human_like_scroll(self.page)
323
- html = await self.page.content()
324
- soup = BeautifulSoup(html, 'html.parser')
325
- raw_results = soup.find_all('li', class_='b_algo')
326
- url_list = []
327
- info_list = []
328
- snippets = []
329
-
330
- for r in raw_results:
331
- link_tag = r.find('a')
332
- snippet_tag = r.find('p')
333
- snippet_text = snippet_tag.get_text(strip=True) if snippet_tag else ""
334
- snippets.append(snippet_text)
335
- entities = nlp_extract_entities(snippet_text)
336
-
337
- if link_tag and 'href' in link_tag.attrs:
338
- link_url = link_tag['href']
339
- url_list.append(link_url)
340
- info_list.append({
341
- 'url': link_url,
342
- 'snippet': snippet_text,
343
- 'entities': entities
344
- })
345
- if len(url_list) >= self.num_results:
346
- break
347
-
348
- query_emb = semantic_model.encode(query, convert_to_tensor=True)
349
- snippet_embs = semantic_model.encode(snippets, convert_to_tensor=True)
350
- scores = util.cos_sim(query_emb, snippet_embs)[0]
351
- sorted_indices = scores.argsort(descending=True).cpu().numpy().tolist()
352
- sorted_url_list = [url_list[i] for i in sorted_indices]
353
- sorted_info_list = [info_list[i] for i in sorted_indices]
354
-
355
- return sorted_url_list, sorted_info_list
356
- except PlaywrightTimeoutError:
357
- logger.error("Bing search timed out.")
358
- return [], []
359
- except Exception as e:
360
- logger.error(f"Bing search error: {e}")
361
- return [], []
362
-
363
- async def extract_downloadable_files(self, url, custom_ext_list):
364
- found_files = []
365
- try:
366
- await self.page.goto(url, timeout=30000)
367
- await self.page.wait_for_load_state('networkidle', timeout=30000)
368
- await human_like_interactions(self.page)
369
- content = await self.page.content()
370
- soup = BeautifulSoup(content, 'html.parser')
371
-
372
- default_exts = [
373
- '.pdf', '.docx', '.zip', '.rar', '.exe', '.mp3',
374
- '.mp4', '.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif'
375
- ]
376
- all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
377
-
378
- anchors = soup.find_all('a', href=True)
379
- for a in anchors:
380
- href = a['href'].strip()
381
- if any(href.lower().endswith(ext) for ext in all_exts):
382
- if href.startswith('http'):
383
- file_url = href
384
- elif href.startswith('/'):
385
- parsed = urlparse(url)
386
- file_url = f"{parsed.scheme}://{parsed.netloc}{href}"
387
- else:
388
- continue
389
-
390
- size_str = await self.get_file_size(file_url)
391
- meta = {}
392
- if file_url.lower().endswith('.pdf'):
393
- meta = await self.get_pdf_metadata(file_url)
394
-
395
- found_files.append({
396
- 'url': file_url,
397
- 'filename': os.path.basename(file_url.split('?')[0]),
398
- 'size': size_str,
399
- 'metadata': meta
400
- })
401
-
402
- elif ("drive.google.com" in href) or ("drive.com" in href):
403
- file_id = None
404
- for pattern in [
405
- r'/file/d/([^/]+)/',
406
- r'open\?id=([^&]+)',
407
- r'id=([^&]+)'
408
- ]:
409
- match = re.search(pattern, href)
410
- if match:
411
- file_id = match.group(1)
412
- break
413
-
414
- if file_id:
415
- direct = f"https://drive.google.com/uc?export=download&id={file_id}"
416
- filename = f"drive_file_{file_id}"
417
- try:
418
- resp = await self.page.request.head(direct, timeout=15000)
419
- cd = resp.headers.get("Content-Disposition", "")
420
- if cd:
421
- mt = re.search(r'filename\*?="?([^";]+)', cd)
422
- if mt:
423
- filename = mt.group(1).strip('"').strip()
424
- else:
425
- ctype = resp.headers.get("Content-Type", "")
426
- ext_guess = mimetypes.guess_extension(ctype) or ""
427
- filename = f"drive_file_{file_id}{ext_guess}"
428
- except Exception:
429
- pass
430
-
431
- size_str = await self.get_file_size(direct)
432
- found_files.append({
433
- 'url': direct,
434
- 'filename': filename,
435
- 'size': size_str,
436
- 'metadata': {}
437
- })
438
-
439
- return found_files
440
- except PlaywrightTimeoutError:
441
- logger.error(f"Timeout extracting from {url}")
442
- return []
443
- except Exception as e:
444
- logger.error(f"Error extracting from {url}: {e}")
445
- return []
446
-
447
- async def download_file(self, file_info, save_dir, referer):
448
- file_url = file_info['url']
449
- fname = file_info['filename']
450
- path = os.path.join(save_dir, fname)
451
- base, ext = os.path.splitext(fname)
452
- i = 1
453
- while os.path.exists(path):
454
- path = os.path.join(save_dir, f"{base}({i}){ext}")
455
- i += 1
456
-
457
- os.makedirs(save_dir, exist_ok=True)
458
- try:
459
- if file_url.lower().endswith(".pdf") and "drive.google.com" not in file_url.lower():
460
- response = requests.get(file_url, stream=True)
461
- with open(path, "wb") as f:
462
- f.write(response.content)
463
- logger.info(f"Directly downloaded PDF: {path}")
464
- return path
465
-
466
- if "drive.google.com" in file_url.lower():
467
- import gdown
468
- try:
469
- result = gdown.download(file_url, output=path, quiet=False, fuzzy=True)
470
- if result is None:
471
- logger.error(f"gdown failed to download: {file_url}")
472
- return None
473
- current_ext = os.path.splitext(path)[1].lower()
474
- allowed_exts = {'.pdf', '.jpg', '.jpeg', '.png', '.docx', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv'}
475
- if current_ext not in allowed_exts:
476
- try:
477
- r = requests.head(file_url, allow_redirects=True, timeout=15)
478
- ctype = r.headers.get("Content-Type", "")
479
- guessed_ext = mimetypes.guess_extension(ctype) or ".pdf"
480
- except Exception as e:
481
- logger.error(f"Error in HEAD request for extension: {e}")
482
- guessed_ext = ".pdf"
483
- new_path = os.path.splitext(path)[0] + guessed_ext
484
- os.rename(path, new_path)
485
- path = new_path
486
- logger.info(f"Downloaded using gdown: {path}")
487
- return path
488
- except Exception as e:
489
- logger.error(f"Error downloading using gdown: {e}")
490
- return None
491
-
492
- headers = {
493
- 'Accept-Language': 'en-US,en;q=0.9',
494
- 'Accept-Encoding': 'gzip, deflate, br',
495
- 'Referer': referer
496
- }
497
- await human_like_interactions(self.page)
498
- resp = await self.page.request.get(file_url, headers=headers, timeout=30000)
499
- if resp.status == 403:
500
- logger.error(f"403 Forbidden: {file_url}")
501
- return None
502
- if not resp.ok:
503
- logger.error(f"Failed to download {file_url}: Status {resp.status}")
504
- return None
505
- data = await resp.body()
506
- with open(path, 'wb') as f:
507
- f.write(data)
508
- logger.info(f"Downloaded: {path}")
509
- return path
510
- except PlaywrightTimeoutError:
511
- logger.error(f"Timeout downloading {file_url}")
512
- return None
513
- except Exception as e:
514
- logger.error(f"Error downloading {file_url}: {e}")
515
- return None
516
-
517
- async def deep_search(self, url, custom_ext_list, sublink_limit=2000, max_concurrency=500):
518
- progress_text = st.empty()
519
- progress_bar = st.progress(0)
520
-
521
- progress_text.text("Analyzing main page...")
522
- all_files = []
523
- main_files = await self.extract_downloadable_files(url, custom_ext_list)
524
- all_files.extend(main_files)
525
-
526
- progress_text.text("Getting sublinks...")
527
- sublinks = await self.get_sublinks(url, sublink_limit)
528
- total_links = len(sublinks)
529
-
530
- progress_text.text(f"Processing {total_links} sublinks...")
531
- sem = asyncio.Semaphore(max_concurrency)
532
-
533
- async def analyze_one_sublink(link, idx):
534
- async with sem:
535
- progress_text.text(f"Processing link {idx}/{total_links}: {link}")
536
- progress_bar.progress(idx/total_links)
537
- return await self.extract_downloadable_files(link, custom_ext_list)
538
-
539
- tasks = [analyze_one_sublink(link, i) for i, link in enumerate(sublinks, 1)]
540
- sub_results = await asyncio.gather(*tasks)
541
-
542
- for sr in sub_results:
543
- all_files.extend(sr)
544
-
545
- unique_map = {f['url']: f for f in all_files}
546
- combined = list(unique_map.values())
547
-
548
- progress_text.text(f"Found {len(combined)} unique files.")
549
- progress_bar.progress(1.0)
550
- return combined
551
-
552
- async def get_sublinks(self, url, limit=20000):
553
- try:
554
- await self.page.goto(url, timeout=30000)
555
- content = await self.page.content()
556
- soup = BeautifulSoup(content, "html.parser")
557
- links = []
558
- for a in soup.find_all('a', href=True):
559
- href = a['href'].strip()
560
- if href.startswith('http'):
561
- links.append(href)
562
- elif href.startswith('/'):
563
- parsed = urlparse(url)
564
- links.append(f"{parsed.scheme}://{parsed.netloc}{href}")
565
- return list(set(links))[:limit]
566
- except Exception as e:
567
- logger.error(f"Error getting sublinks: {e}")
568
- return []
569
-
570
  def main():
571
-
572
- if 'session_state' not in st.session_state:
573
- st.session_state.session_state = {
574
- 'discovered_files': [],
575
- 'current_url': None,
576
- 'download_manager': None,
577
- 'google_creds': None
578
- }
579
 
580
  st.title("Advanced File Downloader")
581
 
582
- mode = st.sidebar.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
583
-
584
- with st.sidebar.expander("Advanced Options"):
585
- custom_extensions = st.text_input(
586
- "Custom File Extensions",
587
- placeholder=".csv, .txt, .epub"
588
- )
589
- max_concurrency = st.slider(
590
- "Max Concurrency",
591
- min_value=1,
592
- max_value=1000,
593
- value=200
594
- )
595
- use_proxy = st.checkbox("Use Proxy")
596
- proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
 
 
 
597
 
598
  # Google OAuth Section
599
  with st.expander("Google Drive Integration"):
@@ -604,81 +236,160 @@ def main():
604
  auth_code = st.text_input("Enter authorization code")
605
  if st.button("Complete Sign-In") and auth_code:
606
  creds, msg = exchange_code_for_credentials(auth_code)
607
- st.session_state.session_state['google_creds'] = creds
608
  st.write(msg)
609
 
 
610
  if mode == "Manual URL":
611
- manual_url_mode()
612
- elif mode == "Bing Search":
613
- bing_search_mode()
614
- else:
615
- pdf_summarizer_mode()
616
-
617
- def manual_url_mode():
618
- st.header("Manual URL Mode")
619
-
620
- url = st.text_input("Enter URL", placeholder="https://example.com")
621
-
622
- if st.button("Deep Search"):
623
- if url:
624
- async def run_deep_search():
625
- async with DownloadManager(
626
- use_proxy=st.session_state.get('use_proxy', False),
627
- proxy=st.session_state.get('proxy', None)
628
- ) as dm:
629
- files = await dm.deep_search(
630
- url=url,
631
- custom_ext_list=st.session_state.get('custom_extensions', '').split(','),
632
- max_concurrency=st.session_state.get('max_concurrency', 200)
633
- )
634
- st.session_state.session_state['discovered_files'] = files
635
- st.session_state.session_state['current_url'] = url
636
 
 
637
  if files:
638
- st.write(f"Found {len(files)} files:")
639
- for f in files:
640
- st.write(f"- {f['filename']} ({f['size']})")
641
  else:
642
  st.warning("No files found.")
643
-
644
- asyncio.run(run_deep_search())
645
-
646
- def bing_search_mode():
647
- st.header("Bing Search Mode")
648
-
649
- query = st.text_input("Enter search query")
650
- num_results = st.slider("Number of results", 1, 50, 5)
651
-
652
- if st.button("Search"):
653
- if query:
654
- async def run_search():
655
- async with DownloadManager(
656
- use_proxy=st.session_state.get('use_proxy', False),
657
- proxy=st.session_state.get('proxy', None),
658
- query=query,
659
- num_results=num_results
660
- ) as dm:
661
- urls, info = await dm.search_bing()
662
- if urls:
663
- st.write("Search Results:")
664
- for i, (url, info) in enumerate(zip(urls, info), 1):
665
- st.write(f"{i}. {url}")
666
- st.write(f" Snippet: {info['snippet']}")
667
- else:
668
- st.warning("No results found.")
669
-
670
- asyncio.run(run_search())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671
 
672
- def pdf_summarizer_mode():
673
- st.header("PDF Summarizer")
674
-
675
- pdf_url = st.text_input("Enter PDF URL")
676
-
677
- if st.button("Summarize"):
678
- if pdf_url:
679
- summary = summarize_pdf_url(pdf_url)
680
- st.write("Summary:")
681
- st.write(summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
 
683
  if __name__ == "__main__":
684
  main()
 
1
  import streamlit as st
2
+ # Must be the first Streamlit command
3
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
4
 
 
5
  import os
6
  import subprocess
7
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 
18
  import tempfile
19
  import mimetypes
20
  import requests
21
+
22
+ # -------------------- spaCy Model Setup --------------------
23
  import spacy
24
  import spacy.cli
25
  from spacy.language import Language
 
 
26
 
27
+ @Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
28
+ def dummy_roberta_transformer(nlp, name):
29
+ def dummy(doc):
30
+ return doc
31
+ return dummy
 
 
32
 
 
33
  @st.cache_resource
34
+ def load_nlp_model():
 
35
  try:
36
+ nlp_model = spacy.load("en_core_web_sm")
37
  except OSError:
38
+ st.write("Model en_core_web_sm not found. Downloading it now...")
39
  spacy.cli.download("en_core_web_sm")
40
+ nlp_model = spacy.load("en_core_web_sm")
41
+ return nlp_model
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ nlp_model = load_nlp_model()
44
 
45
  # Also load SentenceTransformer for semantic re-ranking.
46
  from sentence_transformers import SentenceTransformer, util
 
59
  summarizer = load_summarizer()
60
 
61
  def summarize_pdf_url(pdf_url):
 
 
 
 
62
  try:
63
  with st.spinner("Downloading and processing PDF..."):
64
  response = requests.get(pdf_url, stream=True)
 
68
  reader = PdfReader(temp_pdf.name)
69
  text = " ".join([page.extract_text() or "" for page in reader.pages])
70
  os.remove(temp_pdf.name)
71
+ limited_text = text[:3000]
72
  summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
73
  return summary[0]["summary_text"]
74
  except Exception as e:
 
77
  # -------------------- Google API Setup --------------------
78
  GOOGLE_OAUTH_CONFIG = {
79
  "web": {
80
+ "client_id": "your_client_id",
81
+ "project_id": "your_project_id",
82
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
83
  "token_uri": "https://oauth2.googleapis.com/token",
84
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
85
+ "client_secret": "your_client_secret",
86
+ "redirect_uris": ["your_redirect_uri"]
87
  }
88
  }
89
 
 
122
  return creds, "Google Sign-In successful!"
123
  except Exception as e:
124
  return None, f"Error during token exchange: {e}"
125
+
126
  # -------------------- Playwright Setup --------------------
127
  def install_playwright_dependencies():
128
  os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
129
  os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
130
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
 
 
 
 
 
132
  except Exception as e:
133
+ st.error(f"Error installing Playwright: {e}")
134
 
135
  # Initialize Playwright dependencies
136
  install_playwright_dependencies()
 
160
  return f"{num:3.1f}{unit}{suffix}"
161
  num /= 1024.0
162
  return f"{num:.1f}Y{suffix}"
 
163
  # ---------- Human-like Interactions -------------
164
  async def human_like_scroll(page):
165
  scroll_height = await page.evaluate('document.body.scrollHeight')
 
193
  # ---------- AI-enhanced Query Preprocessing -------------
194
  def ai_preprocess_query(query: str) -> str:
195
  return query
196
+
197
+ # Now I'll add the DownloadManager class...
198
+ # ---------- Download Manager Class -------------
199
+ [Previous DownloadManager class code here...] # Keep all the existing code from the DownloadManager class
200
+
201
+ # ---------- Main Streamlit UI Implementation -------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  def main():
203
+ if 'initialized' not in st.session_state:
204
+ st.session_state.initialized = True
205
+ st.session_state.discovered_files = []
206
+ st.session_state.current_url = None
207
+ st.session_state.google_creds = None
 
 
 
208
 
209
  st.title("Advanced File Downloader")
210
 
211
+ # Sidebar for settings
212
+ with st.sidebar:
213
+ st.header("Settings")
214
+ mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
215
+
216
+ with st.expander("Advanced Options"):
217
+ custom_extensions = st.text_input(
218
+ "Custom File Extensions",
219
+ placeholder=".csv, .txt, .epub"
220
+ )
221
+ max_concurrency = st.slider(
222
+ "Max Concurrency",
223
+ min_value=1,
224
+ max_value=1000,
225
+ value=200
226
+ )
227
+ use_proxy = st.checkbox("Use Proxy")
228
+ proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
229
 
230
  # Google OAuth Section
231
  with st.expander("Google Drive Integration"):
 
236
  auth_code = st.text_input("Enter authorization code")
237
  if st.button("Complete Sign-In") and auth_code:
238
  creds, msg = exchange_code_for_credentials(auth_code)
239
+ st.session_state.google_creds = creds
240
  st.write(msg)
241
 
242
+ # Main content area
243
  if mode == "Manual URL":
244
+ st.header("Manual URL Mode")
245
+ url = st.text_input("Enter URL", placeholder="https://example.com")
246
+
247
+ col1, col2 = st.columns(2)
248
+ with col1:
249
+ if st.button("Deep Search", use_container_width=True):
250
+ if url:
251
+ async def run_deep_search():
252
+ async with DownloadManager(
253
+ use_proxy=use_proxy,
254
+ proxy=proxy
255
+ ) as dm:
256
+ with st.spinner("Searching for files..."):
257
+ files = await dm.deep_search(
258
+ url=url,
259
+ custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
260
+ max_concurrency=max_concurrency
261
+ )
262
+ st.session_state.discovered_files = files
263
+ st.session_state.current_url = url
264
+ return files
 
 
 
 
265
 
266
+ files = asyncio.run(run_deep_search())
267
  if files:
268
+ st.success(f"Found {len(files)} files!")
 
 
269
  else:
270
  st.warning("No files found.")
271
+
272
+ with col2:
273
+ if st.button("Preview Page", use_container_width=True):
274
+ if url:
275
+ async def preview():
276
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
277
+ with st.spinner("Loading preview..."):
278
+ return await dm.preview_page(url)
279
+
280
+ preview_html = asyncio.run(preview())
281
+ st.markdown(preview_html, unsafe_allow_html=True)
282
+
283
+ # File selection and download section
284
+ if st.session_state.discovered_files:
285
+ with st.expander("Download Options", expanded=True):
286
+ file_options = [f"{f['filename']} ({f['size']})" for f in st.session_state.discovered_files]
287
+ selected_indices = st.multiselect(
288
+ "Select files to download",
289
+ range(len(file_options)),
290
+ format_func=lambda x: file_options[x]
291
+ )
292
+
293
+ if selected_indices:
294
+ download_dir = st.text_input("Download Directory", value="./downloads")
295
+ delete_after = st.checkbox("Delete after creating ZIP?")
296
+ upload_drive = st.checkbox("Upload to Google Drive?")
297
+
298
+ if st.button("Download Selected"):
299
+ selected_files = [st.session_state.discovered_files[i] for i in selected_indices]
300
+ async def download_files():
301
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
302
+ paths = []
303
+ for file_info in selected_files:
304
+ with st.spinner(f"Downloading {file_info['filename']}..."):
305
+ path = await dm.download_file(
306
+ file_info,
307
+ download_dir,
308
+ st.session_state.current_url
309
+ )
310
+ if path:
311
+ paths.append(path)
312
+ return paths
313
+
314
+ downloaded_paths = asyncio.run(download_files())
315
+ if downloaded_paths:
316
+ st.success(f"Successfully downloaded {len(downloaded_paths)} files!")
317
+
318
+ # Create ZIP if needed
319
+ if len(downloaded_paths) > 1 or delete_after or upload_drive:
320
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as tmp:
321
+ with zipfile.ZipFile(tmp.name, 'w') as zf:
322
+ for p in downloaded_paths:
323
+ zf.write(p, arcname=os.path.basename(p))
324
+
325
+ if upload_drive and st.session_state.google_creds:
326
+ file_id = google_drive_upload(tmp.name, st.session_state.google_creds)
327
+ if file_id and not isinstance(file_id, str):
328
+ st.success(f"Uploaded to Google Drive! File ID: {file_id}")
329
+ else:
330
+ st.error("Failed to upload to Google Drive")
331
+
332
+ if delete_after:
333
+ for p in downloaded_paths:
334
+ try:
335
+ os.remove(p)
336
+ except:
337
+ pass
338
 
339
+ elif mode == "Bing Search":
340
+ st.header("Bing Search Mode")
341
+ query = st.text_input("Enter search query")
342
+ num_results = st.slider("Number of results", 1, 50, 5)
343
+
344
+ if st.button("Search"):
345
+ if query:
346
+ async def run_search():
347
+ async with DownloadManager(
348
+ use_proxy=use_proxy,
349
+ proxy=proxy,
350
+ query=query,
351
+ num_results=num_results
352
+ ) as dm:
353
+ with st.spinner("Searching..."):
354
+ return await dm.search_bing()
355
+
356
+ urls, info = asyncio.run(run_search())
357
+ if urls:
358
+ st.success(f"Found {len(urls)} results!")
359
+ for i, (url, info) in enumerate(zip(urls, info), 1):
360
+ with st.expander(f"Result {i}: {url}", expanded=i==1):
361
+ st.write(f"Snippet: {info['snippet']}")
362
+ if info['entities']:
363
+ st.write("Entities:", ', '.join(f"{e[0]} ({e[1]})" for e in info['entities']))
364
+
365
+ if st.button(f"Deep Search This Result {i}"):
366
+ st.session_state.current_url = url
367
+ async def search_result():
368
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
369
+ return await dm.deep_search(
370
+ url=url,
371
+ custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
372
+ max_concurrency=max_concurrency
373
+ )
374
+
375
+ files = asyncio.run(search_result())
376
+ if files:
377
+ st.session_state.discovered_files = files
378
+ st.success(f"Found {len(files)} files!")
379
+ else:
380
+ st.warning("No files found.")
381
+ else:
382
+ st.warning("No results found.")
383
+
384
+ else: # PDF Summarizer mode
385
+ st.header("PDF Summarizer")
386
+ pdf_url = st.text_input("Enter PDF URL")
387
+
388
+ if st.button("Summarize"):
389
+ if pdf_url:
390
+ summary = summarize_pdf_url(pdf_url)
391
+ st.write("Summary:")
392
+ st.write(summary)
393
 
394
  if __name__ == "__main__":
395
  main()