euler314 committed on
Commit fd2ce95 · verified · 1 parent: 3672c09

Update app.py

Files changed (1)
  1. app.py +275 -368
app.py CHANGED
@@ -1,77 +1,7 @@
1
  import streamlit as st
2
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
3
 
4
- # Import other required packages
5
- import spacy
6
- import spacy.cli
7
- import os
8
-
9
- @st.cache_resource
10
- def load_models():
11
- try:
12
- # Try to load spaCy model
13
- try:
14
- nlp = spacy.load("en_core_web_sm")
15
- except OSError:
16
- st.info("Downloading spaCy model...")
17
- spacy.cli.download("en_core_web_sm")
18
- nlp = spacy.load("en_core_web_sm")
19
-
20
- # Load SentenceTransformer with offline handling
21
- try:
22
- from sentence_transformers import SentenceTransformer
23
- model_name = 'all-MiniLM-L6-v2'
24
- cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
25
- if os.path.exists(os.path.join(cache_dir, model_name)):
26
- semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
27
- else:
28
- st.warning(f"Downloading SentenceTransformer model {model_name}...")
29
- semantic_model = SentenceTransformer(model_name)
30
- except Exception as e:
31
- st.error(f"Error loading SentenceTransformer: {e}")
32
- st.info("Continuing without semantic search capability...")
33
- semantic_model = None
34
-
35
- # Load Transformers pipeline with offline handling
36
- try:
37
- from transformers import pipeline
38
- model_name = "facebook/bart-large-cnn"
39
- cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
40
- if os.path.exists(os.path.join(cache_dir, model_name)):
41
- summarizer = pipeline("summarization", model=model_name)
42
- else:
43
- st.warning(f"Downloading Transformer model {model_name}...")
44
- summarizer = pipeline("summarization")
45
- except Exception as e:
46
- st.error(f"Error loading Transformers: {e}")
47
- st.info("Continuing without summarization capability...")
48
- summarizer = None
49
-
50
- return nlp, semantic_model, summarizer
51
-
52
- except Exception as e:
53
- st.error(f"Error loading models: {e}")
54
- return None, None, None
55
-
56
- # Initialize models with better error handling
57
- with st.spinner("Loading models..."):
58
- nlp_model, semantic_model, summarizer = load_models()
59
-
60
- if nlp_model is None:
61
- st.error("Failed to load essential NLP model. The application cannot continue.")
62
- st.stop()
63
- else:
64
- # Continue with available features based on which models loaded successfully
65
- if semantic_model is None:
66
- st.warning("Semantic search features will be disabled.")
67
- if summarizer is None:
68
- st.warning("PDF summarization features will be disabled.")
69
-
70
-
71
- # Rest of your imports and code here...
72
-
73
- # Rest of your code...
74
-
75
  import os
76
  import subprocess
77
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
@@ -88,7 +18,28 @@ import zipfile
88
  import tempfile
89
  import mimetypes
90
  import requests
91
- # -------------------- Playwright Setup --------------------
92
  def install_playwright_dependencies():
93
  os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
94
  os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
@@ -126,23 +77,14 @@ def install_playwright_dependencies():
126
  except Exception as e:
127
  print(f"Error: {e}")
128
 
 
129
  install_playwright_dependencies()
130
 
131
- # -------------------- spaCy Model Setup --------------------
132
- import spacy
133
- import spacy.cli
134
- from spacy.language import Language
135
-
136
- @Language.factory("spacy-curated-transformers_RobertaTransformer_v1")
137
- def dummy_roberta_transformer(nlp, name):
138
- def dummy(doc):
139
- return doc
140
- return dummy
141
-
142
  @st.cache_resource
143
  def load_models():
144
  try:
145
- # Load spaCy model
146
  try:
147
  nlp = spacy.load("en_core_web_sm")
148
  except OSError:
@@ -150,18 +92,30 @@ def load_models():
150
  spacy.cli.download("en_core_web_sm")
151
  nlp = spacy.load("en_core_web_sm")
152
 
153
- # Load SentenceTransformer
154
  try:
155
  from sentence_transformers import SentenceTransformer
156
- semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
157
  except Exception as e:
158
  st.error(f"Error loading SentenceTransformer: {e}")
159
  semantic_model = None
160
 
161
- # Load Transformers pipeline with correct import
162
  try:
163
- from transformers import pipeline, AutoModelForSeq2SeqGenerationWithLMHead, AutoTokenizer
164
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
165
  except Exception as e:
166
  st.error(f"Error loading Transformers: {e}")
167
  summarizer = None
@@ -172,55 +126,38 @@ def load_models():
172
  st.error(f"Error loading models: {e}")
173
  return None, None, None
174
 
175
- # Also load SentenceTransformer for semantic re-ranking.
176
- from sentence_transformers import SentenceTransformer, util
177
- @st.cache_resource
178
- def load_semantic_model():
179
- return SentenceTransformer('all-MiniLM-L6-v2')
180
-
181
- semantic_model = load_semantic_model()
182
-
183
- # -------------------- Transformers Summarization Setup --------------------
184
- from transformers import pipeline
185
- @st.cache_resource
186
- def load_summarizer():
187
- return pipeline("summarization")
188
-
189
- summarizer = load_summarizer()
190
 
191
- def summarize_pdf_url(pdf_url):
192
- try:
193
- with st.spinner("Downloading and processing PDF..."):
194
- response = requests.get(pdf_url, stream=True)
195
- temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
196
- with open(temp_pdf.name, "wb") as f:
197
- f.write(response.content)
198
- reader = PdfReader(temp_pdf.name)
199
- text = " ".join([page.extract_text() or "" for page in reader.pages])
200
- os.remove(temp_pdf.name)
201
- limited_text = text[:3000]
202
- summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
203
- return summary[0]["summary_text"]
204
- except Exception as e:
205
- return f"Error summarizing PDF: {e}"
206
 
207
- # -------------------- Google API Setup --------------------
208
- GOOGLE_OAUTH_CONFIG = {
209
- "web": {
210
- "client_id": "your_client_id",
211
- "project_id": "your_project_id",
212
- "auth_uri": "https://accounts.google.com/o/oauth2/auth",
213
- "token_uri": "https://oauth2.googleapis.com/token",
214
- "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
215
- "client_secret": "your_client_secret",
216
- "redirect_uris": ["your_redirect_uri"]
217
- }
218
- }
219
 
220
- import google_auth_oauthlib.flow
221
- import googleapiclient.discovery
222
- import google.auth.transport.requests
223
 
 
224
  def get_google_auth_url():
225
  client_config = GOOGLE_OAUTH_CONFIG["web"]
226
  flow = google_auth_oauthlib.flow.Flow.from_client_config(
@@ -253,76 +190,16 @@ def exchange_code_for_credentials(auth_code):
253
  except Exception as e:
254
  return None, f"Error during token exchange: {e}"
255
 
256
- # -------------------- Playwright Setup --------------------
257
- def install_playwright_dependencies():
258
- os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
259
- os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
260
  try:
261
- subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
262
  except Exception as e:
263
- st.error(f"Error installing Playwright: {e}")
264
-
265
- # Initialize Playwright dependencies
266
- install_playwright_dependencies()
267
-
268
- # -------------------- Logging Setup --------------------
269
- logging.basicConfig(
270
- filename='advanced_download_log.txt',
271
- level=logging.INFO,
272
- format='%(asctime)s - %(levelname)s - %(message)s'
273
- )
274
- logger = logging.getLogger()
275
-
276
- # -------------------- Shared Utils --------------------
277
- USER_AGENTS = [
278
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
279
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
280
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
281
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
282
- ]
283
-
284
- def get_random_user_agent():
285
- return random.choice(USER_AGENTS)
286
-
287
- def sizeof_fmt(num, suffix='B'):
288
- for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
289
- if abs(num) < 1024.0:
290
- return f"{num:3.1f}{unit}{suffix}"
291
- num /= 1024.0
292
- return f"{num:.1f}Y{suffix}"
293
- # ---------- Human-like Interactions -------------
294
- async def human_like_scroll(page):
295
- scroll_height = await page.evaluate('document.body.scrollHeight')
296
- viewport_height = await page.evaluate('window.innerHeight')
297
- current_scroll = 0
298
- while current_scroll < scroll_height:
299
- await page.evaluate(f'window.scrollTo(0, {current_scroll})')
300
- await asyncio.sleep(random.uniform(0.5, 1.5))
301
- current_scroll += viewport_height * random.uniform(0.5, 1.5)
302
- scroll_height = await page.evaluate('document.body.scrollHeight')
303
-
304
- async def human_like_interactions(page):
305
- await page.mouse.move(random.randint(0, 1000), random.randint(0, 1000))
306
- await asyncio.sleep(random.uniform(0.5, 1.5))
307
- await page.mouse.click(random.randint(0, 1000), random.randint(0, 1000))
308
- await asyncio.sleep(random.uniform(0.5, 1.5))
309
- await page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
310
- await asyncio.sleep(random.uniform(0.5, 1.5))
311
-
312
- # ---------- NLP Helpers -------------
313
- def nlp_preprocess(query: str) -> str:
314
- doc = nlp_model(query)
315
- tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
316
- processed = " ".join(tokens)
317
- return processed if processed.strip() else query
318
-
319
- def nlp_extract_entities(text: str):
320
- doc = nlp_model(text)
321
- return [(ent.text, ent.label_) for ent in doc.ents]
322
-
323
- # ---------- AI-enhanced Query Preprocessing -------------
324
- def ai_preprocess_query(query: str) -> str:
325
- return query
326
  class DownloadManager:
327
  def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
328
  self.use_proxy = use_proxy
@@ -336,9 +213,20 @@ class DownloadManager:
336
 
337
  async def __aenter__(self):
338
  self.playwright = await async_playwright().start()
339
- opts = {"headless": True}
340
  if self.use_proxy and self.proxy:
341
  opts["proxy"] = {"server": self.proxy}
 
342
  self.browser = await self.playwright.chromium.launch(**opts)
343
  self.context = await self.browser.new_context(user_agent=get_random_user_agent())
344
  self.page = await self.context.new_page()
@@ -391,10 +279,6 @@ class DownloadManager:
391
  response = await page.goto(url, wait_until='networkidle', timeout=30000)
392
  if response and response.headers.get('location'):
393
  return response.headers['location']
394
- content_type = response.headers.get('content-type', '')
395
- if 'text/html' not in content_type.lower():
396
- return url
397
- content = await page.content()
398
  return page.url
399
  except Exception as e:
400
  logger.error(f"Error extracting real download URL: {e}")
@@ -432,6 +316,23 @@ class DownloadManager:
432
 
433
  for a in soup.find_all('a', href=True):
434
  href = a['href'].strip()
435
  if any(href.lower().endswith(ext) for ext in all_exts):
436
  file_url = href if href.startswith('http') else (
437
  f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
@@ -449,6 +350,7 @@ class DownloadManager:
449
  'metadata': meta
450
  })
451
 
 
452
  elif ("drive.google.com" in href) or ("docs.google.com" in href):
453
  file_id = None
454
  for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
@@ -477,7 +379,15 @@ class DownloadManager:
477
  except Exception as e:
478
  logger.error(f"Error processing Google Drive link: {e}")
479
 
480
- return found_files
481
  except Exception as e:
482
  logger.error(f"Error extracting files from {url}: {e}")
483
  return []
@@ -531,75 +441,27 @@ class DownloadManager:
531
  logger.error(f"Error downloading {file_url}: {e}")
532
  return None
533
 
534
- async def search_bing(self):
535
- if not self.query:
536
- return [], []
537
-
538
- search_query = self.query
539
- if "filetype:pdf" not in search_query.lower():
540
- search_query += " filetype:pdf"
541
-
542
- search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
543
-
544
- try:
545
- await self.page.goto(search_url, timeout=30000)
546
- await self.page.wait_for_selector('li.b_algo', timeout=30000)
547
- await human_like_scroll(self.page)
548
-
549
- results = []
550
- elements = await self.page.query_selector_all('li.b_algo')
551
-
552
- for element in elements:
553
- link = await element.query_selector('h2 a')
554
- if link:
555
- url = await link.get_attribute('href')
556
- if url:
557
- results.append(url)
558
-
559
- return results[:self.num_results]
560
-
561
- except Exception as e:
562
- logger.error(f"Bing search error: {e}")
563
- return []
564
-
565
- async def get_sublinks(self, url, limit=100):
566
- try:
567
- await self.page.goto(url, timeout=30000)
568
- content = await self.page.content()
569
- soup = BeautifulSoup(content, 'html.parser')
570
-
571
- parsed_base = urlparse(url)
572
- base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
573
-
574
- links = set()
575
- for a in soup.find_all('a', href=True):
576
- href = a['href'].strip()
577
- if href.startswith('http'):
578
- links.add(href)
579
- elif href.startswith('/'):
580
- links.add(f"{base_url}{href}")
581
-
582
- return list(links)[:limit]
583
-
584
- except Exception as e:
585
- logger.error(f"Error getting sublinks: {e}")
586
- return []
587
-
588
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
589
  if not custom_ext_list:
590
  custom_ext_list = []
591
 
592
  progress_text = st.empty()
593
  progress_bar = st.progress(0)
 
594
 
595
  try:
596
  # Search main page
597
  progress_text.text("Analyzing main page...")
598
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
 
 
599
 
600
  # Get and search sublinks
601
  progress_text.text("Getting sublinks...")
602
  sublinks = await self.get_sublinks(url, sublink_limit)
603
 
604
  if not sublinks:
605
  progress_bar.progress(1.0)
@@ -607,32 +469,67 @@ class DownloadManager:
607
 
608
  # Process sublinks
609
  all_files = main_files
610
- total_links = len(sublinks)
611
 
612
  for i, sublink in enumerate(sublinks, 1):
 
613
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
614
- progress_bar.progress(i/total_links)
615
 
616
  sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
617
  all_files.extend(sub_files)
618
 
 
 
 
619
  # Make results unique
620
  seen_urls = set()
621
  unique_files = []
 
622
  for f in all_files:
623
  if f['url'] not in seen_urls:
624
  seen_urls.add(f['url'])
625
  unique_files.append(f)
626
 
627
- progress_text.text(f"Found {len(unique_files)} unique files")
 
 
628
  progress_bar.progress(1.0)
629
 
630
  return unique_files
631
 
632
  except Exception as e:
633
  logger.error(f"Deep search error: {e}")
 
634
  return []
635
 
636
  def main():
637
  if 'initialized' not in st.session_state:
638
  st.session_state.initialized = True
@@ -642,6 +539,7 @@ def main():
642
 
643
  st.title("Advanced File Downloader")
644
 
 
645
  with st.sidebar:
646
  st.header("Settings")
647
  mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
@@ -651,9 +549,28 @@ def main():
651
  "Custom File Extensions",
652
  placeholder=".csv, .txt, .epub"
653
  )
654
  use_proxy = st.checkbox("Use Proxy")
655
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
656
 
 
657
  if mode == "Manual URL":
658
  st.header("Manual URL Mode")
659
  url = st.text_input("Enter URL", placeholder="https://example.com")
@@ -662,74 +579,99 @@ def main():
662
  if url:
663
  async def run_deep_search():
664
  async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
665
- with st.spinner("Searching for files..."):
666
- files = await dm.deep_search(
667
- url=url,
668
- custom_ext_list=custom_extensions.split(',') if custom_extensions else []
669
- )
670
- st.session_state.discovered_files = files
671
- st.session_state.current_url = url
672
- return files
673
 
674
  files = asyncio.run(run_deep_search())
675
  if files:
676
  st.success(f"Found {len(files)} files!")
677
 
678
- with st.expander("Found Files", expanded=True):
679
- for i, file in enumerate(files):
680
- col1, col2 = st.columns([3, 1])
681
- with col1:
682
- st.write(f"{i+1}. {file['filename']}")
683
- with col2:
684
- st.write(f"Size: {file['size']}")
685
 
686
- # Download section
687
- st.subheader("Download Files")
688
  selected_files = st.multiselect(
689
  "Select files to download",
690
  range(len(files)),
 
691
  format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
692
  )
693
 
694
  if selected_files:
695
- col1, col2 = st.columns([3, 1])
696
  with col1:
697
  download_dir = st.text_input("Download Directory", value="./downloads")
698
  with col2:
699
- if st.button("Download Selected", use_container_width=True):
700
- async def download_files():
701
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
702
- paths = []
703
- progress_text = st.empty()
704
- progress_bar = st.progress(0)
705
 
706
- for i, idx in enumerate(selected_files):
707
- progress = (i + 1) / len(selected_files)
708
- progress_text.text(f"Downloading {files[idx]['filename']}...")
709
- progress_bar.progress(progress)
710
-
711
- path = await dm.download_file(
712
- files[idx],
713
- download_dir,
714
- url
715
- )
716
- if path:
717
- paths.append(path)
718
 
719
- progress_text.empty()
720
- progress_bar.empty()
721
- return paths
722
 
723
- downloaded = asyncio.run(download_files())
724
- if downloaded:
725
- st.success(f"Successfully downloaded {len(downloaded)} files to {download_dir}")
726
- # Create zip file if multiple files were downloaded
727
- if len(downloaded) > 1:
728
- zip_path = os.path.join(download_dir, "downloads.zip")
729
- with zipfile.ZipFile(zip_path, 'w') as zipf:
730
- for file in downloaded:
731
- zipf.write(file, os.path.basename(file))
732
- st.success(f"Created zip file: {zip_path}")
733
  else:
734
  st.warning("No files found.")
735
 
@@ -753,64 +695,18 @@ def main():
753
  st.success(f"Found {len(urls)} results!")
754
  for i, url in enumerate(urls, 1):
755
  with st.expander(f"Result {i}: {url}", expanded=i==1):
756
- if st.button(f"Deep Search This Result {i}"):
757
  files = await dm.deep_search(
758
  url=url,
759
- custom_ext_list=custom_extensions.split(',') if custom_extensions else []
 
760
  )
 
761
  if files:
762
  st.session_state.discovered_files = files
763
  st.session_state.current_url = url
764
  st.success(f"Found {len(files)} files!")
765
-
766
- with st.expander("Found Files", expanded=True):
767
- for j, file in enumerate(files):
768
- col1, col2 = st.columns([3, 1])
769
- with col1:
770
- st.write(f"{j+1}. {file['filename']}")
771
- with col2:
772
- st.write(f"Size: {file['size']}")
773
-
774
- selected_files = st.multiselect(
775
- "Select files to download",
776
- range(len(files)),
777
- format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
778
- )
779
-
780
- if selected_files:
781
- col1, col2 = st.columns([3, 1])
782
- with col1:
783
- download_dir = st.text_input("Download Directory", value="./downloads")
784
- with col2:
785
- if st.button("Download Selected Files"):
786
- progress_text = st.empty()
787
- progress_bar = st.progress(0)
788
-
789
- paths = []
790
- for k, idx in enumerate(selected_files):
791
- progress = (k + 1) / len(selected_files)
792
- progress_text.text(f"Downloading {files[idx]['filename']}...")
793
- progress_bar.progress(progress)
794
-
795
- path = await dm.download_file(
796
- files[idx],
797
- download_dir,
798
- url
799
- )
800
- if path:
801
- paths.append(path)
802
-
803
- progress_text.empty()
804
- progress_bar.empty()
805
-
806
- if paths:
807
- st.success(f"Successfully downloaded {len(paths)} files to {download_dir}")
808
- if len(paths) > 1:
809
- zip_path = os.path.join(download_dir, "downloads.zip")
810
- with zipfile.ZipFile(zip_path, 'w') as zipf:
811
- for file in paths:
812
- zipf.write(file, os.path.basename(file))
813
- st.success(f"Created zip file: {zip_path}")
814
  else:
815
  st.warning("No files found on this page.")
816
  else:
@@ -828,9 +724,20 @@ def main():
828
  if st.button("Summarize"):
829
  if pdf_url:
830
  with st.spinner("Generating summary..."):
831
- summary = summarize_pdf_url(pdf_url)
832
- st.write("Summary:")
833
- st.write(summary)
834
 
835
  if __name__ == "__main__":
836
  try:
 
1
  import streamlit as st
2
  st.set_page_config(page_title="Advanced File Downloader", layout="wide")
3
 
4
+ # Core imports
5
  import os
6
  import subprocess
7
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 
18
  import tempfile
19
  import mimetypes
20
  import requests
21
+ import datetime
22
+ import spacy
23
+ import spacy.cli
24
+ from spacy.language import Language
25
+ import google_auth_oauthlib.flow
26
+ import googleapiclient.discovery
27
+ import google.auth.transport.requests
28
+
29
+ # Google OAuth Configuration
30
+ GOOGLE_OAUTH_CONFIG = {
31
+ "web": {
32
+ "client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
33
+ "project_id": "huggingface-449214",
34
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
35
+ "token_uri": "https://oauth2.googleapis.com/token",
36
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
37
+ "client_secret": "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",
38
+ "redirect_uris": ["https://euler314-craw-web.hf.space/"]
39
+ }
40
+ }
41
+
42
+ # Playwright Setup
43
  def install_playwright_dependencies():
44
  os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
45
  os.environ['LD_LIBRARY_PATH'] = '/usr/lib/playwright:/usr/lib/x86_64-linux-gnu'
 
77
  except Exception as e:
78
  print(f"Error: {e}")
79
 
80
+ # Initialize Playwright
81
  install_playwright_dependencies()
82
 
83
+ # Model Loading
84
  @st.cache_resource
85
  def load_models():
86
  try:
87
+ # Try to load spaCy model
88
  try:
89
  nlp = spacy.load("en_core_web_sm")
90
  except OSError:
 
92
  spacy.cli.download("en_core_web_sm")
93
  nlp = spacy.load("en_core_web_sm")
94
 
95
+ # Load SentenceTransformer with offline handling
96
  try:
97
  from sentence_transformers import SentenceTransformer
98
+ model_name = 'all-MiniLM-L6-v2'
99
+ cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers')
100
+ if os.path.exists(os.path.join(cache_dir, model_name)):
101
+ semantic_model = SentenceTransformer(os.path.join(cache_dir, model_name))
102
+ else:
103
+ st.warning(f"Downloading SentenceTransformer model {model_name}...")
104
+ semantic_model = SentenceTransformer(model_name)
105
  except Exception as e:
106
  st.error(f"Error loading SentenceTransformer: {e}")
107
  semantic_model = None
108
 
109
+ # Load Transformers pipeline with offline handling
110
  try:
111
+ from transformers import pipeline
112
+ model_name = "facebook/bart-large-cnn"
113
+ cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')
114
+ if os.path.exists(os.path.join(cache_dir, model_name)):
115
+ summarizer = pipeline("summarization", model=model_name)
116
+ else:
117
+ st.warning(f"Downloading Transformer model {model_name}...")
118
+ summarizer = pipeline("summarization")
119
  except Exception as e:
120
  st.error(f"Error loading Transformers: {e}")
121
  summarizer = None
 
126
  st.error(f"Error loading models: {e}")
127
  return None, None, None
128
 
129
+ # Initialize models
130
+ with st.spinner("Loading models..."):
131
+ nlp_model, semantic_model, summarizer = load_models()
132
 
133
+ # Utility Functions
134
+ def get_random_user_agent():
135
+ USER_AGENTS = [
136
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
137
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
138
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
139
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
140
+ ]
141
+ return random.choice(USER_AGENTS)
142
 
143
+ def sizeof_fmt(num, suffix='B'):
144
+ for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
145
+ if abs(num) < 1024.0:
146
+ return f"{num:3.1f}{unit}{suffix}"
147
+ num /= 1024.0
148
+ return f"{num:.1f}Y{suffix}"
149
 
150
+ def create_zip_file(file_paths, output_dir):
151
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
152
+ zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
153
+
154
+ with zipfile.ZipFile(zip_path, 'w') as zipf:
155
+ for file_path in file_paths:
156
+ zipf.write(file_path, os.path.basename(file_path))
157
+
158
+ return zip_path
159
 
160
+ # Google Drive Functions
161
  def get_google_auth_url():
162
  client_config = GOOGLE_OAUTH_CONFIG["web"]
163
  flow = google_auth_oauthlib.flow.Flow.from_client_config(
 
190
  except Exception as e:
191
  return None, f"Error during token exchange: {e}"
192
 
193
+ def google_drive_upload(zip_path: str, credentials):
 
194
  try:
195
+ drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
196
+ file_metadata = {'name': os.path.basename(zip_path)}
197
+ media = googleapiclient.http.MediaFileUpload(zip_path, resumable=True)
198
+ created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
199
+ return created.get("id", "")
200
  except Exception as e:
201
+ return f"Error uploading to Drive: {str(e)}"
202
+ # DownloadManager Class
203
  class DownloadManager:
204
  def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
205
  self.use_proxy = use_proxy
 
213
 
214
  async def __aenter__(self):
215
  self.playwright = await async_playwright().start()
216
+ opts = {
217
+ "headless": True,
218
+ "args": [
219
+ '--no-sandbox',
220
+ '--disable-setuid-sandbox',
221
+ '--disable-dev-shm-usage',
222
+ '--disable-gpu',
223
+ '--no-zygote',
224
+ '--single-process'
225
+ ]
226
+ }
227
  if self.use_proxy and self.proxy:
228
  opts["proxy"] = {"server": self.proxy}
229
+
230
  self.browser = await self.playwright.chromium.launch(**opts)
231
  self.context = await self.browser.new_context(user_agent=get_random_user_agent())
232
  self.page = await self.context.new_page()
 
279
  response = await page.goto(url, wait_until='networkidle', timeout=30000)
280
  if response and response.headers.get('location'):
281
  return response.headers['location']
282
  return page.url
283
  except Exception as e:
284
  logger.error(f"Error extracting real download URL: {e}")
 
316
 
317
  for a in soup.find_all('a', href=True):
318
  href = a['href'].strip()
319
+
320
+ # Handle PHP scripts and redirects
321
+ if '.php' in href.lower() or 'download' in href.lower():
322
+ full_url = href if href.startswith('http') else (
323
+ f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
324
+ )
325
+ real_url = await self.extract_real_download_url(full_url)
326
+ if real_url and real_url != full_url:
327
+ found_files.append({
328
+ 'url': real_url,
329
+ 'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
330
+ 'size': await self.get_file_size(real_url),
331
+ 'metadata': {}
332
+ })
333
+ continue
334
+
335
+ # Handle direct file links
336
  if any(href.lower().endswith(ext) for ext in all_exts):
337
  file_url = href if href.startswith('http') else (
338
  f"{base_url}{href}" if href.startswith('/') else f"{base_url}/{href}"
 
350
  'metadata': meta
351
  })
352
 
353
+ # Handle Google Drive links
354
  elif ("drive.google.com" in href) or ("docs.google.com" in href):
355
  file_id = None
356
  for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
 
379
  except Exception as e:
380
  logger.error(f"Error processing Google Drive link: {e}")
381
 
382
+ # Make results unique based on URLs
383
+ seen_urls = set()
384
+ unique_files = []
385
+ for f in found_files:
386
+ if f['url'] not in seen_urls:
387
+ seen_urls.add(f['url'])
388
+ unique_files.append(f)
389
+
390
+ return unique_files
391
  except Exception as e:
392
  logger.error(f"Error extracting files from {url}: {e}")
393
  return []
 
441
  logger.error(f"Error downloading {file_url}: {e}")
442
  return None
443
 
444
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
445
  if not custom_ext_list:
446
  custom_ext_list = []
447
 
448
  progress_text = st.empty()
449
  progress_bar = st.progress(0)
450
+ file_count_text = st.empty()
451
 
452
  try:
453
  # Search main page
454
  progress_text.text("Analyzing main page...")
455
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
456
+ initial_count = len(main_files)
457
+ file_count_text.text(f"Found {initial_count} files on main page")
458
 
459
  # Get and search sublinks
460
  progress_text.text("Getting sublinks...")
461
  sublinks = await self.get_sublinks(url, sublink_limit)
462
+ total_links = len(sublinks)
463
+
464
+ progress_text.text(f"Found {total_links} sublinks to process")
465
 
466
  if not sublinks:
467
  progress_bar.progress(1.0)
 
469
 
470
  # Process sublinks
471
  all_files = main_files
 
472
 
473
  for i, sublink in enumerate(sublinks, 1):
474
+ progress = i/total_links
475
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
476
+ progress_bar.progress(progress)
477
 
478
  sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
479
  all_files.extend(sub_files)
480
 
481
+ # Update count in real-time
482
+ file_count_text.text(f"Found {len(all_files)} total files")
483
+
484
  # Make results unique
485
  seen_urls = set()
486
  unique_files = []
487
+
488
  for f in all_files:
489
  if f['url'] not in seen_urls:
490
  seen_urls.add(f['url'])
491
  unique_files.append(f)
492
 
493
+ final_count = len(unique_files)
494
+ progress_text.text(f"Deep search complete!")
495
+ file_count_text.text(f"Found {final_count} unique files")
496
  progress_bar.progress(1.0)
497
 
498
  return unique_files
499
 
500
  except Exception as e:
501
  logger.error(f"Deep search error: {e}")
502
+ progress_text.text(f"Error during deep search: {str(e)}")
503
  return []
504
+ finally:
505
+ # Clean up progress indicators after a delay
506
+ await asyncio.sleep(2)
507
+ if not st.session_state.get('keep_progress', False):
508
+ progress_text.empty()
509
+ progress_bar.empty()
510
 
511
+ async def get_sublinks(self, url, limit=100):
512
+ try:
513
+ await self.page.goto(url, timeout=30000)
514
+ content = await self.page.content()
515
+ soup = BeautifulSoup(content, 'html.parser')
516
+
517
+ parsed_base = urlparse(url)
518
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
519
+
520
+ links = set()
521
+ for a in soup.find_all('a', href=True):
522
+ href = a['href'].strip()
523
+ if href.startswith('http'):
524
+ links.add(href)
525
+ elif href.startswith('/'):
526
+ links.add(f"{base_url}{href}")
527
+
528
+ return list(links)[:limit]
529
+
530
+ except Exception as e:
531
+ logger.error(f"Error getting sublinks: {e}")
532
+ return []
533
  def main():
534
  if 'initialized' not in st.session_state:
535
  st.session_state.initialized = True
 
539
 
540
  st.title("Advanced File Downloader")
541
 
542
+ # Sidebar settings
543
  with st.sidebar:
544
  st.header("Settings")
545
  mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"])
 
549
  "Custom File Extensions",
550
  placeholder=".csv, .txt, .epub"
551
  )
552
+ max_sublinks = st.number_input(
553
+ "Maximum Sublinks to Process",
554
+ min_value=1,
555
+ max_value=10000,
556
+ value=100,
557
+ help="Maximum number of sublinks to process from the main page"
558
+ )
559
  use_proxy = st.checkbox("Use Proxy")
560
  proxy = st.text_input("Proxy URL", placeholder="http://proxy:port")
561
 
562
+ # Google Drive Integration
563
+ with st.expander("Google Drive Integration"):
564
+ if st.button("Start Google Sign-In"):
565
+ auth_url = get_google_auth_url()
566
+ st.markdown(f"[Click here to authorize]({auth_url})")
567
+
568
+ auth_code = st.text_input("Enter authorization code")
569
+ if st.button("Complete Sign-In") and auth_code:
570
+ creds, msg = exchange_code_for_credentials(auth_code)
571
+ st.session_state.google_creds = creds
572
+ st.write(msg)
573
+
574
  if mode == "Manual URL":
575
  st.header("Manual URL Mode")
576
  url = st.text_input("Enter URL", placeholder="https://example.com")
 
579
  if url:
580
  async def run_deep_search():
581
  async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
582
+ files = await dm.deep_search(
583
+ url=url,
584
+ custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
585
+ sublink_limit=max_sublinks
586
+ )
587
+ st.session_state.discovered_files = files
588
+ st.session_state.current_url = url
589
+ return files
590
 
591
  files = asyncio.run(run_deep_search())
592
  if files:
593
  st.success(f"Found {len(files)} files!")
594
 
595
+ # Select All/Clear Selection buttons
596
+ col1, col2 = st.columns([1, 4])
597
+ with col1:
598
+ if st.button("Select All"):
599
+ st.session_state.selected_files = list(range(len(files)))
600
+ if st.button("Clear Selection"):
601
+ st.session_state.selected_files = []
602
 
603
+ # File selection
 
604
  selected_files = st.multiselect(
605
  "Select files to download",
606
  range(len(files)),
607
+ default=st.session_state.get('selected_files', []),
608
  format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
609
  )
610
 
611
  if selected_files:
612
+ col1, col2, col3, col4 = st.columns(4)
613
  with col1:
614
  download_dir = st.text_input("Download Directory", value="./downloads")
615
  with col2:
616
+ create_zip = st.checkbox("Create ZIP file", value=True)
617
+ with col3:
618
+ delete_after = st.checkbox("Delete after creating ZIP")
619
+ with col4:
620
+ upload_to_drive = st.checkbox("Upload to Google Drive")
621
+
622
+ if st.button("Download Selected"):
623
+ if not os.path.exists(download_dir):
624
+ os.makedirs(download_dir)
625
+
626
+ async def download_files():
627
+ downloaded_paths = []
628
+ progress_bar = st.progress(0)
629
+ status_text = st.empty()
630
+
631
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
632
+ for i, idx in enumerate(selected_files):
633
+ progress = (i + 1) / len(selected_files)
634
+ file_info = files[idx]
635
 
636
+ status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
637
+ progress_bar.progress(progress)
638
 
639
+ path = await dm.download_file(
640
+ file_info,
641
+ download_dir,
642
+ url
643
+ )
644
+ if path:
645
+ downloaded_paths.append(path)
646
+
647
+ status_text.empty()
648
+ progress_bar.empty()
649
+ return downloaded_paths
650
+
651
+ downloaded = asyncio.run(download_files())
652
 
653
+ if downloaded:
654
+ st.success(f"Successfully downloaded {len(downloaded)} files")
655
+
656
+ if create_zip or upload_to_drive:
657
+ zip_path = create_zip_file(downloaded, download_dir)
658
+ st.success(f"Created ZIP file: {zip_path}")
659
+
660
+ if upload_to_drive and st.session_state.get('google_creds'):
661
+ with st.spinner("Uploading to Google Drive..."):
662
+ drive_id = google_drive_upload(zip_path, st.session_state.google_creds)
663
+ if not isinstance(drive_id, str) or not drive_id.startswith("Error"):
664
+ st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
665
+ else:
666
+ st.error(drive_id)
667
+
668
+ if delete_after:
669
+ for path in downloaded:
670
+ try:
671
+ os.remove(path)
672
+ except Exception as e:
673
+ st.warning(f"Could not delete {path}: {e}")
674
+ st.info("Deleted original files after ZIP creation")
675
  else:
676
  st.warning("No files found.")
677
 
 
695
  st.success(f"Found {len(urls)} results!")
696
  for i, url in enumerate(urls, 1):
697
  with st.expander(f"Result {i}: {url}", expanded=i==1):
698
+ if st.button(f"Deep Search Result {i}"):
699
  files = await dm.deep_search(
700
  url=url,
701
+ custom_ext_list=custom_extensions.split(',') if custom_extensions else [],
702
+ sublink_limit=max_sublinks
703
  )
704
+ # Reuse the same file handling logic as Manual URL mode
705
  if files:
706
  st.session_state.discovered_files = files
707
  st.session_state.current_url = url
708
  st.success(f"Found {len(files)} files!")
709
+ # Add file selection and download UI here (same as Manual URL mode)
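The placeholder comment above defers the selection and download UI for Bing results to the Manual URL branch. As a rough sketch (not part of this commit), the shared widget could be factored into a small helper; render_file_selection is a hypothetical name, and the arguments mirror the Manual URL mode multiselect shown earlier in this diff:

import streamlit as st

def render_file_selection(files):
    # Mirror the Manual URL mode multiselect: label each entry with filename and size,
    # and seed the default from any previous "Select All" state in session_state.
    return st.multiselect(
        "Select files to download",
        range(len(files)),
        default=st.session_state.get('selected_files', []),
        format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})"
    )

Both branches could then call render_file_selection(files) before the download/ZIP options; the commit itself leaves this branch empty.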
710
  else:
711
  st.warning("No files found on this page.")
712
  else:
 
724
  if st.button("Summarize"):
725
  if pdf_url:
726
  with st.spinner("Generating summary..."):
727
+ try:
728
+ response = requests.get(pdf_url, stream=True)
729
+ temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
730
+ with open(temp_pdf.name, "wb") as f:
731
+ f.write(response.content)
732
+ reader = PdfReader(temp_pdf.name)
733
+ text = " ".join([page.extract_text() or "" for page in reader.pages])
734
+ os.remove(temp_pdf.name)
735
+ limited_text = text[:3000]
736
+ summary = summarizer(limited_text, max_length=200, min_length=50, do_sample=False)
737
+ st.write("Summary:")
738
+ st.write(summary[0]['summary_text'])
739
+ except Exception as e:
740
+ st.error(f"Error summarizing PDF: {e}")
741
 
742
  if __name__ == "__main__":
743
  try: