euler314 committed on
Commit 73aa1af · verified · 1 Parent(s): 65b12b7

Update app.py

Files changed (1)
  1. app.py +1023 -158
app.py CHANGED
@@ -7,7 +7,7 @@ import subprocess
7
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
8
  import asyncio
9
  import logging
10
- from urllib.parse import urlparse, urljoin, unquote
11
  import re
12
  from pathlib import Path
13
  from io import BytesIO
@@ -32,13 +32,27 @@ import googleapiclient.discovery
32
  import google.auth.transport.requests
33
  import googleapiclient.http
34
 
35
- # New imports for RAG search
36
  import nltk
37
  from sklearn.feature_extraction.text import TfidfVectorizer
38
  from sklearn.metrics.pairwise import cosine_similarity
39
  import numpy as np
40
  import docx2txt
41
- import PyPDF2
42
 
43
  # -------------------- Logging Setup --------------------
44
  logging.basicConfig(
@@ -96,37 +110,126 @@ PROXY_ROTATION_CONFIG = {
96
  "proxies": [] # Will be populated from the UI if needed
97
  }
98
 
99
- # -------------------- RAG Search Class --------------------
100
- class RAGSearch:
101
  def __init__(self):
102
  self.file_texts = []
103
  self.file_metadata = []
104
- self.vectorizer = TfidfVectorizer(stop_words='english')
105
  self.vectors = None
106
 
107
  def add_file(self, file_data, file_info):
108
- """Add a file to the search index"""
109
- file_ext = os.path.splitext(file_info['filename'])[1]
110
  text = self.extract_text(file_data, file_ext)
 
111
  if text:
 
112
  self.file_texts.append(text)
113
  self.file_metadata.append(file_info)
114
  return True
115
  return False
116
 
117
  def extract_text(self, file_data, file_ext):
118
- """Extract text from different file types"""
119
  try:
120
  if file_ext.lower() == '.pdf':
121
  reader = PyPDF2.PdfReader(BytesIO(file_data))
122
  text = ""
123
  for page in reader.pages:
124
- text += page.extract_text() + "\n"
125
  return text
126
  elif file_ext.lower() in ['.docx', '.doc']:
127
  return docx2txt.process(BytesIO(file_data))
128
- elif file_ext.lower() in ['.txt', '.csv', '.json']:
129
  return file_data.decode('utf-8', errors='ignore')
130
  else:
131
  return ""
132
  except Exception as e:
@@ -134,35 +237,107 @@ class RAGSearch:
134
  return ""
135
 
136
  def build_index(self):
137
- """Build the search index"""
138
  if not self.file_texts:
139
  return False
 
140
  try:
 
141
  self.vectors = self.vectorizer.fit_transform(self.file_texts)
142
  return True
143
  except Exception as e:
144
  logger.error(f"Error building search index: {e}")
145
  return False
146
 
147
- def search(self, query, top_k=5):
148
- """Search the index for relevant files"""
149
  if self.vectors is None:
150
  return []
151
 
152
  try:
153
- query_vector = self.vectorizer.transform([query])
154
- similarities = cosine_similarity(query_vector, self.vectors).flatten()
155
- top_indices = similarities.argsort()[-top_k:][::-1]
156
 
157
  results = []
158
- for i, idx in enumerate(top_indices):
159
- if similarities[idx] > 0:
160
- results.append({
161
- 'file_info': self.file_metadata[idx],
162
- 'score': float(similarities[idx]),
163
- 'rank': i+1
164
- })
165
- return results
166
  except Exception as e:
167
  logger.error(f"Error during search: {e}")
168
  return []
@@ -222,6 +397,90 @@ def detect_captcha(html_content):
222
  html_lower = html_content.lower()
223
  return any(pattern in html_lower for pattern in captcha_patterns)
224
 
225
  # -------------------- Google Drive Functions --------------------
226
  def get_google_auth_url():
227
  client_config = GOOGLE_OAUTH_CONFIG["web"]
@@ -314,6 +573,10 @@ class DownloadManager:
314
  self.request_count = 0
315
  self.captcha_detected = False
316
  self.download_timeout = 300 # 5 minutes timeout for downloads
317
 
318
  async def __aenter__(self):
319
  self.playwright = await async_playwright().start()
@@ -594,13 +857,51 @@ class DownloadManager:
594
  try:
595
  await self.rotate_proxy_if_needed()
596
 
597
- async with self.context.new_page() as page:
598
- response = await page.request.head(url, timeout=15000)
599
- length = response.headers.get('Content-Length', None)
600
- if length:
601
- return sizeof_fmt(int(length))
602
- else:
603
- return "Unknown Size"
604
  except Exception as e:
605
  logger.warning(f"Error getting file size: {e}")
606
  return "Unknown Size"
@@ -627,14 +928,53 @@ class DownloadManager:
627
  return {}
628
 
629
  async def extract_real_download_url(self, url):
 
630
  try:
631
- await self.rotate_proxy_if_needed()
632
-
633
- async with self.context.new_page() as page:
634
- response = await page.goto(url, wait_until='networkidle', timeout=30000)
635
- if response and response.headers.get('location'):
636
- return response.headers['location']
637
- return page.url
638
  except Exception as e:
639
  logger.error(f"Error extracting real download URL: {e}")
640
  return url
@@ -702,13 +1042,17 @@ class DownloadManager:
702
  if any(full_url.lower().endswith(ext) for ext in
703
  ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
704
  links.add(full_url)
705
  except Exception as e:
706
  logger.warning(f"Request-based extraction failed: {e}")
707
 
708
  # Browser-based approach for more thorough extraction or if initial approach was inadequate
709
  try:
710
  # Check if we need to proceed with browser-based extraction
711
- if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
712
  logger.info("Using browser for enhanced link extraction")
713
 
714
  # Rotate proxy if needed
@@ -800,6 +1144,27 @@ class DownloadManager:
800
  ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
801
  links.add(href)
802
 
803
  # Check for ASP.NET specific elements that might contain exam links
804
  grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
805
  for grid in grid_elements:
@@ -928,6 +1293,11 @@ class DownloadManager:
928
  "/resource/", "/material/", "/notes/", "/subjectmaterial/"
929
  ]):
930
  filtered_links.append(link)
931
 
932
  logger.info(f"Found {len(filtered_links)} potential exam document links")
933
  return filtered_links
@@ -955,31 +1325,119 @@ class DownloadManager:
955
  }
956
  }
957
 
958
  // Check for links in data attributes
959
- const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link]');
960
  for (const el of elements) {
961
- for (const attr of ['data-url', 'data-href', 'data-src', 'data-link']) {
962
  const val = el.getAttribute(attr);
963
- if (val && val.match(/^https?:\/\//)) {
964
- links.add(val);
965
  }
966
  }
967
  }
968
 
969
  // Look for URLs in inline event handlers
970
- const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup]');
971
  for (const el of clickableElements) {
972
- for (const attr of ['onclick', 'onmousedown', 'onmouseup']) {
973
  const val = el.getAttribute(attr);
974
  if (val) {
975
  const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
976
  for (let match of urlMatches) {
977
  links.add(match.replace(/["']/g, ''));
978
  }
979
  }
980
  }
981
  }
982
 
983
  return Array.from(links);
984
  }
985
  """)
@@ -1046,14 +1504,116 @@ class DownloadManager:
1046
  for link in shadow_links:
1047
  hidden_links.add(link)
1048
 
 
1049
  return hidden_links
1050
 
1051
  async def extract_downloadable_files(self, url, custom_ext_list):
1052
  found_files = []
1053
  try:
1054
  # Rotate proxy if needed
1055
  await self.rotate_proxy_if_needed()
1056
 
1057
  # Special handling for educational exam sites
1058
  if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
1059
  ["exam", "test", "pastpaper", "eduexp"]):
@@ -1095,7 +1655,8 @@ class DownloadManager:
1095
  'url': real_url,
1096
  'filename': filename,
1097
  'size': size_str,
1098
- 'metadata': meta
 
1099
  })
1100
 
1101
  # If we found exam files with the specialized method, return them
@@ -1156,7 +1717,8 @@ class DownloadManager:
1156
  'url': real_url,
1157
  'filename': filename,
1158
  'size': await self.get_file_size(real_url),
1159
- 'metadata': {}
 
1160
  })
1161
  return found_files
1162
 
@@ -1177,7 +1739,7 @@ class DownloadManager:
1177
  for a in soup.find_all('a', href=True):
1178
  href = a['href'].strip()
1179
 
1180
- if '.php' in href.lower() or 'download' in href.lower():
1181
  full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1182
  real_url = await self.extract_real_download_url(full_url)
1183
  if real_url and real_url != full_url:
@@ -1185,7 +1747,8 @@ class DownloadManager:
1185
  'url': real_url,
1186
  'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
1187
  'size': await self.get_file_size(real_url),
1188
- 'metadata': {}
 
1189
  })
1190
  continue
1191
 
@@ -1199,7 +1762,8 @@ class DownloadManager:
1199
  'url': file_url,
1200
  'filename': os.path.basename(file_url.split('?')[0]),
1201
  'size': size_str,
1202
- 'metadata': meta
 
1203
  })
1204
 
1205
  # Handle Google Drive links
@@ -1229,7 +1793,8 @@ class DownloadManager:
1229
  'view_only': is_view_only,
1230
  'file_type': file_type,
1231
  'file_id': file_id
1232
- }
 
1233
  })
1234
 
1235
  # Also check for files in other elements (iframe, embed, object, etc.)
@@ -1246,7 +1811,8 @@ class DownloadManager:
1246
  'url': file_url,
1247
  'filename': os.path.basename(file_url.split('?')[0]),
1248
  'size': size_str,
1249
- 'metadata': meta
 
1250
  })
1251
 
1252
  # Check for file links in onclick attributes
@@ -1264,7 +1830,8 @@ class DownloadManager:
1264
  'url': url_match,
1265
  'filename': os.path.basename(url_match.split('?')[0]),
1266
  'size': size_str,
1267
- 'metadata': meta
 
1268
  })
1269
 
1270
  # Also check for data-src and data-url attributes (common in lazy-loaded sites)
@@ -1279,7 +1846,8 @@ class DownloadManager:
1279
  'url': file_url,
1280
  'filename': os.path.basename(file_url.split('?')[0]),
1281
  'size': await self.get_file_size(file_url),
1282
- 'metadata': {}
 
1283
  })
1284
  except:
1285
  pass
@@ -1313,7 +1881,8 @@ class DownloadManager:
1313
  'url': json_url,
1314
  'filename': os.path.basename(json_url.split('?')[0]),
1315
  'size': await self.get_file_size(json_url),
1316
- 'metadata': {}
 
1317
  })
1318
  except:
1319
  pass
@@ -1364,7 +1933,8 @@ class DownloadManager:
1364
  'url': href,
1365
  'filename': os.path.basename(href.split('?')[0]),
1366
  'size': await self.get_file_size(href),
1367
- 'metadata': {}
 
1368
  })
1369
 
1370
  # Check for hidden links that might be in JavaScript, iframes, or dynamic content
@@ -1375,7 +1945,8 @@ class DownloadManager:
1375
  'url': link,
1376
  'filename': os.path.basename(link.split('?')[0]),
1377
  'size': await self.get_file_size(link),
1378
- 'metadata': {}
 
1379
  })
1380
 
1381
  # Deduplicate files by URL
@@ -1393,7 +1964,7 @@ class DownloadManager:
1393
  return []
1394
 
1395
  async def download_file(self, file_info, save_dir, referer):
1396
- file_url = file_info['url']
1397
  fname = file_info['filename']
1398
  path = os.path.join(save_dir, fname)
1399
  base, ext = os.path.splitext(fname)
@@ -1403,6 +1974,11 @@ class DownloadManager:
1403
  counter += 1
1404
  os.makedirs(save_dir, exist_ok=True)
1405
 
1406
  try:
1407
  # Special handling for Google Drive files
1408
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
@@ -1414,6 +1990,7 @@ class DownloadManager:
1414
  logger.info(f"Attempting to download view-only file: {file_url}")
1415
  result_path = await self.force_download_viewonly(file_info, path)
1416
  if result_path:
 
1417
  return result_path
1418
 
1419
  # If that failed, try the regular download approach
@@ -1422,13 +1999,60 @@ class DownloadManager:
1422
  # Try regular download methods
1423
  success = await self.download_from_google_drive(file_url, path)
1424
  if success:
 
1425
  return path
1426
 
1427
  # If all methods failed for Google Drive, try one last approach
1428
  logger.warning("All standard methods failed, attempting force download")
1429
  result_path = await self.force_download_viewonly(file_info, path)
1430
  return result_path if result_path else None
1431
 
1432
  # Rotate proxy if needed
1433
  await self.rotate_proxy_if_needed()
1434
 
@@ -1456,6 +2080,7 @@ class DownloadManager:
1456
 
1457
  # Verify file was downloaded correctly
1458
  if os.path.exists(path) and os.path.getsize(path) > 0:
 
1459
  return path
1460
  except Exception as e:
1461
  logger.warning(f"Direct download failed: {e}, trying browser approach")
@@ -1475,7 +2100,9 @@ class DownloadManager:
1475
  content = await response.body()
1476
  with open(path, 'wb') as f:
1477
  f.write(content)
1478
- return path
1479
  else:
1480
  logger.error(f"Download failed with status {response.status}: {file_url}")
1481
 
@@ -1502,6 +2129,7 @@ class DownloadManager:
1502
  await download.save_as(path)
1503
 
1504
  if os.path.exists(path) and os.path.getsize(path) > 0:
 
1505
  return path
1506
  except Exception as e:
1507
  logger.error(f"Browser download manager approach failed: {e}")
@@ -2515,6 +3143,21 @@ class DownloadManager:
2515
  try:
2516
  logger.info(f"Fetching sublinks from: {url}")
2517
 
2518
  # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
2519
  if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
2520
  ["exam", "test", "pastpaper", "eduexp"]):
@@ -2532,8 +3175,12 @@ class DownloadManager:
2532
  await self.rotate_proxy_if_needed()
2533
 
2534
  # Standard sublink extraction for all sites
2535
- await self.page.goto(url, timeout=30000, wait_until='networkidle')
2536
-
2537
  # Get base URL for resolving relative links
2538
  parsed_base = urlparse(url)
2539
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
@@ -2732,8 +3379,46 @@ class DownloadManager:
2732
  if href and not href.startswith('javascript:'):
2733
  links.add(href)
2734
 
2735
  logger.info(f"Found {len(links)} sublinks")
2736
- return list(links)[:limit]
2737
 
2738
  except Exception as e:
2739
  logger.error(f"Error getting sublinks from {url}: {e}")
@@ -2834,6 +3519,9 @@ class DownloadManager:
2834
  file_count_text = st.empty()
2835
 
2836
  try:
2837
  progress_text.text("Analyzing main page...")
2838
  # Special handling for ASP.NET pages
2839
  is_aspnet = False
@@ -2848,6 +3536,25 @@ class DownloadManager:
2848
  except Exception:
2849
  pass
2850
 
2851
  # Extract files from main page
2852
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
2853
  initial_count = len(main_files)
@@ -2873,9 +3580,50 @@ class DownloadManager:
2873
  progress_bar.progress(progress)
2874
 
2875
  try:
2876
- # Use a longer timeout for ASP.NET pages which can be slower
2877
  sub_timeout = timeout * 2 if is_aspnet else timeout
2878
 
2879
  # Extract files from sublink
2880
  sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
2881
  all_files.extend(sub_files)
@@ -2994,21 +3742,22 @@ def main():
2994
  if custom_ext_list != valid_ext_list:
2995
  st.warning("Invalid extensions ignored. Use format like '.csv'.")
2996
 
2997
- @st.cache_resource
2998
- def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
2999
- async def _run():
3000
- async with DownloadManager(
3001
- use_proxy=use_proxy_val,
3002
- proxy=proxy_val,
3003
- use_stealth=use_stealth_val
3004
- ) as dm:
3005
- files = await dm.deep_search(url, ext_list, max_links, timeout_val)
3006
- return files
3007
- return asyncio.run(_run())
3008
 
3009
  with st.spinner("Searching for files..."):
3010
- files = run_deep_search(url, valid_ext_list, max_sublinks,
3011
- sublink_timeout, use_proxy, proxy, use_stealth)
3012
 
3013
  if files:
3014
  st.session_state.discovered_files = files
@@ -3031,7 +3780,7 @@ def main():
3031
  if st.button("Search Files", key="rag_search_btn") and search_query:
3032
  # Initialize RAG search engine
3033
  if not st.session_state.rag_indexed:
3034
- rag_search = RAGSearch()
3035
 
3036
  with st.spinner("Indexing files for search..."):
3037
  # First download files to extract text
@@ -3044,7 +3793,7 @@ def main():
3044
  for i, file_info in enumerate(files):
3045
  # Only process common text-based file formats
3046
  ext = os.path.splitext(file_info['filename'])[1].lower()
3047
- if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json']:
3048
  path = await dm.download_file(file_info, temp_dir, url)
3049
  if path:
3050
  with open(path, 'rb') as f:
@@ -3077,14 +3826,28 @@ def main():
3077
  for result in search_results:
3078
  file_info = result['file_info']
3079
  score = result['score']
3080
  with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
3081
  st.write(f"Size: {file_info['size']}")
3082
  if 'metadata' in file_info and file_info['metadata']:
3083
  st.write("Metadata:")
3084
  for k, v in file_info['metadata'].items():
3085
  if k != 'file_id': # Skip technical details
3086
  st.write(f"- {k}: {v}")
3087
 
3088
  # Add direct download button
3089
  if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
3090
  with st.spinner(f"Downloading {file_info['filename']}..."):
@@ -3267,94 +4030,192 @@ def main():
3267
  # Create expanders for each result
3268
  for i, url in enumerate(urls, 1):
3269
  with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
3270
- if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
3271
- st.session_state.deep_search_url = url
3272
- st.session_state.do_deep_search = True
3273
  else:
3274
  st.warning("No search results found.")
3275
 
3276
  asyncio.run(run_search())
3277
 
3278
- # Handle deep search based on search results
3279
- if st.session_state.do_deep_search and st.session_state.deep_search_url:
3280
- url = st.session_state.deep_search_url
3281
- st.info(f"Deep searching: {url}")
3282
-
3283
- # Reset the flag to avoid re-running
3284
- st.session_state.do_deep_search = False
3285
 
3286
- # Set up custom extensions
3287
- custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
3288
- valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
3289
 
3290
- @st.cache_resource
3291
- def run_deep_search(url, ext_list, max_links, timeout_val, use_proxy_val, proxy_val, use_stealth_val):
3292
- async def _run():
3293
- async with DownloadManager(
3294
- use_proxy=use_proxy_val,
3295
- proxy=proxy_val,
3296
- use_stealth=use_stealth_val
3297
- ) as dm:
3298
- files = await dm.deep_search(url, ext_list, max_links, timeout_val)
3299
- return files
3300
- return asyncio.run(_run())
3301
-
3302
- with st.spinner("Searching for files..."):
3303
- files = run_deep_search(url, valid_ext_list, max_sublinks,
3304
- sublink_timeout, use_proxy, proxy, use_stealth)
3305
-
3306
- if files:
3307
- st.session_state.discovered_files = files
3308
- st.session_state.current_url = url
3309
- st.success(f"Found {len(files)} files!")
3310
-
3311
- # Show files with direct download options
3312
- download_dir = "./downloads"
3313
- os.makedirs(download_dir, exist_ok=True)
3314
-
3315
- for i, file in enumerate(files):
3316
- col1, col2, col3 = st.columns([3, 1, 1])
3317
- with col1:
3318
- filename = file['filename']
3319
- size = file['size']
3320
- meta = file.get('metadata', {})
3321
- file_info = f"{filename} ({size})"
3322
- if meta and 'Pages' in meta:
3323
- file_info += f" - {meta.get('Pages', '')} pages"
3324
- st.markdown(f"**{i+1}. {file_info}**")
3325
-
3326
- with col2:
3327
- # Add direct download button for each file
3328
- if st.button(f"Download", key=f"direct_dl_{i}"):
3329
- with st.spinner(f"Downloading {filename}..."):
3330
- async def download_single_file():
3331
- async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
3332
- path = await dm.download_file(file, download_dir, url)
3333
- return path
3334
 
3335
- downloaded_path = asyncio.run(download_single_file())
3336
- if downloaded_path:
3337
- with open(downloaded_path, "rb") as f:
3338
- file_data = f.read()
3339
-
3340
- st.download_button(
3341
- label=f"Save {filename}",
3342
- data=file_data,
3343
- file_name=filename,
3344
- mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
3345
- key=f"save_file_{i}"
3346
- )
3347
 
3348
- with col3:
3349
- # Add to selection for batch download
3350
- if i in st.session_state.selected_files:
3351
- if st.button("Unselect", key=f"unselect_{i}"):
3352
- st.session_state.selected_files.remove(i)
3353
  else:
3354
- if st.button("Select", key=f"select_{i}"):
3355
- st.session_state.selected_files.append(i)
3356
- else:
3357
- st.warning("No files found.")
3358
 
3359
  # Add a special section for direct Google Drive file download
3360
  st.markdown("---")
@@ -3400,7 +4261,11 @@ def main():
3400
 
3401
  # Add footer with attribution
3402
  st.markdown('---')
3403
- st.markdown('Created by [Euler314](https://github.com/yu314coder)')
3404
 
3405
  if __name__ == "__main__":
3406
  main()
 
7
  from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
8
  import asyncio
9
  import logging
10
+ from urllib.parse import urlparse, urljoin, unquote, parse_qs, quote
11
  import re
12
  from pathlib import Path
13
  from io import BytesIO
 
32
  import google.auth.transport.requests
33
  import googleapiclient.http
34
 
35
+ # Enhanced RAG search imports
36
  import nltk
37
  from sklearn.feature_extraction.text import TfidfVectorizer
38
  from sklearn.metrics.pairwise import cosine_similarity
39
  import numpy as np
40
  import docx2txt
41
+ try:
42
+ from langdetect import detect as detect_language
43
+ except ImportError:
44
+ # If langdetect is not available, we'll use a simple fallback
45
+ def detect_language(text):
46
+ return "en"
47
+
48
+ # Try to download NLTK data if not already present
49
+ try:
50
+ nltk.data.find('tokenizers/punkt')
51
+ except LookupError:
52
+ try:
53
+ nltk.download('punkt', quiet=True)
54
+ except:
55
+ pass
56
 
57
  # -------------------- Logging Setup --------------------
58
  logging.basicConfig(
 
110
  "proxies": [] # Will be populated from the UI if needed
111
  }
112
 
113
+ # -------------------- Enhanced RAG Search Class --------------------
114
+ class EnhancedRAGSearch:
115
  def __init__(self):
116
  self.file_texts = []
117
+ self.chunks = [] # Document chunks for more targeted search
118
+ self.chunk_metadata = [] # Metadata for each chunk
119
  self.file_metadata = []
120
+ self.vectorizer = TfidfVectorizer(
121
+ stop_words='english',
122
+ ngram_range=(1, 2), # Use bigrams for better context
123
+ max_features=10000, # Use more features for better representation
124
+ min_df=2 # Minimum document frequency
125
+ )
126
  self.vectors = None
127
+ self.chunk_vectors = None
128
+ self.languages = []
129
 
130
  def add_file(self, file_data, file_info):
131
+ """Add a file to the search index with improved processing"""
132
+ file_ext = os.path.splitext(file_info['filename'])[1].lower()
133
  text = self.extract_text(file_data, file_ext)
134
+
135
  if text:
136
+ # Store the whole document text
137
  self.file_texts.append(text)
138
  self.file_metadata.append(file_info)
139
+
140
+ # Try to detect language
141
+ try:
142
+ lang = detect_language(text[:1000]) # Use just the first 1000 chars for speed
143
+ self.languages.append(lang)
144
+ except:
145
+ self.languages.append('en') # Default to English
146
+
147
+ # Create chunks for more granular search
148
+ chunks = self.create_chunks(text)
149
+ for chunk in chunks:
150
+ self.chunks.append(chunk)
151
+ self.chunk_metadata.append({
152
+ 'file_info': file_info,
153
+ 'chunk_size': len(chunk),
154
+ 'file_index': len(self.file_texts) - 1
155
+ })
156
+
157
  return True
158
  return False
159
 
160
+ def create_chunks(self, text, chunk_size=1000, overlap=200):
161
+ """Split text into overlapping chunks for better search precision"""
162
+ # Try to use NLTK for sentence-aware chunking
163
+ try:
164
+ sentences = nltk.sent_tokenize(text)
165
+ chunks = []
166
+ current_chunk = ""
167
+
168
+ for sentence in sentences:
169
+ if len(current_chunk) + len(sentence) <= chunk_size:
170
+ current_chunk += sentence + " "
171
+ else:
172
+ # Add current chunk if it has content
173
+ if current_chunk:
174
+ chunks.append(current_chunk.strip())
175
+
176
+ # Start new chunk with overlap from previous chunk
177
+ if len(current_chunk) > overlap:
178
+ # Find the last space within the overlap region
179
+ overlap_text = current_chunk[-overlap:]
180
+ last_space = overlap_text.rfind(' ')
181
+ if last_space != -1:
182
+ current_chunk = current_chunk[-(overlap-last_space):] + sentence + " "
183
+ else:
184
+ current_chunk = sentence + " "
185
+ else:
186
+ current_chunk = sentence + " "
187
+
188
+ # Add the last chunk if it has content
189
+ if current_chunk:
190
+ chunks.append(current_chunk.strip())
191
+
192
+ return chunks
193
+ except:
194
+ # Fallback to simpler chunking approach
195
+ chunks = []
196
+ for i in range(0, len(text), chunk_size - overlap):
197
+ chunk = text[i:i + chunk_size]
198
+ if chunk:
199
+ chunks.append(chunk)
200
+ return chunks
201
+
202
  def extract_text(self, file_data, file_ext):
203
+ """Extract text from different file types with enhanced support"""
204
  try:
205
  if file_ext.lower() == '.pdf':
206
  reader = PyPDF2.PdfReader(BytesIO(file_data))
207
  text = ""
208
  for page in reader.pages:
209
+ extracted = page.extract_text()
210
+ if extracted:
211
+ text += extracted + "\n"
212
+ # If text extraction fails, try to OCR (would need extra libraries)
213
  return text
214
  elif file_ext.lower() in ['.docx', '.doc']:
215
  return docx2txt.process(BytesIO(file_data))
216
+ elif file_ext.lower() in ['.txt', '.csv', '.json', '.html', '.htm']:
217
+ # Handle both UTF-8 and other common encodings
218
+ try:
219
+ return file_data.decode('utf-8', errors='ignore')
220
+ except:
221
+ encodings = ['latin-1', 'iso-8859-1', 'windows-1252']
222
+ for enc in encodings:
223
+ try:
224
+ return file_data.decode(enc, errors='ignore')
225
+ except:
226
+ pass
227
+ # Last resort fallback
228
  return file_data.decode('utf-8', errors='ignore')
229
+ elif file_ext.lower() in ['.pptx', '.ppt', '.xlsx', '.xls']:
230
+ # For these types, we would need additional libraries
231
+ # For now, return a placeholder message
232
+ return f"[Content of {file_ext} file - install additional libraries for full text extraction]"
233
  else:
234
  return ""
235
  except Exception as e:
 
237
  return ""
238
 
239
  def build_index(self):
240
+ """Build both document and chunk search indices"""
241
  if not self.file_texts:
242
  return False
243
+
244
  try:
245
+ # Build document-level index
246
  self.vectors = self.vectorizer.fit_transform(self.file_texts)
247
+
248
+ # Build chunk-level index if we have chunks
249
+ if self.chunks:
250
+ self.chunk_vectors = self.vectorizer.transform(self.chunks)
251
+
252
  return True
253
  except Exception as e:
254
  logger.error(f"Error building search index: {e}")
255
  return False
256
 
257
+ def expand_query(self, query):
258
+ """Add related terms to query for better recall"""
259
+ # This is a simple implementation - could be enhanced with a proper synonym API
260
+ expanded_terms = []
261
+
262
+ # Add some common expansions for document search
263
+ if "exam" in query.lower():
264
+ expanded_terms.extend(["test", "assessment", "quiz", "paper"])
265
+ elif "document" in query.lower():
266
+ expanded_terms.extend(["file", "paper", "report"])
267
+ elif "manual" in query.lower():
268
+ expanded_terms.extend(["guide", "instruction", "documentation"])
269
+
270
+ # Return original query plus expanded terms
271
+ if expanded_terms:
272
+ return f"{query} {' '.join(expanded_terms)}"
273
+ return query
274
+
275
+ def search(self, query, top_k=5, search_chunks=True):
276
+ """Enhanced search with both document and chunk-level search"""
277
  if self.vectors is None:
278
  return []
279
 
280
  try:
281
+ # Expand the query for better recall
282
+ expanded_query = self.expand_query(query)
283
+
284
+ # Transform the query
285
+ query_vector = self.vectorizer.transform([expanded_query])
286
 
287
  results = []
288
+
289
+ # First search at document level for higher-level matches
290
+ if self.vectors is not None:
291
+ doc_similarities = cosine_similarity(query_vector, self.vectors).flatten()
292
+ top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
293
+
294
+ for i, idx in enumerate(top_doc_indices):
295
+ if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results
296
+ results.append({
297
+ 'file_info': self.file_metadata[idx],
298
+ 'score': float(doc_similarities[idx]),
299
+ 'rank': i+1,
300
+ 'match_type': 'document',
301
+ 'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
302
+ })
303
+
304
+ # Then search at chunk level for more specific matches if enabled
305
+ if search_chunks and self.chunk_vectors is not None:
306
+ chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten()
307
+ top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results
308
+
309
+ # Use a set to avoid duplicate file results
310
+ seen_files = set(r['file_info']['url'] for r in results)
311
+
312
+ for i, idx in enumerate(top_chunk_indices):
313
+ if chunk_similarities[idx] > 0.15: # Higher threshold for chunks
314
+ file_index = self.chunk_metadata[idx]['file_index']
315
+ file_info = self.file_metadata[file_index]
316
+
317
+ # Only add if we haven't already included this file
318
+ if file_info['url'] not in seen_files:
319
+ seen_files.add(file_info['url'])
320
+ results.append({
321
+ 'file_info': file_info,
322
+ 'score': float(chunk_similarities[idx]),
323
+ 'rank': len(results) + 1,
324
+ 'match_type': 'chunk',
325
+ 'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
326
+ 'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
327
+ })
328
+
329
+ # Stop after we've found enough results
330
+ if len(results) >= top_k*1.5:
331
+ break
332
+
333
+ # Sort combined results by score
334
+ results.sort(key=lambda x: x['score'], reverse=True)
335
+
336
+ # Re-rank and truncate
337
+ for i, result in enumerate(results[:top_k]):
338
+ result['rank'] = i+1
339
+
340
+ return results[:top_k]
341
  except Exception as e:
342
  logger.error(f"Error during search: {e}")
343
  return []
 
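For reference, a minimal usage sketch of the new EnhancedRAGSearch class added above. The filenames and URLs are hypothetical; only the class itself comes from this commit. Note that with min_df=2 the TF-IDF index keeps only terms that occur in at least two indexed files, so index a few documents before calling search():

    rag = EnhancedRAGSearch()
    for name in ("exam_2023.pdf", "exam_2024.pdf"):  # hypothetical local files
        with open(name, "rb") as f:
            rag.add_file(f.read(), {"filename": name,
                                    "url": f"https://example.com/{name}",
                                    "size": "Unknown Size"})
    if rag.build_index():
        for hit in rag.search("past exam papers", top_k=3):
            print(hit["rank"], round(hit["score"], 3),
                  hit["file_info"]["filename"], hit["match_type"])
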
397
  html_lower = html_content.lower()
398
  return any(pattern in html_lower for pattern in captcha_patterns)
399
 
400
+ def is_download_link(url):
401
+ """Enhanced function to detect if a URL is likely a download link"""
402
+ # Check for obvious download indicators in URL
403
+ url_lower = url.lower()
404
+
405
+ # Check for common download-related terms in the URL
406
+ download_terms = [
407
+ 'download', 'dl', 'get', 'file', 'attachment', 'export', 'view',
408
+ 'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document'
409
+ ]
410
+ if any(term in url_lower for term in download_terms):
411
+ return True
412
+
413
+ # Check for common download script patterns
414
+ script_patterns = [
415
+ 'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php',
416
+ 'download.aspx', 'getfile.aspx', 'file.aspx',
417
+ 'downloadhandler', 'filehandler', 'filedownload',
418
+ 'download.jsp', 'download.cgi', 'download.do',
419
+ 'download-file', 'get-file',
420
+ 'downloadfile', 'getfile', 'viewfile',
421
+ 'Action=downloadfile', 'action=download', 'action=view',
422
+ 'download?', 'file?', 'get?', 'view?'
423
+ ]
424
+ if any(pattern in url_lower for pattern in script_patterns):
425
+ return True
426
+
427
+ # Check for common file extensions in the URL path or parameters
428
+ path = urlparse(url).path
429
+ common_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
430
+ '.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg',
431
+ '.png', '.gif', '.mp3', '.mp4', '.avi', '.mov']
432
+
433
+ if any(ext in path.lower() for ext in common_extensions):
434
+ return True
435
+
436
+ # Check for file ID or file parameters in URL
437
+ params = parse_qs(urlparse(url).query)
438
+ param_keys = params.keys()
439
+ file_param_indicators = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid']
440
+ if any(key.lower() in file_param_indicators for key in param_keys):
441
+ return True
442
+
443
+ # Check for complex encoding patterns like in the example URL
444
+ if 'Action=downloadfile' in url or 'fname=' in url:
445
+ return True
446
+
447
+ return False
448
+
449
+ def normalize_download_url(url):
450
+ """Normalize download URLs to handle various formats and encodings"""
451
+ try:
452
+ # Handle common URL shorteners and redirections
453
+ parsed = urlparse(url)
454
+
455
+ # Handle phpMyAdmin-style encoded URLs
456
+ if 'Action=downloadfile' in url and 'file=' in url:
457
+ # Extract the encoded file parameter
458
+ params = parse_qs(parsed.query)
459
+ if 'file' in params:
460
+ # This is just a placeholder - in a real implementation,
461
+ # you would need to handle the specific encoding used
462
+ encoded_file = params['file'][0]
463
+ # Keep the URL as is for now, since we'll handle it during download
464
+ return url
465
+
466
+ # Handle URLs with fname parameter (like in the example)
467
+ if 'fname=' in url:
468
+ # Keep as is - we'll handle this specially during download
469
+ return url
470
+
471
+ # For other URLs, make sure they are properly quoted
472
+ path = parsed.path
473
+ # Only quote the path portion if needed
474
+ if '%' not in path and ' ' in path:
475
+ path = quote(path)
476
+
477
+ # Reconstruct the URL
478
+ normalized = parsed._replace(path=path).geturl()
479
+ return normalized
480
+ except Exception as e:
481
+ logger.error(f"Error normalizing URL {url}: {e}")
482
+ return url
483
+
484
  # -------------------- Google Drive Functions --------------------
485
  def get_google_auth_url():
486
  client_config = GOOGLE_OAUTH_CONFIG["web"]
 
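A quick illustration of how the two new module-level helpers behave; the URLs are made up for the example, and the expected results follow directly from the checks in the functions above:

    print(is_download_link("https://example.edu/getfile.php?fname=notes.pdf"))  # True: matches a download script pattern
    print(is_download_link("https://example.edu/about.html"))                   # False: no download indicators
    print(normalize_download_url("https://example.edu/files/term paper.pdf"))
    # -> https://example.edu/files/term%20paper.pdf (spaces in the path get percent-quoted)
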
573
  self.request_count = 0
574
  self.captcha_detected = False
575
  self.download_timeout = 300 # 5 minutes timeout for downloads
576
+ # Track visited URLs to avoid revisiting the same URL multiple times
577
+ self.visited_urls = set()
578
+ # Track successfully downloaded files to avoid redownloading
579
+ self.downloaded_files = set()
580
 
581
  async def __aenter__(self):
582
  self.playwright = await async_playwright().start()
 
857
  try:
858
  await self.rotate_proxy_if_needed()
859
 
860
+ # For complex download URLs, we need to be careful with HEAD requests
861
+ if '?' in url or 'Action=downloadfile' in url or 'fname=' in url:
862
+ # For these URLs, we'll try a more reliable approach using range headers
863
+ headers = {
864
+ 'User-Agent': get_random_user_agent(),
865
+ 'Range': 'bytes=0-0' # Just request the first byte to check headers
866
+ }
867
+
868
+ try:
869
+ with requests.get(url, headers=headers, stream=True, timeout=10) as r:
870
+ if 'Content-Range' in r.headers:
871
+ content_range = r.headers['Content-Range']
872
+ match = re.search(r'bytes 0-0/(\d+)', content_range)
873
+ if match:
874
+ size = int(match.group(1))
875
+ return sizeof_fmt(size)
876
+
877
+ if 'Content-Length' in r.headers:
878
+ size = int(r.headers['Content-Length'])
879
+ # If size is 1, it's likely just our single requested byte
880
+ if size > 1:
881
+ return sizeof_fmt(size)
882
+ except Exception as e:
883
+ logger.warning(f"Error getting file size with Range request: {e}")
884
+
885
+ # Fallback to browser approach
886
+ try:
887
+ async with self.context.new_page() as page:
888
+ response = await page.request.head(url, timeout=15000)
889
+ length = response.headers.get('Content-Length', None)
890
+ if length:
891
+ return sizeof_fmt(int(length))
892
+ except Exception as e:
893
+ logger.warning(f"Error getting file size with browser: {e}")
894
+
895
+ return "Unknown Size"
896
+ else:
897
+ # Standard approach for normal URLs
898
+ async with self.context.new_page() as page:
899
+ response = await page.request.head(url, timeout=15000)
900
+ length = response.headers.get('Content-Length', None)
901
+ if length:
902
+ return sizeof_fmt(int(length))
903
+ else:
904
+ return "Unknown Size"
905
  except Exception as e:
906
  logger.warning(f"Error getting file size: {e}")
907
  return "Unknown Size"
 
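As a worked example of the Range-request trick above: asking for 'Range: bytes=0-0' makes a cooperating server answer with a Content-Range header whose number after the slash is the full file size. The header value below is illustrative:

    import re
    content_range = "bytes 0-0/2489162"             # e.g. from r.headers['Content-Range']
    m = re.search(r'bytes 0-0/(\d+)', content_range)
    if m:
        print(sizeof_fmt(int(m.group(1))))          # human-readable size via the sizeof_fmt helper used elsewhere in app.py
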
928
  return {}
929
 
930
  async def extract_real_download_url(self, url):
931
+ """Enhanced method to extract real download URL, handling complex URLs"""
932
  try:
933
+ # Check if this is a complex download URL that needs special handling
934
+ if 'Action=downloadfile' in url or 'fname=' in url:
935
+ logger.info(f"Complex download URL detected: {url}")
936
+
937
+ # For these special cases, we'll use the browser to navigate and intercept redirects
938
+ await self.rotate_proxy_if_needed()
939
+
940
+ async with self.context.new_page() as page:
941
+ # Set up request interception to capture redirects
942
+ await page.route('**', lambda route: route.continue_())
943
+
944
+ # Listen for all responses
945
+ responses = []
946
+ page.on('response', lambda response: responses.append(response))
947
+
948
+ try:
949
+ # Go to the URL
950
+ await page.goto(url, wait_until='networkidle', timeout=30000)
951
+
952
+ # Check all responses for potential downloads
953
+ for response in responses:
954
+ # Look for content-disposition headers indicating a download
955
+ content_disposition = response.headers.get('Content-Disposition', '')
956
+ if 'attachment' in content_disposition or 'filename=' in content_disposition:
957
+ return response.url
958
+
959
+ # Look for content-type headers indicating a file
960
+ content_type = response.headers.get('Content-Type', '')
961
+ if content_type and content_type != 'text/html' and not content_type.startswith('text/'):
962
+ return response.url
963
+
964
+ # If no clear download was detected, return the final URL
965
+ return page.url
966
+ except Exception as e:
967
+ logger.warning(f"Error extracting real download URL: {e}")
968
+ return url
969
+ else:
970
+ # Standard approach for normal URLs
971
+ await self.rotate_proxy_if_needed()
972
+
973
+ async with self.context.new_page() as page:
974
+ response = await page.goto(url, wait_until='networkidle', timeout=30000)
975
+ if response and response.headers.get('location'):
976
+ return response.headers['location']
977
+ return page.url
978
  except Exception as e:
979
  logger.error(f"Error extracting real download URL: {e}")
980
  return url
 
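A lighter-weight variant of the same idea, sketched with the requests library that app.py already uses elsewhere. It involves no browser, so it will miss downloads that only start after JavaScript runs; the helper name is ours:

    import requests

    def looks_like_file_response(url, timeout=15):
        """Return (is_file, final_url) based on standard HTTP headers."""
        with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as r:
            disposition = r.headers.get("Content-Disposition", "")
            content_type = r.headers.get("Content-Type", "")
            is_file = ("attachment" in disposition or "filename=" in disposition
                       or (content_type and not content_type.startswith("text/")))
            return is_file, r.url  # r.url is the URL after any redirects
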
1042
  if any(full_url.lower().endswith(ext) for ext in
1043
  ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
1044
  links.add(full_url)
1045
+
1046
+ # Check for download script parameters
1047
+ if "Action=downloadfile" in url or "fname=" in url:
1048
+ links.add(url) # Add the URL itself as it's a download link
1049
  except Exception as e:
1050
  logger.warning(f"Request-based extraction failed: {e}")
1051
 
1052
  # Browser-based approach for more thorough extraction or if initial approach was inadequate
1053
  try:
1054
  # Check if we need to proceed with browser-based extraction
1055
+ if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url:
1056
  logger.info("Using browser for enhanced link extraction")
1057
 
1058
  # Rotate proxy if needed
 
1144
  ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
1145
  links.add(href)
1146
 
1147
+ # Check for download links in the page
1148
+ download_links = await self.page.evaluate("""
1149
+ () => {
1150
+ // Find all links that might be download links
1151
+ const links = Array.from(document.querySelectorAll('a[href]'));
1152
+ return links
1153
+ .filter(a => {
1154
+ const href = a.href.toLowerCase();
1155
+ return href.includes('download') ||
1156
+ href.includes('getfile') ||
1157
+ href.includes('view.php') ||
1158
+ href.includes('action=downloadfile') ||
1159
+ href.includes('fname=');
1160
+ })
1161
+ .map(a => a.href);
1162
+ }
1163
+ """)
1164
+
1165
+ for dl_link in download_links:
1166
+ links.add(dl_link)
1167
+
1168
  # Check for ASP.NET specific elements that might contain exam links
1169
  grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
1170
  for grid in grid_elements:
 
1293
  "/resource/", "/material/", "/notes/", "/subjectmaterial/"
1294
  ]):
1295
  filtered_links.append(link)
1296
+ continue
1297
+
1298
+ # Check for download links (these may not have obvious extensions)
1299
+ if is_download_link(link):
1300
+ filtered_links.append(link)
1301
 
1302
  logger.info(f"Found {len(filtered_links)} potential exam document links")
1303
  return filtered_links
 
1325
  }
1326
  }
1327
 
1328
+ // Look for download-related variables in scripts
1329
+ for (const script of scripts) {
1330
+ const content = script.textContent || '';
1331
+ // Look for common patterns for file URLs in JavaScript
1332
+ if (content.includes('downloadURL') || content.includes('fileURL') ||
1333
+ content.includes('pdfURL') || content.includes('documentURL')) {
1334
+
1335
+ // Extract potential URLs
1336
+ const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || [];
1337
+ for (let match of potentialUrls) {
1338
+ const url = match.replace(/["']/g, '');
1339
+ // Try to resolve relative URLs
1340
+ if (url.startsWith('/') || !url.includes('://')) {
1341
+ if (url.startsWith('/')) {
1342
+ links.add(window.location.origin + url);
1343
+ } else {
1344
+ // Handle relative paths more carefully
1345
+ const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
1346
+ links.add(base + url);
1347
+ }
1348
+ } else if (url.startsWith('http')) {
1349
+ links.add(url);
1350
+ }
1351
+ }
1352
+ }
1353
+ }
1354
+
1355
  // Check for links in data attributes
1356
+ const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]');
1357
  for (const el of elements) {
1358
+ for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) {
1359
  const val = el.getAttribute(attr);
1360
+ if (val) {
1361
+ // Try to resolve relative URLs
1362
+ if (val.startsWith('/')) {
1363
+ links.add(window.location.origin + val);
1364
+ } else if (val.startsWith('http')) {
1365
+ links.add(val);
1366
+ } else if (!val.startsWith('javascript:') && !val.startsWith('#')) {
1367
+ // Handle relative paths
1368
+ const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
1369
+ links.add(base + val);
1370
+ }
1371
  }
1372
  }
1373
  }
1374
 
1375
  // Look for URLs in inline event handlers
1376
+ const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]');
1377
  for (const el of clickableElements) {
1378
+ for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) {
1379
  const val = el.getAttribute(attr);
1380
  if (val) {
1381
+ // Check for JavaScript URLs with window.location
1382
+ if (val.includes('window.location') || val.includes('document.location')) {
1383
+ const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/);
1384
+ if (urlMatch && urlMatch[1]) {
1385
+ const url = urlMatch[1];
1386
+ if (url.startsWith('/')) {
1387
+ links.add(window.location.origin + url);
1388
+ } else if (url.startsWith('http')) {
1389
+ links.add(url);
1390
+ } else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
1391
+ const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
1392
+ links.add(base + url);
1393
+ }
1394
+ }
1395
+ }
1396
+
1397
+ // Check for direct URLs in attributes
1398
  const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
1399
  for (let match of urlMatches) {
1400
  links.add(match.replace(/["']/g, ''));
1401
  }
1402
+
1403
+ // Check for download.php and similar patterns
1404
+ if (val.includes('download.php') || val.includes('getfile.php') ||
1405
+ val.includes('Action=downloadfile') || val.includes('viewfile.php')) {
1406
+
1407
+ // Handle both onclick handlers and direct hrefs
1408
+ let url = '';
1409
+ if (attr === 'href') {
1410
+ url = val;
1411
+ } else {
1412
+ // Extract URL from JavaScript
1413
+ const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i);
1414
+ if (jsUrlMatch) {
1415
+ url = jsUrlMatch[1];
1416
+ }
1417
+ }
1418
+
1419
+ // Resolve URL if needed
1420
+ if (url) {
1421
+ if (url.startsWith('/')) {
1422
+ links.add(window.location.origin + url);
1423
+ } else if (url.startsWith('http')) {
1424
+ links.add(url);
1425
+ } else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
1426
+ const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
1427
+ links.add(base + url);
1428
+ }
1429
+ }
1430
+ }
1431
  }
1432
  }
1433
  }
1434
 
1435
+ // Find PHP/ASP file download links
1436
+ const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]');
1437
+ for (const link of fileLinks) {
1438
+ links.add(link.href);
1439
+ }
1440
+
1441
  return Array.from(links);
1442
  }
1443
  """)
 
1504
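The same onclick/URL extraction can be reproduced server-side for pages fetched without a browser; a small sketch using BeautifulSoup (already used in app.py) and the regex mirrored from the injected JavaScript above, with the function name being ours:

    import re
    from bs4 import BeautifulSoup

    def urls_from_onclick(html):
        """Collect absolute URLs referenced inside inline event handlers."""
        found = set()
        soup = BeautifulSoup(html, "html.parser")
        for el in soup.find_all(attrs={"onclick": True}):
            found.update(re.findall(r'["\'](https?://[^"\']+)["\']', el["onclick"]))
        return found
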
  for link in shadow_links:
1505
  hidden_links.add(link)
1506
 
1507
+ # Look for download links in forms
1508
+ form_links = await page.evaluate("""
1509
+ () => {
1510
+ const links = new Set();
1511
+
1512
+ // Check for form actions that might be download endpoints
1513
+ const forms = document.querySelectorAll('form');
1514
+ for (const form of forms) {
1515
+ const action = form.action || '';
1516
+ if (action && (
1517
+ action.includes('download') ||
1518
+ action.includes('getfile') ||
1519
+ action.includes('viewfile') ||
1520
+ action.includes('Action=downloadfile')
1521
+ )) {
1522
+ // Collect input values that might be needed for the download
1523
+ const inputs = {};
1524
+ const formInputs = form.querySelectorAll('input[name]');
1525
+ for (const input of formInputs) {
1526
+ inputs[input.name] = input.value;
1527
+ }
1528
+
1529
+ // Store both the form action and any important inputs
1530
+ links.add(action);
1531
+ }
1532
+ }
1533
+
1534
+ return Array.from(links);
1535
+ }
1536
+ """)
1537
+
1538
+ for link in form_links:
1539
+ hidden_links.add(link)
1540
+
1541
  return hidden_links
1542
 
1543
  async def extract_downloadable_files(self, url, custom_ext_list):
1544
  found_files = []
1545
  try:
1546
+ # Normalize the URL to handle special cases
1547
+ normalized_url = normalize_download_url(url)
1548
+
1549
+ # Skip if we've already visited this URL
1550
+ if normalized_url in self.visited_urls:
1551
+ logger.info(f"Skipping already visited URL: {normalized_url}")
1552
+ return []
1553
+
1554
+ # Mark this URL as visited
1555
+ self.visited_urls.add(normalized_url)
1556
+
1557
  # Rotate proxy if needed
1558
  await self.rotate_proxy_if_needed()
1559
 
1560
+ # First check if this is a direct download link (Action=downloadfile or fname parameter)
1561
+ if is_download_link(normalized_url):
1562
+ logger.info(f"Processing potential direct download link: {normalized_url}")
1563
+
1564
+ # Try to extract the real download URL if needed
1565
+ real_url = await self.extract_real_download_url(normalized_url)
1566
+
1567
+ # Determine filename - for complex URLs this can be tricky
1568
+ filename = os.path.basename(urlparse(real_url).path)
1569
+
1570
+ # Handle URL-encoded filenames
1571
+ if '%' in filename:
1572
+ try:
1573
+ filename = unquote(filename)
1574
+ except Exception:
1575
+ pass
1576
+
1577
+ # For URLs with download parameters, try to extract filename from query
1578
+ if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
1579
+ # Look for file parameter
1580
+ params = parse_qs(urlparse(normalized_url).query)
1581
+
1582
+ # Check common filename parameters
1583
+ for param in ['file', 'filename', 'name', 'fname', 'f']:
1584
+ if param in params and params[param]:
1585
+ potential_filename = params[param][0]
1586
+ if potential_filename and '/' not in potential_filename and '\\' not in potential_filename:
1587
+ filename = os.path.basename(potential_filename)
1588
+ break
1589
+
1590
+ # If still no valid filename, use domain-based fallback
1591
+ if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
1592
+ domain = get_domain(real_url)
1593
+ # Try to determine file type from content-type or extension hints in URL
1594
+ ext = '.pdf' # Default
1595
+ for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
1596
+ if common_ext in normalized_url.lower():
1597
+ ext = common_ext
1598
+ break
1599
+ filename = f"file_from_{domain}{ext}"
1600
+
1601
+ # Get file size
1602
+ size_str = await self.get_file_size(real_url)
1603
+
1604
+ # Add to found files
1605
+ found_files.append({
1606
+ 'url': real_url,
1607
+ 'filename': filename,
1608
+ 'size': size_str,
1609
+ 'metadata': {},
1610
+ 'download_url': normalized_url # Keep original URL for downloading
1611
+ })
1612
+
1613
+ # For direct download links, we can return early
1614
+ if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)):
1615
+ return found_files
1616
+
1617
  # Special handling for educational exam sites
1618
  if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
1619
  ["exam", "test", "pastpaper", "eduexp"]):
 
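The filename logic in the hunk above can be factored into a small standalone helper; this is just a sketch of the same fallback chain (path basename, then common query parameters, then a generic name), and the helper name is ours:

    import os
    from urllib.parse import urlparse, parse_qs, unquote

    def guess_filename(url, default="downloaded_file.pdf"):
        parsed = urlparse(url)
        name = os.path.basename(unquote(parsed.path))
        if name and not name.lower().endswith(('.php', '.aspx')):
            return name
        params = parse_qs(parsed.query)
        for key in ('file', 'filename', 'name', 'fname', 'f'):
            if params.get(key):
                candidate = os.path.basename(unquote(params[key][0]))
                if candidate:
                    return candidate
        return default
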
1655
  'url': real_url,
1656
  'filename': filename,
1657
  'size': size_str,
1658
+ 'metadata': meta,
1659
+ 'download_url': link # Store original link for downloading
1660
  })
1661
 
1662
  # If we found exam files with the specialized method, return them
 
1717
  'url': real_url,
1718
  'filename': filename,
1719
  'size': await self.get_file_size(real_url),
1720
+ 'metadata': {},
1721
+ 'download_url': final_url # Keep original URL for downloading
1722
  })
1723
  return found_files
1724
 
 
1739
  for a in soup.find_all('a', href=True):
1740
  href = a['href'].strip()
1741
 
1742
+ if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower():
1743
  full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1744
  real_url = await self.extract_real_download_url(full_url)
1745
  if real_url and real_url != full_url:
 
1747
  'url': real_url,
1748
  'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
1749
  'size': await self.get_file_size(real_url),
1750
+ 'metadata': {},
1751
+ 'download_url': full_url # Original URL for download
1752
  })
1753
  continue
1754
 
 
1762
  'url': file_url,
1763
  'filename': os.path.basename(file_url.split('?')[0]),
1764
  'size': size_str,
1765
+ 'metadata': meta,
1766
+ 'download_url': file_url # Same as URL for direct links
1767
  })
1768
 
1769
  # Handle Google Drive links
 
1793
  'view_only': is_view_only,
1794
  'file_type': file_type,
1795
  'file_id': file_id
1796
+ },
1797
+ 'download_url': href # Same as URL for Google Drive
1798
  })
1799
 
1800
  # Also check for files in other elements (iframe, embed, object, etc.)
 
1811
  'url': file_url,
1812
  'filename': os.path.basename(file_url.split('?')[0]),
1813
  'size': size_str,
1814
+ 'metadata': meta,
1815
+ 'download_url': file_url
1816
  })
1817
 
1818
  # Check for file links in onclick attributes
 
1830
  'url': url_match,
1831
  'filename': os.path.basename(url_match.split('?')[0]),
1832
  'size': size_str,
1833
+ 'metadata': meta,
1834
+ 'download_url': url_match
1835
  })
1836
 
1837
  # Also check for data-src and data-url attributes (common in lazy-loaded sites)
 
1846
  'url': file_url,
1847
  'filename': os.path.basename(file_url.split('?')[0]),
1848
  'size': await self.get_file_size(file_url),
1849
+ 'metadata': {},
1850
+ 'download_url': file_url
1851
  })
1852
  except:
1853
  pass
 
1881
  'url': json_url,
1882
  'filename': os.path.basename(json_url.split('?')[0]),
1883
  'size': await self.get_file_size(json_url),
1884
+ 'metadata': {},
1885
+ 'download_url': json_url
1886
  })
1887
  except:
1888
  pass
 
1933
  'url': href,
1934
  'filename': os.path.basename(href.split('?')[0]),
1935
  'size': await self.get_file_size(href),
1936
+ 'metadata': {},
1937
+ 'download_url': href
1938
  })
1939
 
1940
  # Check for hidden links that might be in JavaScript, iframes, or dynamic content
 
1945
  'url': link,
1946
  'filename': os.path.basename(link.split('?')[0]),
1947
  'size': await self.get_file_size(link),
1948
+ 'metadata': {},
1949
+ 'download_url': link
1950
  })
1951
 
1952
  # Deduplicate files by URL
 
1964
  return []
1965
 
1966
  async def download_file(self, file_info, save_dir, referer):
1967
+ file_url = file_info.get('download_url', file_info['url']) # Use download_url if available
1968
  fname = file_info['filename']
1969
  path = os.path.join(save_dir, fname)
1970
  base, ext = os.path.splitext(fname)
 
1974
  counter += 1
1975
  os.makedirs(save_dir, exist_ok=True)
1976
 
1977
+ # Check if we've already downloaded this file
1978
+ if file_url in self.downloaded_files:
1979
+ logger.info(f"File already downloaded: {file_url}")
1980
+ return None
1981
+
1982
  try:
1983
  # Special handling for Google Drive files
1984
  if "drive.google.com" in file_url or "docs.google.com" in file_url:
 
1990
  logger.info(f"Attempting to download view-only file: {file_url}")
1991
  result_path = await self.force_download_viewonly(file_info, path)
1992
  if result_path:
1993
+ self.downloaded_files.add(file_url)
1994
  return result_path
1995
 
1996
  # If that failed, try the regular download approach
 
1999
  # Try regular download methods
2000
  success = await self.download_from_google_drive(file_url, path)
2001
  if success:
2002
+ self.downloaded_files.add(file_url)
2003
  return path
2004
 
2005
  # If all methods failed for Google Drive, try one last approach
2006
  logger.warning("All standard methods failed, attempting force download")
2007
  result_path = await self.force_download_viewonly(file_info, path)
2008
+ if result_path:
2009
+ self.downloaded_files.add(file_url)
2010
  return result_path if result_path else None
2011
 
2012
+ # Special handling for complex download URLs
2013
+ if 'Action=downloadfile' in file_url or 'fname=' in file_url:
2014
+ logger.info(f"Using browser download approach for complex URL: {file_url}")
2015
+
2016
+ # For these URLs, we'll need to navigate to the page and handle the download
2017
+ await self.rotate_proxy_if_needed()
2018
+
2019
+ async with self.context.new_page() as page:
2020
+ # Set up download event listener
2021
+ download_promise = page.wait_for_event("download")
2022
+
2023
+ # Navigate to the URL
2024
+ await page.goto(file_url, timeout=60000)
2025
+
2026
+ # Wait for the download to start
2027
+ try:
2028
+ download = await download_promise
2029
+ await download.save_as(path)
2030
+
2031
+ if os.path.exists(path) and os.path.getsize(path) > 0:
2032
+ self.downloaded_files.add(file_url)
2033
+ return path
2034
+ except Exception as e:
2035
+ logger.error(f"Browser download failed: {e}")
2036
+
2037
+ # If download didn't start automatically, try to find and click download buttons
2038
+ download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]')
2039
+ for button in download_buttons:
2040
+ try:
2041
+ await button.click()
2042
+ try:
2043
+ download = await download_promise
2044
+ await download.save_as(path)
2045
+ if os.path.exists(path) and os.path.getsize(path) > 0:
2046
+ self.downloaded_files.add(file_url)
2047
+ return path
2048
+ except:
2049
+ pass
2050
+ except:
2051
+ continue
2052
+
2053
+ # If browser approach failed, try direct request as last resort
2054
+ logger.info("Browser approach failed, trying direct request")
2055
+
2056
  # Rotate proxy if needed
2057
  await self.rotate_proxy_if_needed()
2058
 
 
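One caution on the block above: page.wait_for_event("download") returns a coroutine, so the listener only becomes active once it is awaited (after goto here), and the same coroutine is awaited a second time inside the button loop, which Python does not allow. A minimal sketch of the same step using Playwright's expect_download() context manager; page, download_buttons, path, file_url and self.downloaded_files are names taken from the diff, the rest is illustrative rather than the committed implementation:

for button in download_buttons:
    try:
        # Register the listener before the click so the event cannot be missed,
        # and get a fresh wait on every iteration.
        async with page.expect_download(timeout=30000) as download_info:
            await button.click()
        download = await download_info.value
        await download.save_as(path)
        if os.path.exists(path) and os.path.getsize(path) > 0:
            self.downloaded_files.add(file_url)
            return path
    except Exception:
        continue  # this button did not trigger a download; try the next one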
2080
 
2081
  # Verify file was downloaded correctly
2082
  if os.path.exists(path) and os.path.getsize(path) > 0:
2083
+ self.downloaded_files.add(file_url)
2084
  return path
2085
  except Exception as e:
2086
  logger.warning(f"Direct download failed: {e}, trying browser approach")
 
2100
  content = await response.body()
2101
  with open(path, 'wb') as f:
2102
  f.write(content)
2103
+ if os.path.exists(path) and os.path.getsize(path) > 0:
2104
+ self.downloaded_files.add(file_url)
2105
+ return path
2106
  else:
2107
  logger.error(f"Download failed with status {response.status}: {file_url}")
2108
 
 
2129
  await download.save_as(path)
2130
 
2131
  if os.path.exists(path) and os.path.getsize(path) > 0:
2132
+ self.downloaded_files.add(file_url)
2133
  return path
2134
  except Exception as e:
2135
  logger.error(f"Browser download manager approach failed: {e}")
 
3143
  try:
3144
  logger.info(f"Fetching sublinks from: {url}")
3145
 
3146
+ # Check if this is a direct download link
3147
+ if is_download_link(url):
3148
+ logger.info(f"URL appears to be a direct download link: {url}")
3149
+ links.add(url)
3150
+ return list(links)[:limit]
3151
+
3152
+ # Skip if we've already visited this URL
3153
+ normalized_url = normalize_download_url(url)
3154
+ if normalized_url in self.visited_urls:
3155
+ logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}")
3156
+ return list(links)[:limit]
3157
+
3158
+ # Add to visited URLs
3159
+ self.visited_urls.add(normalized_url)
3160
+
3161
  # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
3162
  if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
3163
  ["exam", "test", "pastpaper", "eduexp"]):
 
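is_download_link() and normalize_download_url() are module-level helpers defined elsewhere in app.py and are not shown in this diff. Purely as an illustration of the kind of check they perform (the heuristics below are assumptions, not the actual implementation):

# Hypothetical sketch -- the real helpers live elsewhere in app.py.
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def is_download_link(url: str) -> bool:
    """Heuristic: does the URL look like it points directly at a file?"""
    lowered = url.lower()
    if any(lowered.split('?')[0].endswith(ext) for ext in ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip')):
        return True
    return any(marker in lowered for marker in ('action=downloadfile', 'download=', 'fname='))

def normalize_download_url(url: str) -> str:
    """Canonical form of a download URL so the same file is not visited twice."""
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    # Keep only parameters that identify the file (an illustrative choice).
    kept = {k: v for k, v in sorted(query.items()) if k.lower() in ('id', 'file', 'fname')}
    return urlunparse(parsed._replace(query=urlencode(kept, doseq=True), fragment=''))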
3175
  await self.rotate_proxy_if_needed()
3176
 
3177
  # Standard sublink extraction for all sites
3178
+ try:
3179
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
3180
+ except Exception as e:
3181
+ logger.warning(f"Error navigating to URL for sublink extraction: {e}")
3182
+ # Continue with what we have; we'll try to extract links anyway
3183
+
3184
  # Get base URL for resolving relative links
3185
  parsed_base = urlparse(url)
3186
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
3379
  if href and not href.startswith('javascript:'):
3380
  links.add(href)
3381
 
3382
+ # Find all download links
3383
+ download_links = await self.page.evaluate("""
3384
+ () => {
3385
+ return Array.from(document.querySelectorAll('a[href]'))
3386
+ .filter(a => {
3387
+ const href = a.href.toLowerCase();
3388
+ return href.includes('download') ||
3389
+ href.includes('file') ||
3390
+ href.includes('get') ||
3391
+ href.includes('view.php') ||
3392
+ href.includes('action=') ||
3393
+ href.includes('fname=');
3394
+ })
3395
+ .map(a => a.href);
3396
+ }
3397
+ """)
3398
+
3399
+ for download_link in download_links:
3400
+ links.add(download_link)
3401
+
3402
+ # Also check for hidden links in JavaScript, iframes, or dynamic content
3403
+ js_links = await self.discover_hidden_links(self.page)
3404
+ for link in js_links:
3405
+ links.add(link)
3406
+
3407
  logger.info(f"Found {len(links)} sublinks")
3408
+
3409
+ # Prioritize download links
3410
+ prioritized_links = []
3411
+ normal_links = []
3412
+
3413
+ for link in links:
3414
+ if is_download_link(link):
3415
+ prioritized_links.append(link)
3416
+ else:
3417
+ normal_links.append(link)
3418
+
3419
+ # Return prioritized links first, then normal links, up to the limit
3420
+ result = prioritized_links + normal_links
3421
+ return result[:limit]
3422
 
3423
  except Exception as e:
3424
  logger.error(f"Error getting sublinks from {url}: {e}")
 
3519
  file_count_text = st.empty()
3520
 
3521
  try:
3522
+ # Reset the visited URLs for a fresh deep search
3523
+ self.visited_urls = set()
3524
+
3525
  progress_text.text("Analyzing main page...")
3526
  # Special handling for ASP.NET pages
3527
  is_aspnet = False
 
3536
  except Exception:
3537
  pass
3538
 
3539
+ # Check if this URL is a direct download
3540
+ if is_download_link(url):
3541
+ progress_text.text("URL appears to be a direct download. Analyzing...")
3542
+
3543
+ # Try to extract file directly
3544
+ normalized_url = normalize_download_url(url)
3545
+ file_info = {
3546
+ 'url': normalized_url,
3547
+ 'download_url': normalized_url,
3548
+ 'filename': os.path.basename(urlparse(normalized_url).path) or 'download',
3549
+ 'size': 'Unknown Size',
3550
+ 'metadata': {}
3551
+ }
3552
+
3553
+ # Add to visited URLs
3554
+ self.visited_urls.add(normalized_url)
3555
+ progress_bar.progress(1.0)
3556
+ return [file_info]
3557
+
3558
  # Extract files from main page
3559
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
3560
  initial_count = len(main_files)
 
3580
  progress_bar.progress(progress)
3581
 
3582
  try:
3583
+ # Check if this is a direct download link
3584
+ if is_download_link(sublink):
3585
+ # For download links, just add the link directly
3586
+ normalized_url = normalize_download_url(sublink)
3587
+
3588
+ # Skip if already visited
3589
+ if normalized_url in self.visited_urls:
3590
+ continue
3591
+
3592
+ # Mark as visited
3593
+ self.visited_urls.add(normalized_url)
3594
+
3595
+ # Get file size if possible
3596
+ size_str = await self.get_file_size(normalized_url)
3597
+
3598
+ # Get filename, with fallback to domain-based name
3599
+ filename = os.path.basename(urlparse(normalized_url).path)
3600
+ if not filename or filename == '/' or '?' in filename:
3601
+ domain = get_domain(normalized_url)
3602
+ ext = '.pdf' # Default extension
3603
+ for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']:
3604
+ if common_ext in normalized_url.lower():
3605
+ ext = common_ext
3606
+ break
3607
+ filename = f"file_from_{domain}{ext}"
3608
+
3609
+ # Add file to results
3610
+ all_files.append({
3611
+ 'url': normalized_url,
3612
+ 'download_url': normalized_url,
3613
+ 'filename': filename,
3614
+ 'size': size_str,
3615
+ 'metadata': {}
3616
+ })
3617
+ file_count_text.text(f"Found {len(all_files)} total files")
3618
+ continue
3619
+
3620
+ # For regular links, use a longer timeout for ASP.NET pages, which can be slower
3621
  sub_timeout = timeout * 2 if is_aspnet else timeout
3622
 
3623
+ # Skip already visited URLs
3624
+ if sublink in self.visited_urls:
3625
+ continue
3626
+
3627
  # Extract files from sublink
3628
  sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
3629
  all_files.extend(sub_files)
 
3742
  if custom_ext_list != valid_ext_list:
3743
  st.warning("Invalid extensions ignored. Use format like '.csv'.")
3744
 
3745
+ # Reset RAG engine for new search
3746
+ st.session_state.rag_indexed = False
3747
+ st.session_state.rag_engine = None
3748
+
3749
+ # Define a function to run the deep search
3750
+ async def run_deep_search():
3751
+ async with DownloadManager(
3752
+ use_proxy=use_proxy,
3753
+ proxy=proxy,
3754
+ use_stealth=use_stealth
3755
+ ) as dm:
3756
+ files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
3757
+ return files
3758
 
3759
  with st.spinner("Searching for files..."):
3760
+ files = asyncio.run(run_deep_search())
 
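Streamlit scripts run synchronously, so every call into the async DownloadManager is wrapped in a small coroutine and handed to asyncio.run, as above; the same wrapper is re-declared for single-file downloads and again in the Bing handler further down. A generic helper along these lines would avoid the repetition (a sketch only, not part of the commit; it assumes the app.py context where DownloadManager and asyncio are imported):

# Illustrative helper for running DownloadManager coroutines from Streamlit code.
def run_with_manager(coro_factory, **manager_kwargs):
    async def _runner():
        async with DownloadManager(**manager_kwargs) as dm:
            return await coro_factory(dm)
    return asyncio.run(_runner())

# Equivalent to the run_deep_search() wrapper above:
# files = run_with_manager(
#     lambda dm: dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout),
#     use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth,
# )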
3761
 
3762
  if files:
3763
  st.session_state.discovered_files = files
 
3780
  if st.button("Search Files", key="rag_search_btn") and search_query:
3781
  # Initialize RAG search engine
3782
  if not st.session_state.rag_indexed:
3783
+ rag_search = EnhancedRAGSearch()
3784
 
3785
  with st.spinner("Indexing files for search..."):
3786
  # First download files to extract text
 
3793
  for i, file_info in enumerate(files):
3794
  # Only process common text-based file formats
3795
  ext = os.path.splitext(file_info['filename'])[1].lower()
3796
+ if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']:
3797
  path = await dm.download_file(file_info, temp_dir, url)
3798
  if path:
3799
  with open(path, 'rb') as f:
 
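The indexing path here downloads each text-like file, hands its raw bytes to EnhancedRAGSearch, and only builds the vector index once everything has been added. A condensed sketch of that flow using the calls that appear in this diff (the file list and the query are placeholders):

# Condensed sketch of the index-then-search flow used in this section.
rag = EnhancedRAGSearch()
for file_data, file_info in downloaded_blobs:      # placeholder: (bytes, dict) pairs
    rag.add_file(file_data, file_info)             # extracts text, keeps the metadata
if rag.build_index():                              # vectorize once, after all files are added
    for hit in rag.search("past exam papers"):     # placeholder query
        print(hit['rank'], hit['score'], hit['file_info']['filename'])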
3826
  for result in search_results:
3827
  file_info = result['file_info']
3828
  score = result['score']
3829
+ match_type = result.get('match_type', 'document')
3830
+
3831
  with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
3832
  st.write(f"Size: {file_info['size']}")
3833
+ st.write(f"Match type: {match_type}")
3834
+
3835
+ # Show language if available
3836
+ if 'language' in result:
3837
+ st.write(f"Language: {result['language']}")
3838
+
3839
+ # Show metadata if available
3840
  if 'metadata' in file_info and file_info['metadata']:
3841
  st.write("Metadata:")
3842
  for k, v in file_info['metadata'].items():
3843
  if k != 'file_id': # Skip technical details
3844
  st.write(f"- {k}: {v}")
3845
 
3846
+ # Show content preview for chunk matches
3847
+ if 'chunk_preview' in result:
3848
+ st.write("Content preview:")
3849
+ st.text(result['chunk_preview'])
3850
+
3851
  # Add direct download button
3852
  if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
3853
  with st.spinner(f"Downloading {file_info['filename']}..."):
 
4030
  # Create expanders for each result
4031
  for i, url in enumerate(urls, 1):
4032
  with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
4033
+ st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}", on_click=set_deep_search_url, args=(url,))
 
 
4034
  else:
4035
  st.warning("No search results found.")
4036
 
4037
  asyncio.run(run_search())
4038
 
4039
+ # Handle deep search - using on_click function to avoid state issues
4040
+ if 'deep_search_url' in st.session_state and st.session_state.deep_search_url:
4041
+ url = st.session_state.deep_search_url
4042
+ st.info(f"Deep searching: {url}")
4043
+
4044
+ # Set up custom extensions
4045
+ custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
4046
+ valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
4047
+
4048
+ # Reset RAG engine for new search
4049
+ st.session_state.rag_indexed = False
4050
+ st.session_state.rag_engine = None
4051
+
4052
+ # Run the deep search
4053
+ async def run_bing_deep_search():
4054
+ async with DownloadManager(
4055
+ use_proxy=use_proxy,
4056
+ proxy=proxy,
4057
+ use_stealth=use_stealth
4058
+ ) as dm:
4059
+ files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
4060
+ return files
4061
+
4062
+ with st.spinner("Searching for files..."):
4063
+ files = asyncio.run(run_bing_deep_search())
4064
+
4065
+ if files:
4066
+ st.session_state.discovered_files = files
4067
+ st.session_state.current_url = url
4068
+ st.success(f"Found {len(files)} files!")
4069
 
4070
+ # Show files with direct download options
4071
+ download_dir = "./downloads"
4072
+ os.makedirs(download_dir, exist_ok=True)
4073
 
4074
+ # Individual file display with direct download buttons
4075
+ for i, file in enumerate(files):
4076
+ col1, col2, col3 = st.columns([3, 1, 1])
4077
+ with col1:
4078
+ filename = file['filename']
4079
+ size = file['size']
4080
+ meta = file.get('metadata', {})
4081
+ file_info = f"{filename} ({size})"
4082
+ if meta and 'Pages' in meta:
4083
+ file_info += f" - {meta.get('Pages', '')} pages"
4084
+ st.markdown(f"**{i+1}. {file_info}**")
4085
+
4086
+ with col2:
4087
+ # Add direct download button for each file
4088
+ if st.button(f"Download", key=f"direct_dl_bing_{i}"):
4089
+ with st.spinner(f"Downloading {filename}..."):
4090
+ async def download_single_file():
4091
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
4092
+ path = await dm.download_file(file, download_dir, url)
4093
+ return path
4094
+
4095
+ downloaded_path = asyncio.run(download_single_file())
4096
+ if downloaded_path:
4097
+ with open(downloaded_path, "rb") as f:
4098
+ file_data = f.read()
4099
 
4100
+ st.download_button(
4101
+ label=f"Save {filename}",
4102
+ data=file_data,
4103
+ file_name=filename,
4104
+ mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
4105
+ key=f"save_bing_file_{i}"
4106
+ )
4107
+
4108
+ with col3:
4109
+ # Add to selection for batch download
4110
+ if i in st.session_state.selected_files:
4111
+ if st.button("Unselect", key=f"bing_unselect_{i}"):
4112
+ st.session_state.selected_files.remove(i)
4113
+ else:
4114
+ if st.button("Select", key=f"bing_select_{i}"):
4115
+ st.session_state.selected_files.append(i)
4116
+
4117
+ # Add RAG Search interface for Bing results
4118
+ st.markdown("### Search Within Discovered Files")
4119
+ search_query = st.text_input("Enter search terms", key="bing_rag_search_query")
4120
+
4121
+ if st.button("Search Files", key="bing_rag_search_btn") and search_query:
4122
+ # Initialize RAG search engine
4123
+ if not st.session_state.rag_indexed:
4124
+ rag_search = EnhancedRAGSearch()
4125
 
4126
+ with st.spinner("Indexing files for search..."):
4127
+ # First download files to extract text
4128
+ temp_dir = "./temp_downloads"
4129
+ os.makedirs(temp_dir, exist_ok=True)
4130
+
4131
+ async def download_for_indexing():
4132
+ downloaded = 0
4133
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
4134
+ for i, file_info in enumerate(files):
4135
+ # Only process common text-based file formats
4136
+ ext = os.path.splitext(file_info['filename'])[1].lower()
4137
+ if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']:
4138
+ path = await dm.download_file(file_info, temp_dir, url)
4139
+ if path:
4140
+ with open(path, 'rb') as f:
4141
+ file_data = f.read()
4142
+
4143
+ # Add to search index
4144
+ if rag_search.add_file(file_data, file_info):
4145
+ downloaded += 1
4146
+
4147
+ # Clean up
4148
+ os.remove(path)
4149
+ return downloaded
4150
+
4151
+ indexed_count = asyncio.run(download_for_indexing())
4152
+ if indexed_count > 0:
4153
+ rag_search.build_index()
4154
+ st.session_state.rag_engine = rag_search
4155
+ st.session_state.rag_indexed = True
4156
+ st.success(f"Indexed {indexed_count} files for search")
4157
  else:
4158
+ st.warning("Could not index any files. Try with more text-based documents.")
4159
+
4160
+ # Perform the search
4161
+ if st.session_state.rag_indexed:
4162
+ search_results = st.session_state.rag_engine.search(search_query)
4163
+
4164
+ if search_results:
4165
+ st.write(f"Found {len(search_results)} relevant files:")
4166
+
4167
+ for result in search_results:
4168
+ file_info = result['file_info']
4169
+ score = result['score']
4170
+ match_type = result.get('match_type', 'document')
4171
+
4172
+ with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
4173
+ st.write(f"Size: {file_info['size']}")
4174
+ st.write(f"Match type: {match_type}")
4175
+
4176
+ # Show language if available
4177
+ if 'language' in result:
4178
+ st.write(f"Language: {result['language']}")
4179
+
4180
+ # Show metadata if available
4181
+ if 'metadata' in file_info and file_info['metadata']:
4182
+ st.write("Metadata:")
4183
+ for k, v in file_info['metadata'].items():
4184
+ if k != 'file_id': # Skip technical details
4185
+ st.write(f"- {k}: {v}")
4186
+
4187
+ # Show content preview for chunk matches
4188
+ if 'chunk_preview' in result:
4189
+ st.write("Content preview:")
4190
+ st.text(result['chunk_preview'])
4191
+
4192
+ # Add direct download button
4193
+ if st.button(f"Download this file", key=f"bing_rag_dl_{result['rank']}"):
4194
+ with st.spinner(f"Downloading {file_info['filename']}..."):
4195
+ async def download_search_result():
4196
+ async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
4197
+ path = await dm.download_file(file_info, download_dir, url)
4198
+ return path
4199
+
4200
+ path = asyncio.run(download_search_result())
4201
+ if path:
4202
+ with open(path, "rb") as f:
4203
+ file_data = f.read()
4204
+
4205
+ st.download_button(
4206
+ label=f"Save {file_info['filename']}",
4207
+ data=file_data,
4208
+ file_name=file_info['filename'],
4209
+ mime=mimetypes.guess_type(path)[0] or "application/octet-stream",
4210
+ key=f"save_bing_rag_{result['rank']}"
4211
+ )
4212
+ else:
4213
+ st.warning("No matching files found for your query.")
4214
+ else:
4215
+ st.warning("No files found.")
4216
+
4217
+ # Reset the deep search URL after processing
4218
+ st.session_state.deep_search_url = None
4219
 
4220
  # Add a special section for direct Google Drive file download
4221
  st.markdown("---")
 
4261
 
4262
  # Add footer with attribution
4263
  st.markdown('---')
4264
+ st.markdown('Created by [Euler314](https://github.com/euler314)')
4265
+
4266
+ # Helper used by the Bing-result buttons to store the selected deep-search URL
4267
+ def set_deep_search_url(url):
4268
+ st.session_state.deep_search_url = url
4269
 
4270
  if __name__ == "__main__":
4271
  main()
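A closing note on the deep-search handoff wired up above: the "Deep Search Result {i}" buttons never run the search themselves. Their on_click callback, set_deep_search_url (defined near the end of the file), only stores the chosen URL in st.session_state, and the `if 'deep_search_url' in st.session_state ...` block runs the search on the rerun Streamlit performs after the click, then clears the value. A stripped-down sketch of the same pattern (widget labels and state keys here are illustrative):

# Minimal sketch of the click -> session_state -> rerun handoff.
import streamlit as st

def remember_choice(url):
    st.session_state.pending_url = url        # executed during the click, before the rerun

st.button("Deep search example.com", on_click=remember_choice, args=("https://example.com",))

if st.session_state.get("pending_url"):
    st.info(f"Deep searching: {st.session_state.pending_url}")
    # ... run the expensive work here ...
    st.session_state.pending_url = None       # reset so it only runs once per click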