euler314 committed
Commit 907ffd6 · verified · Parent(s): b9d5bbe

Update app.py

Files changed (1): app.py (+442 -285)

app.py CHANGED
@@ -90,7 +90,7 @@ def load_models():
 
     # Load SentenceTransformer
     try:
-        semantic_model = SentenceTransformer('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')
     except Exception as e:
         st.error(f"Error loading SentenceTransformer: {e}")
         semantic_model = None
@@ -314,12 +314,14 @@ class DownloadManager:
 
         parsed_base = urlparse(final_url)
         base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
         for a in soup.find_all('a', href=True):
             href = a['href'].strip()
 
             if '.php' in href.lower() or 'download' in href.lower():
-                full_url = href if href.startswith('http') else f"{base_url}{href}"
                 real_url = await self.extract_real_download_url(full_url)
                 if real_url and real_url != full_url:
                     found_files.append({
@@ -331,7 +333,7 @@ class DownloadManager:
                 continue
 
             if any(href.lower().endswith(ext) for ext in all_exts):
-                file_url = href if href.startswith('http') else f"{base_url}{href}"
                 size_str = await self.get_file_size(file_url)
                 meta = {}
                 if file_url.lower().endswith('.pdf'):
@@ -373,6 +375,41 @@ class DownloadManager:
                     }
                 })
 
         seen_urls = set()
         unique_files = []
         for f in found_files:
@@ -882,67 +919,6 @@ class DownloadManager:
882
  logger.warning("Standard download methods failed")
883
  return False
884
 
885
- async def get_google_drive_file_info(self, file_id):
886
- """Get file type and view-only status from Google Drive"""
887
- file_type = None
888
- is_view_only = False
889
-
890
- try:
891
- async with self.context.new_page() as page:
892
- await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
893
-
894
- # Check if view-only
895
- view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
896
- is_view_only = view_only_text is not None
897
-
898
- # Check for Google Docs viewer
899
- gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
900
- gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
901
- gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
902
-
903
- if gdocs_viewer:
904
- file_type = 'docx'
905
- elif gsheets_viewer:
906
- file_type = 'xlsx'
907
- elif gslides_viewer:
908
- file_type = 'pptx'
909
- else:
910
- # Check for PDF viewer
911
- pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
912
- if pdf_viewer:
913
- file_type = 'pdf'
914
- else:
915
- # Check for image viewer
916
- img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
917
- if img_viewer:
918
- # Get image type from src
919
- img_src = await img_viewer.get_attribute('src')
920
- if 'jpg' in img_src or 'jpeg' in img_src:
921
- file_type = 'jpg'
922
- elif 'png' in img_src:
923
- file_type = 'png'
924
- else:
925
- file_type = 'jpg' # Default to jpg
926
- else:
927
- # Generic file type fallback
928
- file_type = 'pdf' # Default to PDF
929
-
930
- # If still no type, check filename
931
- if not file_type:
932
- title_element = await page.query_selector('div[role="heading"]')
933
- if title_element:
934
- title = await title_element.text_content()
935
- if title:
936
- ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
937
- if ext_match:
938
- file_type = ext_match.group(1).lower()
939
-
940
- except Exception as e:
941
- logger.error(f"Error getting Google Drive file info: {e}")
942
- file_type = 'pdf' # Default to PDF if we can't determine
943
-
944
- return file_type, is_view_only
945
-
946
  async def download_viewonly_pdf_with_js(self, file_id, save_path):
947
  """Download view-only PDF using JavaScript approach - improved version"""
948
  try:
@@ -954,245 +930,134 @@ class DownloadManager:
954
  view_url = f"https://drive.google.com/file/d/{file_id}/view"
955
  await page.goto(view_url, wait_until='networkidle', timeout=60000)
956
 
957
- # Wait for rendering
958
  await page.wait_for_timeout(2000)
959
 
960
- # Inject required libraries - use CDN for jsPDF
961
- await page.evaluate("""
962
- async function injectLibraries() {
963
- // Add jsPDF
964
- return new Promise((resolve) => {
965
- const jspdfScript = document.createElement('script');
966
- jspdfScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
967
- jspdfScript.onload = () => resolve(true);
968
- document.head.appendChild(jspdfScript);
969
- });
970
- }
971
- return injectLibraries();
972
- """)
973
-
974
- # Wait for libraries to load
975
- await page.wait_for_timeout(2000)
976
-
977
- # Scroll through document to load all pages
978
  await page.evaluate("""
979
  async function scrollThroughDocument() {
980
  const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
981
  const container = document.querySelector('.drive-viewer-paginated-scrollable');
982
  if (!container) return false;
983
 
 
984
  const scrollHeight = container.scrollHeight;
985
  const viewportHeight = container.clientHeight;
986
- const scrollStep = viewportHeight / 2;
987
 
988
- for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
 
989
  container.scrollTo(0, scrollPos);
990
- await delay(500);
991
  }
992
 
993
  // One final scroll to bottom to ensure everything is loaded
994
  container.scrollTo(0, scrollHeight);
995
- await delay(1000);
996
 
997
  // Scroll back to top for PDF creation
998
  container.scrollTo(0, 0);
999
- await delay(500);
1000
 
1001
  return true;
1002
  }
1003
  return scrollThroughDocument();
1004
  """)
1005
 
1006
- # Wait after scrolling
1007
- await page.wait_for_timeout(2000)
1008
-
1009
- # Use the improved PDF creation script that captures all pages
1010
  pdf_base64 = await page.evaluate("""
1011
  async function createPDF() {
1012
  try {
1013
- // Make sure jsPDF is loaded
1014
- if (typeof window.jspdf === 'undefined') {
1015
- console.error('jsPDF not loaded');
1016
- return null;
1017
- }
 
 
 
1018
 
 
 
 
1019
  const { jsPDF } = window.jspdf;
1020
- const pdf = new jsPDF();
 
 
1021
 
1022
- // Get all page elements
1023
- const pages = document.querySelectorAll('.drive-viewer-paginated-page');
1024
- console.log('Found pages:', pages.length);
1025
 
1026
- if (pages.length === 0) {
1027
- // Alternative: try to find images directly
1028
- const images = Array.from(document.querySelectorAll('img')).filter(img =>
1029
- img.src.startsWith('blob:') && img.width > 100 && img.height > 100
1030
- );
1031
 
1032
- console.log('Found images:', images.length);
1033
 
1034
- if (images.length === 0) {
1035
- return null;
1036
- }
 
 
1037
 
1038
- // Process each image
1039
- for (let i = 0; i < images.length; i++) {
1040
- const img = images[i];
1041
-
1042
- if (i > 0) {
1043
- pdf.addPage();
1044
- }
1045
-
1046
- // Create canvas and draw image
1047
- const canvas = document.createElement('canvas');
1048
- canvas.width = img.width;
1049
- canvas.height = img.height;
1050
- const ctx = canvas.getContext('2d');
1051
- ctx.drawImage(img, 0, 0, img.width, img.height);
1052
-
1053
- // Add to PDF
1054
- const imgData = canvas.toDataURL('image/jpeg', 0.95);
1055
-
1056
- // Calculate dimensions
1057
- const pageWidth = pdf.internal.pageSize.getWidth();
1058
- const pageHeight = pdf.internal.pageSize.getHeight();
1059
- const imgRatio = img.height / img.width;
1060
-
1061
- let imgWidth = pageWidth - 10;
1062
- let imgHeight = imgWidth * imgRatio;
1063
-
1064
- if (imgHeight > pageHeight - 10) {
1065
- imgHeight = pageHeight - 10;
1066
- imgWidth = imgHeight / imgRatio;
1067
- }
1068
-
1069
- // Center on page
1070
- const x = (pageWidth - imgWidth) / 2;
1071
- const y = (pageHeight - imgHeight) / 2;
1072
-
1073
- pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
1074
  }
1075
- } else {
1076
- // Process each page
1077
- const container = document.querySelector('.drive-viewer-paginated-scrollable');
1078
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
1079
 
1080
- for (let i = 0; i < pages.length; i++) {
1081
- // Add a new page for each page after the first
1082
- if (i > 0) {
1083
- pdf.addPage();
1084
- }
1085
-
1086
- // Scroll to the page and wait for it to render
1087
- pages[i].scrollIntoView();
1088
- await delay(300);
1089
-
1090
- // Find the image element inside the page
1091
- const pageImages = pages[i].querySelectorAll('img');
1092
- let targetImage = null;
1093
-
1094
- for (const img of pageImages) {
1095
- if (img.src.startsWith('blob:') && img.width > 50 && img.height > 50) {
1096
- targetImage = img;
1097
- break;
1098
- }
1099
- }
1100
-
1101
- if (!targetImage) {
1102
- // If no image found, try taking a screenshot of the page instead
1103
- const pageCanvas = document.createElement('canvas');
1104
- pageCanvas.width = pages[i].clientWidth;
1105
- pageCanvas.height = pages[i].clientHeight;
1106
- const ctx = pageCanvas.getContext('2d');
1107
-
1108
- // Draw the page background
1109
- ctx.fillStyle = 'white';
1110
- ctx.fillRect(0, 0, pageCanvas.width, pageCanvas.height);
1111
-
1112
- // Use html2canvas approach
1113
- try {
1114
- await delay(100);
1115
- // Just draw what we can see
1116
- const allElements = pages[i].querySelectorAll('*');
1117
- for (const el of allElements) {
1118
- if (el.tagName === 'IMG' && el.complete && el.src) {
1119
- const rect = el.getBoundingClientRect();
1120
- try {
1121
- ctx.drawImage(el, rect.left, rect.top, rect.width, rect.height);
1122
- } catch (e) {
1123
- console.error('Draw error:', e);
1124
- }
1125
- }
1126
- }
1127
- } catch (e) {
1128
- console.error('Canvas error:', e);
1129
- }
1130
-
1131
- // Add the canvas to the PDF
1132
- const imgData = pageCanvas.toDataURL('image/jpeg', 0.95);
1133
-
1134
- // Calculate dimensions
1135
- const pageWidth = pdf.internal.pageSize.getWidth();
1136
- const pageHeight = pdf.internal.pageSize.getHeight();
1137
- const imgRatio = pageCanvas.height / pageCanvas.width;
1138
-
1139
- let imgWidth = pageWidth - 10;
1140
- let imgHeight = imgWidth * imgRatio;
1141
-
1142
- if (imgHeight > pageHeight - 10) {
1143
- imgHeight = pageHeight - 10;
1144
- imgWidth = imgHeight / imgRatio;
1145
- }
1146
-
1147
- // Center on page
1148
- const x = (pageWidth - imgWidth) / 2;
1149
- const y = (pageHeight - imgHeight) / 2;
1150
-
1151
- pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
1152
- } else {
1153
- // Use the found image
1154
- const canvas = document.createElement('canvas');
1155
- canvas.width = targetImage.naturalWidth || targetImage.width;
1156
- canvas.height = targetImage.naturalHeight || targetImage.height;
1157
- const ctx = canvas.getContext('2d');
1158
-
1159
- // Draw image to canvas
1160
- try {
1161
- ctx.drawImage(targetImage, 0, 0, canvas.width, canvas.height);
1162
- } catch (e) {
1163
- console.error('Error drawing image:', e);
1164
- continue;
1165
- }
1166
-
1167
- // Add to PDF
1168
- const imgData = canvas.toDataURL('image/jpeg', 0.95);
1169
-
1170
- // Calculate dimensions
1171
- const pageWidth = pdf.internal.pageSize.getWidth();
1172
- const pageHeight = pdf.internal.pageSize.getHeight();
1173
- const imgRatio = canvas.height / canvas.width;
1174
-
1175
- let imgWidth = pageWidth - 10;
1176
- let imgHeight = imgWidth * imgRatio;
1177
-
1178
- if (imgHeight > pageHeight - 10) {
1179
- imgHeight = pageHeight - 10;
1180
- imgWidth = imgHeight / imgRatio;
1181
- }
1182
-
1183
- // Center on page
1184
- const x = (pageWidth - imgWidth) / 2;
1185
- const y = (pageHeight - imgHeight) / 2;
1186
-
1187
- pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
1188
- }
1189
  }
 
 
1190
  }
1191
 
1192
  // Return as base64
1193
  return pdf.output('datauristring');
1194
  } catch (e) {
1195
- console.error('PDF creation error:', e);
1196
  return null;
1197
  }
1198
  }
@@ -1200,7 +1065,6 @@ class DownloadManager:
1200
  """)
1201
 
1202
  if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
1203
- # If script method failed, try screenshot approach
1204
  logger.warning("PDF creation script failed, trying fallback method")
1205
  return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
1206
 
@@ -1222,7 +1086,7 @@ class DownloadManager:
1222
  except Exception as e:
1223
  logger.error(f"Error saving PDF: {e}")
1224
  return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
1225
-
1226
  except Exception as e:
1227
  logger.error(f"Error in view-only PDF download: {e}")
1228
  # Try fallback method
@@ -1409,72 +1273,365 @@ class DownloadManager:
1409
  logger.error(f"Error exporting Google Doc: {e}")
1410
  return False
1412
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
1413
  if not custom_ext_list:
1414
  custom_ext_list = []
1415
  progress_text = st.empty()
1416
  progress_bar = st.progress(0)
1417
  file_count_text = st.empty()
 
1418
  try:
1419
  progress_text.text("Analyzing main page...")
1420
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
1421
  initial_count = len(main_files)
1422
  file_count_text.text(f"Found {initial_count} files on main page")
 
 
1423
  progress_text.text("Getting sublinks...")
1424
  sublinks = await self.get_sublinks(url, sublink_limit)
1425
  total_links = len(sublinks)
1426
  progress_text.text(f"Found {total_links} sublinks to process")
 
1427
  if not sublinks:
1428
  progress_bar.progress(1.0)
1429
  return main_files
 
 
1430
  all_files = main_files
1431
  for i, sublink in enumerate(sublinks, 1):
1432
  progress = i / total_links
1433
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
1434
  progress_bar.progress(progress)
1435
- sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
1436
- all_files.extend(sub_files)
1437
- file_count_text.text(f"Found {len(all_files)} total files")
1438
  seen_urls = set()
1439
  unique_files = []
1440
  for f in all_files:
1441
  if f['url'] not in seen_urls:
1442
  seen_urls.add(f['url'])
1443
  unique_files.append(f)
 
1444
  final_count = len(unique_files)
1445
  progress_text.text(f"Deep search complete!")
1446
  file_count_text.text(f"Found {final_count} unique files")
1447
  progress_bar.progress(1.0)
1448
  return unique_files
 
1449
  except Exception as e:
1450
  logger.error(f"Deep search error: {e}")
1451
  progress_text.text(f"Error during deep search: {str(e)}")
1452
  return []
 
1453
  finally:
1454
  await asyncio.sleep(2)
1455
  if not st.session_state.get('keep_progress', False):
1456
  progress_text.empty()
1457
  progress_bar.empty()
1458
-
1459
- async def get_sublinks(self, url, limit=10000):
1460
- try:
1461
- await self.page.goto(url, timeout=30000)
1462
- content = await self.page.content()
1463
- soup = BeautifulSoup(content, 'html.parser')
1464
- parsed_base = urlparse(url)
1465
- base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
1466
- links = set()
1467
- for a in soup.find_all('a', href=True):
1468
- href = a['href'].strip()
1469
- if href.startswith('http'):
1470
- links.add(href)
1471
- elif href.startswith('/'):
1472
- links.add(f"{base_url}{href}")
1473
- return list(links)[:limit]
1474
- except Exception as e:
1475
- logger.error(f"Error getting sublinks: {e}")
1476
- return []
1477
-
1478
  # Utility Functions for New Features
1479
  def extract_keywords(text, n=5):
1480
  doc = nlp_model(text)
 
 
     # Load SentenceTransformer
     try:
+        semantic_model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat')
     except Exception as e:
         st.error(f"Error loading SentenceTransformer: {e}")
         semantic_model = None
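For reference, a handle like semantic_model is typically used to embed text and rank candidates by cosine similarity; a minimal sketch assuming the model loaded successfully (the query and corpus strings are illustrative, not part of app.py):

    from sentence_transformers import util

    # Illustrative only: embed a query and candidate filenames, then score them.
    query_emb = semantic_model.encode("financial report 2023", convert_to_tensor=True)
    corpus_emb = semantic_model.encode(["annual_report_2023.pdf", "budget.xlsx"], convert_to_tensor=True)
    scores = util.cos_sim(query_emb, corpus_emb)  # shape (1, number of candidates)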
 
314
 
315
  parsed_base = urlparse(final_url)
316
  base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
317
+ path_base = os.path.dirname(parsed_base.path)
318
 
319
+ # Process all anchor tags
320
  for a in soup.find_all('a', href=True):
321
  href = a['href'].strip()
322
 
323
  if '.php' in href.lower() or 'download' in href.lower():
324
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
325
  real_url = await self.extract_real_download_url(full_url)
326
  if real_url and real_url != full_url:
327
  found_files.append({
 
333
  continue
334
 
335
  if any(href.lower().endswith(ext) for ext in all_exts):
336
+ file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
337
  size_str = await self.get_file_size(file_url)
338
  meta = {}
339
  if file_url.lower().endswith('.pdf'):
 
375
  }
376
  })
377
 
378
+ # Also check for files in other elements (iframe, embed, object, etc.)
379
+ other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
380
+ for elem in other_elements:
381
+ src = elem.get('src') or elem.get('data')
382
+ if src and any(src.lower().endswith(ext) for ext in all_exts):
383
+ file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
384
+ size_str = await self.get_file_size(file_url)
385
+ meta = {}
386
+ if file_url.lower().endswith('.pdf'):
387
+ meta = await self.get_pdf_metadata(file_url)
388
+ found_files.append({
389
+ 'url': file_url,
390
+ 'filename': os.path.basename(file_url.split('?')[0]),
391
+ 'size': size_str,
392
+ 'metadata': meta
393
+ })
394
+
395
+ # Check for file links in onclick attributes
396
+ onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
397
+ for elem in onclick_elements:
398
+ onclick = await elem.get_attribute('onclick')
399
+ urls = re.findall(r'(https?://[^\'"]+)', onclick)
400
+ for url_match in urls:
401
+ if any(url_match.lower().endswith(ext) for ext in all_exts):
402
+ size_str = await self.get_file_size(url_match)
403
+ meta = {}
404
+ if url_match.lower().endswith('.pdf'):
405
+ meta = await self.get_pdf_metadata(url_match)
406
+ found_files.append({
407
+ 'url': url_match,
408
+ 'filename': os.path.basename(url_match.split('?')[0]),
409
+ 'size': size_str,
410
+ 'metadata': meta
411
+ })
412
+
413
  seen_urls = set()
414
  unique_files = []
415
  for f in found_files:
 
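The onclick scan added above relies on re.findall(r'(https?://[^\'"]+)', onclick) to pull absolute URLs out of inline handlers. A small illustration of what that pattern captures (the handler string below is made up):

    import re

    onclick = "window.open('https://example.com/files/report.pdf'); return false;"  # hypothetical handler
    urls = re.findall(r'(https?://[^\'"]+)', onclick)
    # urls == ['https://example.com/files/report.pdf']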
919
  logger.warning("Standard download methods failed")
920
  return False
922
  async def download_viewonly_pdf_with_js(self, file_id, save_path):
923
  """Download view-only PDF using JavaScript approach - improved version"""
924
  try:
 
930
  view_url = f"https://drive.google.com/file/d/{file_id}/view"
931
  await page.goto(view_url, wait_until='networkidle', timeout=60000)
932
 
933
+ # Wait for initial rendering
934
  await page.wait_for_timeout(2000)
935
 
936
+ # CRITICAL: Scroll through entire document to ensure all content is cached
937
  await page.evaluate("""
938
  async function scrollThroughDocument() {
939
  const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
940
  const container = document.querySelector('.drive-viewer-paginated-scrollable');
941
  if (!container) return false;
942
 
943
+ // Get total scroll height
944
  const scrollHeight = container.scrollHeight;
945
  const viewportHeight = container.clientHeight;
 
946
 
947
+ // Scroll down in increments to load all pages
948
+ for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += viewportHeight) {
949
  container.scrollTo(0, scrollPos);
950
+ await delay(800); // Wait for content to load
951
  }
952
 
953
  // One final scroll to bottom to ensure everything is loaded
954
  container.scrollTo(0, scrollHeight);
955
+ await delay(1500);
956
 
957
  // Scroll back to top for PDF creation
958
  container.scrollTo(0, 0);
959
+ await delay(800);
960
 
961
  return true;
962
  }
963
  return scrollThroughDocument();
964
  """)
965
 
966
+ # Use simplified script similar to the one provided
967
  pdf_base64 = await page.evaluate("""
968
  async function createPDF() {
969
  try {
970
+ // Create jsPDF script element
971
+ const loadJsPDF = () => new Promise((resolve, reject) => {
972
+ let jspdf = document.createElement("script");
973
+ jspdf.onload = () => resolve();
974
+ jspdf.onerror = () => reject(new Error("Failed to load jsPDF"));
975
+ jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
976
+ document.body.appendChild(jspdf);
977
+ });
978
 
979
+ await loadJsPDF();
980
+
981
+ // Create PDF
982
  const { jsPDF } = window.jspdf;
983
+ let pdf = new jsPDF();
984
+ let elements = document.getElementsByTagName("img");
985
+ let pageCount = 0;
986
 
987
+ // First pass to find and sort all valid page images
988
+ let pageImages = [];
989
+ for (let i = 0; i < elements.length; i++) {
990
+ let img = elements[i];
991
+ // Only process blob images (these are the PDF pages)
992
+ if (!/^blob:/.test(img.src)) continue;
993
+
994
+ // Skip tiny images (usually icons, not content)
995
+ if (img.width < 100 || img.height < 100) continue;
996
+
997
+ pageImages.push(img);
998
+ }
999
+
1000
+ // Sort images by their position if possible
1001
+ try {
1002
+ pageImages.sort((a, b) => {
1003
+ const rectA = a.getBoundingClientRect();
1004
+ const rectB = b.getBoundingClientRect();
1005
+ return rectA.top - rectB.top;
1006
+ });
1007
+ } catch (e) {
1008
+ console.error("Error sorting images:", e);
1009
+ }
1010
 
1011
+ // Process each image as a page
1012
+ for (let i = 0; i < pageImages.length; i++) {
1013
+ let img = pageImages[i];
 
 
1014
 
1015
+ // Create canvas to draw the image
1016
+ let canvasElement = document.createElement('canvas');
1017
+ let con = canvasElement.getContext("2d");
1018
+ canvasElement.width = img.width;
1019
+ canvasElement.height = img.height;
1020
 
1021
+ // Draw image to canvas
1022
+ con.drawImage(img, 0, 0, img.width, img.height);
1023
+
1024
+ // Add image to PDF
1025
+ let imgData = canvasElement.toDataURL("image/jpeg", 0.95);
1026
 
1027
+ // Add a new page for each page after the first
1028
+ if (pageCount > 0) {
1029
+ pdf.addPage();
 
 
 
1030
  }
 
 
 
 
1031
 
1032
+ // Calculate dimensions to fit the page
1033
+ const pageWidth = pdf.internal.pageSize.getWidth();
1034
+ const pageHeight = pdf.internal.pageSize.getHeight();
1035
+ const imgRatio = img.height / img.width;
1036
+
1037
+ let imgWidth = pageWidth;
1038
+ let imgHeight = imgWidth * imgRatio;
1039
+
1040
+ if (imgHeight > pageHeight) {
1041
+ imgHeight = pageHeight;
1042
+ imgWidth = imgHeight / imgRatio;
 
 
1043
  }
1044
+
1045
+ // Center on page
1046
+ const x = (pageWidth - imgWidth) / 2;
1047
+ const y = (pageHeight - imgHeight) / 2;
1048
+
1049
+ pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
1050
+ pageCount++;
1051
+ }
1052
+
1053
+ if (pageCount === 0) {
1054
+ return null; // No pages found
1055
  }
1056
 
1057
  // Return as base64
1058
  return pdf.output('datauristring');
1059
  } catch (e) {
1060
+ console.error("PDF creation error:", e);
1061
  return null;
1062
  }
1063
  }
 
1065
  """)
1066
 
1067
  if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
 
1068
  logger.warning("PDF creation script failed, trying fallback method")
1069
  return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
1070
 
 
1086
  except Exception as e:
1087
  logger.error(f"Error saving PDF: {e}")
1088
  return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
1089
+
1090
  except Exception as e:
1091
  logger.error(f"Error in view-only PDF download: {e}")
1092
  # Try fallback method
 
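For context, the save step referenced by "Error saving PDF" (not shown in this hunk) amounts to stripping the data-URI prefix and base64-decoding the remainder; a minimal sketch using the save_path from the method signature:

    import base64

    # pdf_base64 is the 'data:application/pdf;base64,...' string returned by page.evaluate
    _, _, payload = pdf_base64.partition('base64,')
    with open(save_path, 'wb') as f:
        f.write(base64.b64decode(payload))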
1273
  logger.error(f"Error exporting Google Doc: {e}")
1274
  return False
1275
 
1276
+ async def get_google_drive_file_info(self, file_id):
1277
+ """Get file type and view-only status from Google Drive"""
1278
+ file_type = None
1279
+ is_view_only = False
1280
+
1281
+ try:
1282
+ async with self.context.new_page() as page:
1283
+ await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
1284
+
1285
+ # Check if view-only
1286
+ view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
1287
+ is_view_only = view_only_text is not None
1288
+
1289
+ # Check for Google Docs viewer
1290
+ gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
1291
+ gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
1292
+ gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
1293
+
1294
+ if gdocs_viewer:
1295
+ file_type = 'docx'
1296
+ elif gsheets_viewer:
1297
+ file_type = 'xlsx'
1298
+ elif gslides_viewer:
1299
+ file_type = 'pptx'
1300
+ else:
1301
+ # Check for PDF viewer
1302
+ pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
1303
+ if pdf_viewer:
1304
+ file_type = 'pdf'
1305
+ else:
1306
+ # Check for image viewer
1307
+ img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
1308
+ if img_viewer:
1309
+ # Get image type from src
1310
+ img_src = await img_viewer.get_attribute('src')
1311
+ if 'jpg' in img_src or 'jpeg' in img_src:
1312
+ file_type = 'jpg'
1313
+ elif 'png' in img_src:
1314
+ file_type = 'png'
1315
+ else:
1316
+ file_type = 'jpg' # Default to jpg
1317
+ else:
1318
+ # Generic file type fallback
1319
+ file_type = 'pdf' # Default to PDF
1320
+
1321
+ # If still no type, check filename
1322
+ if not file_type:
1323
+ title_element = await page.query_selector('div[role="heading"]')
1324
+ if title_element:
1325
+ title = await title_element.text_content()
1326
+ if title:
1327
+ ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
1328
+ if ext_match:
1329
+ file_type = ext_match.group(1).lower()
1330
+
1331
+ except Exception as e:
1332
+ logger.error(f"Error getting Google Drive file info: {e}")
1333
+ file_type = 'pdf' # Default to PDF if we can't determine
1334
+
1335
+ return file_type, is_view_only
1336
+
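get_google_drive_file_info() expects a bare file_id; callers would normally derive it from a share link first. A hedged sketch of that extraction (the helper name and URL are illustrative, not part of this commit):

    import re

    def extract_drive_file_id(url):
        # Handles /file/d/<id>/ and ?id=<id> style Google Drive links.
        m = re.search(r'/file/d/([a-zA-Z0-9_-]+)|[?&]id=([a-zA-Z0-9_-]+)', url)
        return (m.group(1) or m.group(2)) if m else None

    extract_drive_file_id("https://drive.google.com/file/d/1AbCdEfG/view")  # -> '1AbCdEfG'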
1337
+ async def get_sublinks(self, url, limit=10000):
1338
+ """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
1339
+ links = set()
1340
+ try:
1341
+ logger.info(f"Fetching sublinks from: {url}")
1342
+
1343
+ # Go to page and wait for full load
1344
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
1345
+
1346
+ # Get base URL for resolving relative links
1347
+ parsed_base = urlparse(url)
1348
+ base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
1349
+ path_base = os.path.dirname(parsed_base.path)
1350
+
1351
+ # Check if page has ASP.NET elements which might need special handling
1352
+ is_aspnet = await self.page.evaluate('''
1353
+ () => {
1354
+ return document.querySelector('form#aspnetForm') !== null ||
1355
+ document.querySelector('input[name="__VIEWSTATE"]') !== null;
1356
+ }
1357
+ ''')
1358
+
1359
+ if is_aspnet:
1360
+ logger.info("Detected ASP.NET page, using enhanced extraction method")
1361
+
1362
+ # Try to interact with ASP.NET controls that might reveal more links
1363
+ # Look for dropdowns, buttons, and grid elements
1364
+ dropdowns = await self.page.query_selector_all('select')
1365
+ buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
1366
+
1367
+ # Try interacting with dropdowns first
1368
+ for dropdown in dropdowns:
1369
+ try:
1370
+ # Get all options
1371
+ options = await self.page.evaluate('''
1372
+ (dropdown) => {
1373
+ return Array.from(dropdown.options).map(o => o.value);
1374
+ }
1375
+ ''', dropdown)
1376
+
1377
+ # Try selecting each option
1378
+ for option in options:
1379
+ if option:
1380
+ await dropdown.select_option(value=option)
1381
+ await self.page.wait_for_timeout(1000)
1382
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
1383
+
1384
+ # Extract any new links that appeared
1385
+ await self.extract_all_link_types(links, base_url, path_base)
1386
+ except Exception as e:
1387
+ logger.warning(f"Error interacting with dropdown: {e}")
1388
+
1389
+ # Try clicking buttons (but avoid dangerous ones like "delete")
1390
+ safe_buttons = []
1391
+ for button in buttons:
1392
+ button_text = await button.text_content() or ""
1393
+ button_value = await button.get_attribute("value") or ""
1394
+ button_id = await button.get_attribute("id") or ""
1395
+ combined_text = (button_text + button_value + button_id).lower()
1396
+
1397
+ # Skip potentially destructive buttons
1398
+ if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
1399
+ continue
1400
+
1401
+ # Prioritize buttons that might show more content
1402
+ if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
1403
+ safe_buttons.append(button)
1404
+
1405
+ # Click the safe buttons
1406
+ for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks
1407
+ try:
1408
+ await button.click()
1409
+ await self.page.wait_for_timeout(1000)
1410
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
1411
+
1412
+ # Extract any new links that appeared
1413
+ await self.extract_all_link_types(links, base_url, path_base)
1414
+ except Exception as e:
1415
+ logger.warning(f"Error clicking button: {e}")
1416
+
1417
+ # Extract links from the initial page state
1418
+ await self.extract_all_link_types(links, base_url, path_base)
1419
+
1420
+ # Look specifically for links inside grid/table views which are common in ASP.NET applications
1421
+ grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
1422
+ for cell in grid_cells:
1423
+ try:
1424
+ href = await cell.get_attribute('href')
1425
+ if href:
1426
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1427
+ links.add(full_url)
1428
+ except Exception as e:
1429
+ logger.warning(f"Error extracting grid link: {e}")
1430
+
1431
+ # Extract links from onclick attributes and javascript:__doPostBack calls
1432
+ postback_links = await self.page.evaluate('''
1433
+ () => {
1434
+ const results = [];
1435
+ // Find elements with onclick containing __doPostBack
1436
+ const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
1437
+ for (const el of elements) {
1438
+ // Extract the postback target
1439
+ const onclick = el.getAttribute('onclick') || '';
1440
+ const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
1441
+ if (match && match[1]) {
1442
+ // Get the visible text to use as description
1443
+ const text = el.innerText || el.textContent || 'Link';
1444
+ results.push({
1445
+ id: match[1],
1446
+ text: text.trim()
1447
+ });
1448
+ }
1449
+ }
1450
+ return results;
1451
+ }
1452
+ ''')
1453
+
1454
+ # Try interacting with some of the postback links
1455
+ for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions
1456
+ try:
1457
+ logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
1458
+ await self.page.evaluate(f'''
1459
+ () => {{
1460
+ if (typeof __doPostBack === 'function') {{
1461
+ __doPostBack('{postback["id"]}', '');
1462
+ }}
1463
+ }}
1464
+ ''')
1465
+ await self.page.wait_for_timeout(1500)
1466
+ await self.page.wait_for_load_state('networkidle', timeout=5000)
1467
+
1468
+ # Extract any new links that appeared
1469
+ await self.extract_all_link_types(links, base_url, path_base)
1470
+ except Exception as e:
1471
+ logger.warning(f"Error with postback: {e}")
1472
+
1473
+ logger.info(f"Found {len(links)} sublinks")
1474
+ return list(links)[:limit]
1475
+
1476
+ except Exception as e:
1477
+ logger.error(f"Error getting sublinks from {url}: {e}")
1478
+ return list(links)[:limit] # Return what we have so far
1479
+
1480
+ async def extract_all_link_types(self, links_set, base_url, path_base):
1481
+ """Extract all types of links from the current page"""
1482
+ # Get all <a> tag links
1483
+ a_links = await self.page.query_selector_all('a[href]')
1484
+ for a in a_links:
1485
+ try:
1486
+ href = await a.get_attribute('href')
1487
+ if href and not href.startswith('javascript:') and not href.startswith('#'):
1488
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1489
+ links_set.add(full_url)
1490
+ except Exception:
1491
+ pass
1492
+
1493
+ # Get iframe sources
1494
+ iframes = await self.page.query_selector_all('iframe[src]')
1495
+ for iframe in iframes:
1496
+ try:
1497
+ src = await iframe.get_attribute('src')
1498
+ if src and not src.startswith('javascript:') and not src.startswith('about:'):
1499
+ full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
1500
+ links_set.add(full_url)
1501
+ except Exception:
1502
+ pass
1503
+
1504
+ # Get links from onclick attributes that reference URLs
1505
+ onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
1506
+ for el in onclick_elements:
1507
+ try:
1508
+ onclick = await el.get_attribute('onclick')
1509
+ urls = re.findall(r'(https?://[^\'"]+)', onclick)
1510
+ for url in urls:
1511
+ links_set.add(url)
1512
+ except Exception:
1513
+ pass
1514
+
1515
+ # Look for URLs in data-* attributes
1516
+ data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
1517
+ for el in data_elements:
1518
+ for attr in ['data-url', 'data-href', 'data-src']:
1519
+ try:
1520
+ value = await el.get_attribute(attr)
1521
+ if value and not value.startswith('javascript:'):
1522
+ full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
1523
+ links_set.add(full_url)
1524
+ except Exception:
1525
+ pass
1526
+
1527
+ # Look for special anchor links that might not have href attributes
1528
+ special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
1529
+ for anchor in special_anchors:
1530
+ try:
1531
+ href = await anchor.get_attribute('href')
1532
+ if href and not href.startswith('javascript:') and not href.startswith('#'):
1533
+ full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
1534
+ links_set.add(full_url)
1535
+ except Exception:
1536
+ pass
1537
+
1538
+ def resolve_relative_url(self, relative_url, base_url, path_base):
1539
+ """Properly resolve relative URLs considering multiple formats"""
1540
+ if relative_url.startswith('/'):
1541
+ # Absolute path relative to domain
1542
+ return f"{base_url}{relative_url}"
1543
+ elif relative_url.startswith('./'):
1544
+ # Explicit relative path
1545
+ return f"{base_url}{path_base}/{relative_url[2:]}"
1546
+ elif relative_url.startswith('../'):
1547
+ # Parent directory
1548
+ parent_path = '/'.join(path_base.split('/')[:-1])
1549
+ return f"{base_url}{parent_path}/{relative_url[3:]}"
1550
+ else:
1551
+ # Regular relative path
1552
+ return f"{base_url}{path_base}/{relative_url}"
1553
+
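resolve_relative_url() above hand-rolls the '/', './' and '../' cases; the standard library covers the same ground, so an equivalent formulation would be a thin wrapper over urllib.parse.urljoin. A sketch of that alternative (not what the commit uses):

    from urllib.parse import urljoin

    def resolve_relative_url(self, relative_url, base_url, path_base):
        # urljoin resolves '/', './', '../' and bare relative paths against the page URL.
        return urljoin(f"{base_url}{path_base}/", relative_url)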
1554
  async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
1555
  if not custom_ext_list:
1556
  custom_ext_list = []
1557
  progress_text = st.empty()
1558
  progress_bar = st.progress(0)
1559
  file_count_text = st.empty()
1560
+
1561
  try:
1562
  progress_text.text("Analyzing main page...")
1563
+ # Special handling for ASP.NET pages
1564
+ is_aspnet = False
1565
+ try:
1566
+ await self.page.goto(url, timeout=30000, wait_until='networkidle')
1567
+ is_aspnet = await self.page.evaluate('''
1568
+ () => {
1569
+ return document.querySelector('form#aspnetForm') !== null ||
1570
+ document.querySelector('input[name="__VIEWSTATE"]') !== null;
1571
+ }
1572
+ ''')
1573
+ except Exception:
1574
+ pass
1575
+
1576
+ # Extract files from main page
1577
  main_files = await self.extract_downloadable_files(url, custom_ext_list)
1578
  initial_count = len(main_files)
1579
  file_count_text.text(f"Found {initial_count} files on main page")
1580
+
1581
+ # Get sublinks with enhanced method
1582
  progress_text.text("Getting sublinks...")
1583
  sublinks = await self.get_sublinks(url, sublink_limit)
1584
  total_links = len(sublinks)
1585
  progress_text.text(f"Found {total_links} sublinks to process")
1586
+
1587
  if not sublinks:
1588
  progress_bar.progress(1.0)
1589
  return main_files
1590
+
1591
+ # Process each sublink
1592
  all_files = main_files
1593
  for i, sublink in enumerate(sublinks, 1):
1594
  progress = i / total_links
1595
  progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
1596
  progress_bar.progress(progress)
1597
+
1598
+ try:
1599
+ # Use a longer timeout for ASP.NET pages which can be slower
1600
+ sub_timeout = timeout * 2 if is_aspnet else timeout
1601
+
1602
+ # Extract files from sublink with appropriate timeout
1603
+ async with async_timeout(sub_timeout):
1604
+ sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
1605
+ all_files.extend(sub_files)
1606
+ file_count_text.text(f"Found {len(all_files)} total files")
1607
+ except Exception as e:
1608
+ logger.warning(f"Error processing sublink {sublink}: {e}")
1609
+
1610
+ # Deduplicate files
1611
  seen_urls = set()
1612
  unique_files = []
1613
  for f in all_files:
1614
  if f['url'] not in seen_urls:
1615
  seen_urls.add(f['url'])
1616
  unique_files.append(f)
1617
+
1618
  final_count = len(unique_files)
1619
  progress_text.text(f"Deep search complete!")
1620
  file_count_text.text(f"Found {final_count} unique files")
1621
  progress_bar.progress(1.0)
1622
  return unique_files
1623
+
1624
  except Exception as e:
1625
  logger.error(f"Deep search error: {e}")
1626
  progress_text.text(f"Error during deep search: {str(e)}")
1627
  return []
1628
+
1629
  finally:
1630
  await asyncio.sleep(2)
1631
  if not st.session_state.get('keep_progress', False):
1632
  progress_text.empty()
1633
  progress_bar.empty()
1634
+
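The sublink loop above wraps each extract_downloadable_files() call in 'async with async_timeout(sub_timeout):', but async_timeout itself is not defined in this diff. Assuming it refers to the async-timeout package, an import along these lines would be needed (a sketch of one possibility, not shown in the commit):

    from async_timeout import timeout as async_timeout

    # Usage as in deep_search(): cancel the body if it runs longer than sub_timeout seconds.
    # async with async_timeout(sub_timeout):
    #     sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)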
 
1635
  # Utility Functions for New Features
1636
  def extract_keywords(text, n=5):
1637
  doc = nlp_model(text)