Update app.py
app.py
CHANGED
@@ -90,7 +90,7 @@ def load_models():

    # Load SentenceTransformer
    try:
-        semantic_model = SentenceTransformer('
    except Exception as e:
        st.error(f"Error loading SentenceTransformer: {e}")
        semantic_model = None
@@ -314,12 +314,14 @@ class DownloadManager:

        parsed_base = urlparse(final_url)
        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"

        for a in soup.find_all('a', href=True):
            href = a['href'].strip()

            if '.php' in href.lower() or 'download' in href.lower():
-                full_url = href if href.startswith('http') else
                real_url = await self.extract_real_download_url(full_url)
                if real_url and real_url != full_url:
                    found_files.append({
@@ -331,7 +333,7 @@ class DownloadManager:
                continue

            if any(href.lower().endswith(ext) for ext in all_exts):
-                file_url = href if href.startswith('http') else
                size_str = await self.get_file_size(file_url)
                meta = {}
                if file_url.lower().endswith('.pdf'):
@@ -373,6 +375,41 @@ class DownloadManager:
                    }
                })

        seen_urls = set()
        unique_files = []
        for f in found_files:
@@ -882,67 +919,6 @@ class DownloadManager:
            logger.warning("Standard download methods failed")
            return False

-    async def get_google_drive_file_info(self, file_id):
-        """Get file type and view-only status from Google Drive"""
-        file_type = None
-        is_view_only = False
-
-        try:
-            async with self.context.new_page() as page:
-                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
-
-                # Check if view-only
-                view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
-                is_view_only = view_only_text is not None
-
-                # Check for Google Docs viewer
-                gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
-                gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
-                gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
-
-                if gdocs_viewer:
-                    file_type = 'docx'
-                elif gsheets_viewer:
-                    file_type = 'xlsx'
-                elif gslides_viewer:
-                    file_type = 'pptx'
-                else:
-                    # Check for PDF viewer
-                    pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
-                    if pdf_viewer:
-                        file_type = 'pdf'
-                    else:
-                        # Check for image viewer
-                        img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
-                        if img_viewer:
-                            # Get image type from src
-                            img_src = await img_viewer.get_attribute('src')
-                            if 'jpg' in img_src or 'jpeg' in img_src:
-                                file_type = 'jpg'
-                            elif 'png' in img_src:
-                                file_type = 'png'
-                            else:
-                                file_type = 'jpg'  # Default to jpg
-                        else:
-                            # Generic file type fallback
-                            file_type = 'pdf'  # Default to PDF
-
-                # If still no type, check filename
-                if not file_type:
-                    title_element = await page.query_selector('div[role="heading"]')
-                    if title_element:
-                        title = await title_element.text_content()
-                        if title:
-                            ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
-                            if ext_match:
-                                file_type = ext_match.group(1).lower()
-
-        except Exception as e:
-            logger.error(f"Error getting Google Drive file info: {e}")
-            file_type = 'pdf'  # Default to PDF if we can't determine
-
-        return file_type, is_view_only
-
    async def download_viewonly_pdf_with_js(self, file_id, save_path):
        """Download view-only PDF using JavaScript approach - improved version"""
        try:
@@ -954,245 +930,134 @@ class DownloadManager:
                view_url = f"https://drive.google.com/file/d/{file_id}/view"
                await page.goto(view_url, wait_until='networkidle', timeout=60000)

-                # Wait for rendering
                await page.wait_for_timeout(2000)

-                #
-                await page.evaluate("""
-                    async function injectLibraries() {
-                        // Add jsPDF
-                        return new Promise((resolve) => {
-                            const jspdfScript = document.createElement('script');
-                            jspdfScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
-                            jspdfScript.onload = () => resolve(true);
-                            document.head.appendChild(jspdfScript);
-                        });
-                    }
-                    return injectLibraries();
-                """)
-
-                # Wait for libraries to load
-                await page.wait_for_timeout(2000)
-
-                # Scroll through document to load all pages
                await page.evaluate("""
                    async function scrollThroughDocument() {
                        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                        const container = document.querySelector('.drive-viewer-paginated-scrollable');
                        if (!container) return false;

                        const scrollHeight = container.scrollHeight;
                        const viewportHeight = container.clientHeight;
-                        const scrollStep = viewportHeight / 2;

                            container.scrollTo(0, scrollPos);
-                            await delay(
                        }

                        // One final scroll to bottom to ensure everything is loaded
                        container.scrollTo(0, scrollHeight);
-                        await delay(

                        // Scroll back to top for PDF creation
                        container.scrollTo(0, 0);
-                        await delay(

                        return true;
                    }
                    return scrollThroughDocument();
                """)

-                #
-                await page.wait_for_timeout(2000)
-
-                # Use the improved PDF creation script that captures all pages
                pdf_base64 = await page.evaluate("""
                    async function createPDF() {
                        try {
-                            //
                            const { jsPDF } = window.jspdf;
-                            //
-                                img.src.startsWith('blob:') && img.width > 100 && img.height > 100
-                            );
-                            //
-                                if (i > 0) {
-                                    pdf.addPage();
-                                }
-
-                                // Create canvas and draw image
-                                const canvas = document.createElement('canvas');
-                                canvas.width = img.width;
-                                canvas.height = img.height;
-                                const ctx = canvas.getContext('2d');
-                                ctx.drawImage(img, 0, 0, img.width, img.height);
-
-                                // Add to PDF
-                                const imgData = canvas.toDataURL('image/jpeg', 0.95);
-
-                                // Calculate dimensions
-                                const pageWidth = pdf.internal.pageSize.getWidth();
-                                const pageHeight = pdf.internal.pageSize.getHeight();
-                                const imgRatio = img.height / img.width;
-
-                                let imgWidth = pageWidth - 10;
-                                let imgHeight = imgWidth * imgRatio;
-
-                                if (imgHeight > pageHeight - 10) {
-                                    imgHeight = pageHeight - 10;
-                                    imgWidth = imgHeight / imgRatio;
-                                }
-
-                                // Center on page
-                                const x = (pageWidth - imgWidth) / 2;
-                                const y = (pageHeight - imgHeight) / 2;
-
-                                pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
                            }
-                            } else {
-                                // Process each page
-                                const container = document.querySelector('.drive-viewer-paginated-scrollable');
-                                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
-
-                                    const pageImages = pages[i].querySelectorAll('img');
-                                    let targetImage = null;
-
-                                    for (const img of pageImages) {
-                                        if (img.src.startsWith('blob:') && img.width > 50 && img.height > 50) {
-                                            targetImage = img;
-                                            break;
-                                        }
-                                    }
-
-                                    if (!targetImage) {
-                                        // If no image found, try taking a screenshot of the page instead
-                                        const pageCanvas = document.createElement('canvas');
-                                        pageCanvas.width = pages[i].clientWidth;
-                                        pageCanvas.height = pages[i].clientHeight;
-                                        const ctx = pageCanvas.getContext('2d');
-
-                                        // Draw the page background
-                                        ctx.fillStyle = 'white';
-                                        ctx.fillRect(0, 0, pageCanvas.width, pageCanvas.height);
-
-                                        // Use html2canvas approach
-                                        try {
-                                            await delay(100);
-                                            // Just draw what we can see
-                                            const allElements = pages[i].querySelectorAll('*');
-                                            for (const el of allElements) {
-                                                if (el.tagName === 'IMG' && el.complete && el.src) {
-                                                    const rect = el.getBoundingClientRect();
-                                                    try {
-                                                        ctx.drawImage(el, rect.left, rect.top, rect.width, rect.height);
-                                                    } catch (e) {
-                                                        console.error('Draw error:', e);
-                                                    }
-                                                }
-                                            }
-                                        } catch (e) {
-                                            console.error('Canvas error:', e);
-                                        }
-
-                                        // Add the canvas to the PDF
-                                        const imgData = pageCanvas.toDataURL('image/jpeg', 0.95);
-
-                                        // Calculate dimensions
-                                        const pageWidth = pdf.internal.pageSize.getWidth();
-                                        const pageHeight = pdf.internal.pageSize.getHeight();
-                                        const imgRatio = pageCanvas.height / pageCanvas.width;
-
-                                        let imgWidth = pageWidth - 10;
-                                        let imgHeight = imgWidth * imgRatio;
-
-                                        if (imgHeight > pageHeight - 10) {
-                                            imgHeight = pageHeight - 10;
-                                            imgWidth = imgHeight / imgRatio;
-                                        }
-
-                                        // Center on page
-                                        const x = (pageWidth - imgWidth) / 2;
-                                        const y = (pageHeight - imgHeight) / 2;
-
-                                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
-                                    } else {
-                                        // Use the found image
-                                        const canvas = document.createElement('canvas');
-                                        canvas.width = targetImage.naturalWidth || targetImage.width;
-                                        canvas.height = targetImage.naturalHeight || targetImage.height;
-                                        const ctx = canvas.getContext('2d');
-
-                                        // Draw image to canvas
-                                        try {
-                                            ctx.drawImage(targetImage, 0, 0, canvas.width, canvas.height);
-                                        } catch (e) {
-                                            console.error('Error drawing image:', e);
-                                            continue;
-                                        }
-
-                                        // Add to PDF
-                                        const imgData = canvas.toDataURL('image/jpeg', 0.95);
-
-                                        // Calculate dimensions
-                                        const pageWidth = pdf.internal.pageSize.getWidth();
-                                        const pageHeight = pdf.internal.pageSize.getHeight();
-                                        const imgRatio = canvas.height / canvas.width;
-
-                                        let imgWidth = pageWidth - 10;
-                                        let imgHeight = imgWidth * imgRatio;
-
-                                        if (imgHeight > pageHeight - 10) {
-                                            imgHeight = pageHeight - 10;
-                                            imgWidth = imgHeight / imgRatio;
-                                        }
-
-                                        // Center on page
-                                        const x = (pageWidth - imgWidth) / 2;
-                                        const y = (pageHeight - imgHeight) / 2;
-
-                                        pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
-                                    }
                            }
                        }

                        // Return as base64
                        return pdf.output('datauristring');
                    } catch (e) {
-                        console.error(
                        return null;
                    }
                }
@@ -1200,7 +1065,6 @@ class DownloadManager:
                """)

                if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
-                    # If script method failed, try screenshot approach
                    logger.warning("PDF creation script failed, trying fallback method")
                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')

@@ -1222,7 +1086,7 @@ class DownloadManager:
                except Exception as e:
                    logger.error(f"Error saving PDF: {e}")
                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
-
        except Exception as e:
            logger.error(f"Error in view-only PDF download: {e}")
            # Try fallback method
@@ -1409,72 +1273,365 @@ class DownloadManager:
            logger.error(f"Error exporting Google Doc: {e}")
            return False

    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
        if not custom_ext_list:
            custom_ext_list = []
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
        try:
            progress_text.text("Analyzing main page...")
            main_files = await self.extract_downloadable_files(url, custom_ext_list)
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")
            progress_text.text("Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")
            if not sublinks:
                progress_bar.progress(1.0)
                return main_files
            all_files = main_files
            for i, sublink in enumerate(sublinks, 1):
                progress = i / total_links
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(progress)
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)
            final_count = len(unique_files)
            progress_text.text(f"Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)
            return unique_files
        except Exception as e:
            logger.error(f"Deep search error: {e}")
            progress_text.text(f"Error during deep search: {str(e)}")
            return []
        finally:
            await asyncio.sleep(2)
            if not st.session_state.get('keep_progress', False):
                progress_text.empty()
                progress_bar.empty()
-
-    async def get_sublinks(self, url, limit=10000):
-        try:
-            await self.page.goto(url, timeout=30000)
-            content = await self.page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-            parsed_base = urlparse(url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-            links = set()
-            for a in soup.find_all('a', href=True):
-                href = a['href'].strip()
-                if href.startswith('http'):
-                    links.add(href)
-                elif href.startswith('/'):
-                    links.add(f"{base_url}{href}")
-            return list(links)[:limit]
-        except Exception as e:
-            logger.error(f"Error getting sublinks: {e}")
-            return []
-
# Utility Functions for New Features
def extract_keywords(text, n=5):
    doc = nlp_model(text)

    # Load SentenceTransformer
    try:
+        semantic_model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat')
    except Exception as e:
        st.error(f"Error loading SentenceTransformer: {e}")
        semantic_model = None
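Note on this change: 'Qwen/Qwen1.5-0.5B-Chat' is a chat LLM checkpoint rather than a dedicated sentence-embedding model, so sentence-transformers will typically wrap it with mean pooling when loading it this way. The downstream calls are not part of this diff; the snippet below is only an illustrative sketch of how a loaded SentenceTransformer is usually used (query and candidate strings are made up, not from app.py).

# Hypothetical usage sketch -- not code from this commit.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat')  # model ID taken from the change above
query_vec = model.encode("annual report 2023", convert_to_tensor=True)
doc_vecs = model.encode(["Budget.pdf", "Meeting minutes.docx"], convert_to_tensor=True)
scores = util.cos_sim(query_vec, doc_vecs)  # cosine similarity between query and candidates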

        parsed_base = urlparse(final_url)
        base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+        path_base = os.path.dirname(parsed_base.path)

+        # Process all anchor tags
        for a in soup.find_all('a', href=True):
            href = a['href'].strip()

            if '.php' in href.lower() or 'download' in href.lower():
+                full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                real_url = await self.extract_real_download_url(full_url)
                if real_url and real_url != full_url:
                    found_files.append({

                continue

            if any(href.lower().endswith(ext) for ext in all_exts):
+                file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
                size_str = await self.get_file_size(file_url)
                meta = {}
                if file_url.lower().endswith('.pdf'):

                    }
                })

+        # Also check for files in other elements (iframe, embed, object, etc.)
+        other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
+        for elem in other_elements:
+            src = elem.get('src') or elem.get('data')
+            if src and any(src.lower().endswith(ext) for ext in all_exts):
+                file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
+                size_str = await self.get_file_size(file_url)
+                meta = {}
+                if file_url.lower().endswith('.pdf'):
+                    meta = await self.get_pdf_metadata(file_url)
+                found_files.append({
+                    'url': file_url,
+                    'filename': os.path.basename(file_url.split('?')[0]),
+                    'size': size_str,
+                    'metadata': meta
+                })
+
+        # Check for file links in onclick attributes
+        onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
+        for elem in onclick_elements:
+            onclick = await elem.get_attribute('onclick')
+            urls = re.findall(r'(https?://[^\'"]+)', onclick)
+            for url_match in urls:
+                if any(url_match.lower().endswith(ext) for ext in all_exts):
+                    size_str = await self.get_file_size(url_match)
+                    meta = {}
+                    if url_match.lower().endswith('.pdf'):
+                        meta = await self.get_pdf_metadata(url_match)
+                    found_files.append({
+                        'url': url_match,
+                        'filename': os.path.basename(url_match.split('?')[0]),
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
        seen_urls = set()
        unique_files = []
        for f in found_files:

            logger.warning("Standard download methods failed")
            return False

    async def download_viewonly_pdf_with_js(self, file_id, save_path):
        """Download view-only PDF using JavaScript approach - improved version"""
        try:

                view_url = f"https://drive.google.com/file/d/{file_id}/view"
                await page.goto(view_url, wait_until='networkidle', timeout=60000)

+                # Wait for initial rendering
                await page.wait_for_timeout(2000)

+                # CRITICAL: Scroll through entire document to ensure all content is cached
                await page.evaluate("""
                    async function scrollThroughDocument() {
                        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                        const container = document.querySelector('.drive-viewer-paginated-scrollable');
                        if (!container) return false;

+                        // Get total scroll height
                        const scrollHeight = container.scrollHeight;
                        const viewportHeight = container.clientHeight;

+                        // Scroll down in increments to load all pages
+                        for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += viewportHeight) {
                            container.scrollTo(0, scrollPos);
+                            await delay(800); // Wait for content to load
                        }

                        // One final scroll to bottom to ensure everything is loaded
                        container.scrollTo(0, scrollHeight);
+                        await delay(1500);

                        // Scroll back to top for PDF creation
                        container.scrollTo(0, 0);
+                        await delay(800);

                        return true;
                    }
                    return scrollThroughDocument();
                """)

+                # Use simplified script similar to the one provided
                pdf_base64 = await page.evaluate("""
                    async function createPDF() {
                        try {
+                            // Create jsPDF script element
+                            const loadJsPDF = () => new Promise((resolve, reject) => {
+                                let jspdf = document.createElement("script");
+                                jspdf.onload = () => resolve();
+                                jspdf.onerror = () => reject(new Error("Failed to load jsPDF"));
+                                jspdf.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js';
+                                document.body.appendChild(jspdf);
+                            });

+                            await loadJsPDF();
+
+                            // Create PDF
                            const { jsPDF } = window.jspdf;
+                            let pdf = new jsPDF();
+                            let elements = document.getElementsByTagName("img");
+                            let pageCount = 0;

+                            // First pass to find and sort all valid page images
+                            let pageImages = [];
+                            for (let i = 0; i < elements.length; i++) {
+                                let img = elements[i];
+                                // Only process blob images (these are the PDF pages)
+                                if (!/^blob:/.test(img.src)) continue;
+
+                                // Skip tiny images (usually icons, not content)
+                                if (img.width < 100 || img.height < 100) continue;
+
+                                pageImages.push(img);
+                            }
+
+                            // Sort images by their position if possible
+                            try {
+                                pageImages.sort((a, b) => {
+                                    const rectA = a.getBoundingClientRect();
+                                    const rectB = b.getBoundingClientRect();
+                                    return rectA.top - rectB.top;
+                                });
+                            } catch (e) {
+                                console.error("Error sorting images:", e);
+                            }

+                            // Process each image as a page
+                            for (let i = 0; i < pageImages.length; i++) {
+                                let img = pageImages[i];

+                                // Create canvas to draw the image
+                                let canvasElement = document.createElement('canvas');
+                                let con = canvasElement.getContext("2d");
+                                canvasElement.width = img.width;
+                                canvasElement.height = img.height;

+                                // Draw image to canvas
+                                con.drawImage(img, 0, 0, img.width, img.height);
+
+                                // Add image to PDF
+                                let imgData = canvasElement.toDataURL("image/jpeg", 0.95);

+                                // Add a new page for each page after the first
+                                if (pageCount > 0) {
+                                    pdf.addPage();
                                }

+                                // Calculate dimensions to fit the page
+                                const pageWidth = pdf.internal.pageSize.getWidth();
+                                const pageHeight = pdf.internal.pageSize.getHeight();
+                                const imgRatio = img.height / img.width;
+
+                                let imgWidth = pageWidth;
+                                let imgHeight = imgWidth * imgRatio;
+
+                                if (imgHeight > pageHeight) {
+                                    imgHeight = pageHeight;
+                                    imgWidth = imgHeight / imgRatio;
                                }
+
+                                // Center on page
+                                const x = (pageWidth - imgWidth) / 2;
+                                const y = (pageHeight - imgHeight) / 2;
+
+                                pdf.addImage(imgData, 'JPEG', x, y, imgWidth, imgHeight);
+                                pageCount++;
+                            }
+
+                            if (pageCount === 0) {
+                                return null; // No pages found
                            }

                            // Return as base64
                            return pdf.output('datauristring');
                        } catch (e) {
+                            console.error("PDF creation error:", e);
                            return null;
                        }
                    }

                """)

                if not pdf_base64 or not pdf_base64.startswith('data:application/pdf;base64,'):
                    logger.warning("PDF creation script failed, trying fallback method")
                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
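The code that decodes the returned data URI and writes the PDF to disk falls between these hunks and is not shown in the diff. A minimal sketch of how such a jsPDF data URI is typically saved from Python (not the file's actual code) would be:

# Sketch only -- the decode-and-save step is elided from this diff.
import base64

def save_pdf_datauri(pdf_base64: str, save_path: str) -> bool:
    if not pdf_base64 or not pdf_base64.startswith('data:application/pdf'):
        return False
    # Everything after the first comma is the base64 payload of the data URI.
    payload = pdf_base64.split(',', 1)[1]
    with open(save_path, 'wb') as f:
        f.write(base64.b64decode(payload))
    return True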

                except Exception as e:
                    logger.error(f"Error saving PDF: {e}")
                    return await self.download_viewonly_with_screenshots(file_id, save_path, 'pdf')
+
        except Exception as e:
            logger.error(f"Error in view-only PDF download: {e}")
            # Try fallback method

            logger.error(f"Error exporting Google Doc: {e}")
            return False

+    async def get_google_drive_file_info(self, file_id):
+        """Get file type and view-only status from Google Drive"""
+        file_type = None
+        is_view_only = False
+
+        try:
+            async with self.context.new_page() as page:
+                await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
+
+                # Check if view-only
+                view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
+                is_view_only = view_only_text is not None
+
+                # Check for Google Docs viewer
+                gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
+                gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
+                gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
+
+                if gdocs_viewer:
+                    file_type = 'docx'
+                elif gsheets_viewer:
+                    file_type = 'xlsx'
+                elif gslides_viewer:
+                    file_type = 'pptx'
+                else:
+                    # Check for PDF viewer
+                    pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
+                    if pdf_viewer:
+                        file_type = 'pdf'
+                    else:
+                        # Check for image viewer
+                        img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
+                        if img_viewer:
+                            # Get image type from src
+                            img_src = await img_viewer.get_attribute('src')
+                            if 'jpg' in img_src or 'jpeg' in img_src:
+                                file_type = 'jpg'
+                            elif 'png' in img_src:
+                                file_type = 'png'
+                            else:
+                                file_type = 'jpg'  # Default to jpg
+                        else:
+                            # Generic file type fallback
+                            file_type = 'pdf'  # Default to PDF
+
+                # If still no type, check filename
+                if not file_type:
+                    title_element = await page.query_selector('div[role="heading"]')
+                    if title_element:
+                        title = await title_element.text_content()
+                        if title:
+                            ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
+                            if ext_match:
+                                file_type = ext_match.group(1).lower()
+
+        except Exception as e:
+            logger.error(f"Error getting Google Drive file info: {e}")
+            file_type = 'pdf'  # Default to PDF if we can't determine
+
+        return file_type, is_view_only
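How the caller consumes the (file_type, is_view_only) tuple is not part of this diff. A hypothetical dispatch helper, using only method names that do appear elsewhere in the commit, might look like this (a sketch under that assumption, not the app's actual flow):

# Hypothetical caller sketch -- dm stands for a DownloadManager instance.
import os

async def download_drive_file(dm, file_id, save_dir):
    file_type, is_view_only = await dm.get_google_drive_file_info(file_id)
    save_path = os.path.join(save_dir, f"{file_id}.{file_type or 'pdf'}")
    if is_view_only and file_type == 'pdf':
        ok = await dm.download_viewonly_pdf_with_js(file_id, save_path)   # browser-side jsPDF route
    elif is_view_only:
        ok = await dm.download_viewonly_with_screenshots(file_id, save_path, file_type)
    else:
        ok = False  # non-view-only files would use the standard download path (not shown here)
    return save_path if ok else None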
+
+    async def get_sublinks(self, url, limit=10000):
+        """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
+        links = set()
+        try:
+            logger.info(f"Fetching sublinks from: {url}")
+
+            # Go to page and wait for full load
+            await self.page.goto(url, timeout=30000, wait_until='networkidle')
+
+            # Get base URL for resolving relative links
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+            path_base = os.path.dirname(parsed_base.path)
+
+            # Check if page has ASP.NET elements which might need special handling
+            is_aspnet = await self.page.evaluate('''
+                () => {
+                    return document.querySelector('form#aspnetForm') !== null ||
+                           document.querySelector('input[name="__VIEWSTATE"]') !== null;
+                }
+            ''')
+
+            if is_aspnet:
+                logger.info("Detected ASP.NET page, using enhanced extraction method")
+
+                # Try to interact with ASP.NET controls that might reveal more links
+                # Look for dropdowns, buttons, and grid elements
+                dropdowns = await self.page.query_selector_all('select')
+                buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
+
+                # Try interacting with dropdowns first
+                for dropdown in dropdowns:
+                    try:
+                        # Get all options
+                        options = await self.page.evaluate('''
+                            (dropdown) => {
+                                return Array.from(dropdown.options).map(o => o.value);
+                            }
+                        ''', dropdown)
+
+                        # Try selecting each option
+                        for option in options:
+                            if option:
+                                await dropdown.select_option(value=option)
+                                await self.page.wait_for_timeout(1000)
+                                await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                                # Extract any new links that appeared
+                                await self.extract_all_link_types(links, base_url, path_base)
+                    except Exception as e:
+                        logger.warning(f"Error interacting with dropdown: {e}")
+
+                # Try clicking buttons (but avoid dangerous ones like "delete")
+                safe_buttons = []
+                for button in buttons:
+                    button_text = await button.text_content() or ""
+                    button_value = await button.get_attribute("value") or ""
+                    button_id = await button.get_attribute("id") or ""
+                    combined_text = (button_text + button_value + button_id).lower()
+
+                    # Skip potentially destructive buttons
+                    if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
+                        continue
+
+                    # Prioritize buttons that might show more content
+                    if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
+                        safe_buttons.append(button)
+
+                # Click the safe buttons
+                for button in safe_buttons[:5]:  # Limit to first 5 to avoid too many clicks
+                    try:
+                        await button.click()
+                        await self.page.wait_for_timeout(1000)
+                        await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                        # Extract any new links that appeared
+                        await self.extract_all_link_types(links, base_url, path_base)
+                    except Exception as e:
+                        logger.warning(f"Error clicking button: {e}")
+
+            # Extract links from the initial page state
+            await self.extract_all_link_types(links, base_url, path_base)
+
+            # Look specifically for links inside grid/table views which are common in ASP.NET applications
+            grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
+            for cell in grid_cells:
+                try:
+                    href = await cell.get_attribute('href')
+                    if href:
+                        full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+                        links.add(full_url)
+                except Exception as e:
+                    logger.warning(f"Error extracting grid link: {e}")
+
+            # Extract links from onclick attributes and javascript:__doPostBack calls
+            postback_links = await self.page.evaluate('''
+                () => {
+                    const results = [];
+                    // Find elements with onclick containing __doPostBack
+                    const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
+                    for (const el of elements) {
+                        // Extract the postback target
+                        const onclick = el.getAttribute('onclick') || '';
+                        const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
+                        if (match && match[1]) {
+                            // Get the visible text to use as description
+                            const text = el.innerText || el.textContent || 'Link';
+                            results.push({
+                                id: match[1],
+                                text: text.trim()
+                            });
+                        }
+                    }
+                    return results;
+                }
+            ''')
+
+            # Try interacting with some of the postback links
+            for postback in postback_links[:10]:  # Limit to first 10 to avoid too many interactions
+                try:
+                    logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
+                    await self.page.evaluate(f'''
+                        () => {{
+                            if (typeof __doPostBack === 'function') {{
+                                __doPostBack('{postback["id"]}', '');
+                            }}
+                        }}
+                    ''')
+                    await self.page.wait_for_timeout(1500)
+                    await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                    # Extract any new links that appeared
+                    await self.extract_all_link_types(links, base_url, path_base)
+                except Exception as e:
+                    logger.warning(f"Error with postback: {e}")
+
+            logger.info(f"Found {len(links)} sublinks")
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks from {url}: {e}")
+            return list(links)[:limit]  # Return what we have so far
+
+    async def extract_all_link_types(self, links_set, base_url, path_base):
+        """Extract all types of links from the current page"""
+        # Get all <a> tag links
+        a_links = await self.page.query_selector_all('a[href]')
+        for a in a_links:
+            try:
+                href = await a.get_attribute('href')
+                if href and not href.startswith('javascript:') and not href.startswith('#'):
+                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+                    links_set.add(full_url)
+            except Exception:
+                pass
+
+        # Get iframe sources
+        iframes = await self.page.query_selector_all('iframe[src]')
+        for iframe in iframes:
+            try:
+                src = await iframe.get_attribute('src')
+                if src and not src.startswith('javascript:') and not src.startswith('about:'):
+                    full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
+                    links_set.add(full_url)
+            except Exception:
+                pass
+
+        # Get links from onclick attributes that reference URLs
+        onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
+        for el in onclick_elements:
+            try:
+                onclick = await el.get_attribute('onclick')
+                urls = re.findall(r'(https?://[^\'"]+)', onclick)
+                for url in urls:
+                    links_set.add(url)
+            except Exception:
+                pass
+
+        # Look for URLs in data-* attributes
+        data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
+        for el in data_elements:
+            for attr in ['data-url', 'data-href', 'data-src']:
+                try:
+                    value = await el.get_attribute(attr)
+                    if value and not value.startswith('javascript:'):
+                        full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
+                        links_set.add(full_url)
+                except Exception:
+                    pass
+
+        # Look for special anchor links that might not have href attributes
+        special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
+        for anchor in special_anchors:
+            try:
+                href = await anchor.get_attribute('href')
+                if href and not href.startswith('javascript:') and not href.startswith('#'):
+                    full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
+                    links_set.add(full_url)
+            except Exception:
+                pass
+
+    def resolve_relative_url(self, relative_url, base_url, path_base):
+        """Properly resolve relative URLs considering multiple formats"""
+        if relative_url.startswith('/'):
+            # Absolute path relative to domain
+            return f"{base_url}{relative_url}"
+        elif relative_url.startswith('./'):
+            # Explicit relative path
+            return f"{base_url}{path_base}/{relative_url[2:]}"
+        elif relative_url.startswith('../'):
+            # Parent directory
+            parent_path = '/'.join(path_base.split('/')[:-1])
+            return f"{base_url}{parent_path}/{relative_url[3:]}"
+        elif relative_url.startswith('http'):
+            return relative_url
+        else:
+            # Regular relative path
+            return f"{base_url}{path_base}/{relative_url}"
+
    async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
        if not custom_ext_list:
            custom_ext_list = []
        progress_text = st.empty()
        progress_bar = st.progress(0)
        file_count_text = st.empty()
+
        try:
            progress_text.text("Analyzing main page...")
+            # Special handling for ASP.NET pages
+            is_aspnet = False
+            try:
+                await self.page.goto(url, timeout=30000, wait_until='networkidle')
+                is_aspnet = await self.page.evaluate('''
+                    () => {
+                        return document.querySelector('form#aspnetForm') !== null ||
+                               document.querySelector('input[name="__VIEWSTATE"]') !== null;
+                    }
+                ''')
+            except Exception:
+                pass
+
+            # Extract files from main page
            main_files = await self.extract_downloadable_files(url, custom_ext_list)
            initial_count = len(main_files)
            file_count_text.text(f"Found {initial_count} files on main page")
+
+            # Get sublinks with enhanced method
            progress_text.text("Getting sublinks...")
            sublinks = await self.get_sublinks(url, sublink_limit)
            total_links = len(sublinks)
            progress_text.text(f"Found {total_links} sublinks to process")
+
            if not sublinks:
                progress_bar.progress(1.0)
                return main_files
+
+            # Process each sublink
            all_files = main_files
            for i, sublink in enumerate(sublinks, 1):
                progress = i / total_links
                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
                progress_bar.progress(progress)
+
+                try:
+                    # Use a longer timeout for ASP.NET pages which can be slower
+                    sub_timeout = timeout * 2 if is_aspnet else timeout
+
+                    # Extract files from sublink with appropriate timeout
+                    async with async_timeout(sub_timeout):
+                        sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                        all_files.extend(sub_files)
+                        file_count_text.text(f"Found {len(all_files)} total files")
+                except Exception as e:
+                    logger.warning(f"Error processing sublink {sublink}: {e}")
+
+            # Deduplicate files
            seen_urls = set()
            unique_files = []
            for f in all_files:
                if f['url'] not in seen_urls:
                    seen_urls.add(f['url'])
                    unique_files.append(f)
+
            final_count = len(unique_files)
            progress_text.text(f"Deep search complete!")
            file_count_text.text(f"Found {final_count} unique files")
            progress_bar.progress(1.0)
            return unique_files
+
        except Exception as e:
            logger.error(f"Deep search error: {e}")
            progress_text.text(f"Error during deep search: {str(e)}")
            return []
+
        finally:
            await asyncio.sleep(2)
            if not st.session_state.get('keep_progress', False):
                progress_text.empty()
                progress_bar.empty()
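For orientation, a minimal driver for the updated deep_search might look like the sketch below. It assumes DownloadManager is usable as an async context manager and that async_timeout refers to an asyncio-compatible timeout helper imported elsewhere in app.py; neither of those pieces is shown in this diff, and the arguments are illustrative only.

# Usage sketch under the stated assumptions -- not code from this commit.
import asyncio

async def run_deep_search(url: str):
    async with DownloadManager() as dm:          # assumed async context manager
        files = await dm.deep_search(
            url,
            custom_ext_list=['.pdf', '.docx'],   # illustrative extra extensions
            sublink_limit=100,
            timeout=60,
        )
    return files

# files = asyncio.run(run_deep_search("https://example.com/downloads"))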

# Utility Functions for New Features
def extract_keywords(text, n=5):
    doc = nlp_model(text)