Update app.py
app.py
CHANGED
@@ -7,7 +7,7 @@ import subprocess
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 import asyncio
 import logging
-from urllib.parse import urlparse, urljoin, unquote
 import re
 from pathlib import Path
 from io import BytesIO
@@ -32,13 +32,27 @@ import googleapiclient.discovery
 import google.auth.transport.requests
 import googleapiclient.http

-#
 import nltk
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import docx2txt
-

 # -------------------- Logging Setup --------------------
 logging.basicConfig(
@@ -96,37 +110,126 @@ PROXY_ROTATION_CONFIG = {
     "proxies": []  # Will be populated from the UI if needed
 }

-# -------------------- RAG Search Class --------------------
-class RAGSearch:
     def __init__(self):
         self.file_texts = []
         self.file_metadata = []
-        self.vectorizer = TfidfVectorizer(
         self.vectors = None

     def add_file(self, file_data, file_info):
-        """Add a file to the search index"""
-        file_ext = os.path.splitext(file_info['filename'])[1]
         text = self.extract_text(file_data, file_ext)
         if text:
             self.file_texts.append(text)
             self.file_metadata.append(file_info)
             return True
         return False

     def extract_text(self, file_data, file_ext):
-        """Extract text from different file types"""
         try:
             if file_ext.lower() == '.pdf':
                 reader = PyPDF2.PdfReader(BytesIO(file_data))
                 text = ""
                 for page in reader.pages:
-
                 return text
             elif file_ext.lower() in ['.docx', '.doc']:
                 return docx2txt.process(BytesIO(file_data))
-            elif file_ext.lower() in ['.txt', '.csv', '.json']:
                 return file_data.decode('utf-8', errors='ignore')
             else:
                 return ""
         except Exception as e:
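The add_file path above feeds raw file bytes through extract_text before indexing. The following standalone sketch shows that extraction step in isolation, using the same PyPDF2 and docx2txt calls the file relies on; the helper name and the sample-file usage at the end are illustrative only.

# Sketch of the text-extraction step used before indexing (assumed helper name).
from io import BytesIO
import PyPDF2
import docx2txt

def extract_text_sketch(file_data: bytes, file_ext: str) -> str:
    ext = file_ext.lower()
    if ext == '.pdf':
        reader = PyPDF2.PdfReader(BytesIO(file_data))
        # Pages with no extractable text return None, so guard with "or ''"
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    if ext in ('.docx', '.doc'):
        return docx2txt.process(BytesIO(file_data))
    if ext in ('.txt', '.csv', '.json'):
        return file_data.decode('utf-8', errors='ignore')
    return ""

# Example (placeholder path):
# with open("sample.pdf", "rb") as f:
#     print(extract_text_sketch(f.read(), ".pdf")[:200])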
@@ -134,35 +237,107 @@ class RAGSearch:
             return ""

     def build_index(self):
-        """Build
         if not self.file_texts:
             return False
         try:
             self.vectors = self.vectorizer.fit_transform(self.file_texts)
             return True
         except Exception as e:
             logger.error(f"Error building search index: {e}")
             return False

-    def
-        """
         if self.vectors is None:
             return []

         try:
-
-
-

             results = []
-
-
-
-
-
-
-
-
         except Exception as e:
             logger.error(f"Error during search: {e}")
             return []
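Both the old RAGSearch and the EnhancedRAGSearch shown later in this diff rank files by TF-IDF cosine similarity. A minimal self-contained sketch of that ranking idea, with a toy corpus and a relevance threshold that are illustrative rather than taken from app.py:

# TF-IDF ranking sketch: fit on documents, transform the query, rank by cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = [
    "Past exam papers for physics, 2021 and 2022 sessions.",
    "Administrative notes about the course schedule.",
    "Mathematics exam paper with worked solutions.",
]
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
vectors = vectorizer.fit_transform(docs)

query_vec = vectorizer.transform(["past exam papers"])
scores = cosine_similarity(query_vec, vectors).flatten()
for idx, score in sorted(enumerate(scores), key=lambda x: x[1], reverse=True):
    if score > 0.1:  # the same kind of low-score cut-off the search class applies
        print(f"{score:.2f}  {docs[idx]}")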
@@ -222,6 +397,90 @@ def detect_captcha(html_content):
     html_lower = html_content.lower()
     return any(pattern in html_lower for pattern in captcha_patterns)

 # -------------------- Google Drive Functions --------------------
 def get_google_auth_url():
     client_config = GOOGLE_OAUTH_CONFIG["web"]
@@ -314,6 +573,10 @@ class DownloadManager:
         self.request_count = 0
         self.captcha_detected = False
         self.download_timeout = 300  # 5 minutes timeout for downloads

     async def __aenter__(self):
         self.playwright = await async_playwright().start()
@@ -594,13 +857,51 @@ class DownloadManager:
         try:
             await self.rotate_proxy_if_needed()

-
-
-
-
-
-
-
         except Exception as e:
             logger.warning(f"Error getting file size: {e}")
             return "Unknown Size"
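The reworked get_file_size (shown in the new-side listing further down) probes script-style URLs with an HTTP Range request and reads the total size out of Content-Range. A minimal sketch of that trick, assuming the requests library; the URL in the comment is a placeholder.

# Ask for a single byte and recover the full size from the response headers.
import re
import requests

def probe_size(url):
    headers = {'Range': 'bytes=0-0'}
    with requests.get(url, headers=headers, stream=True, timeout=10) as r:
        match = re.search(r'bytes 0-0/(\d+)', r.headers.get('Content-Range', ''))
        if match:
            return int(match.group(1))
        length = r.headers.get('Content-Length')
        if length and int(length) > 1:  # a length of 1 is just our probe byte
            return int(length)
    return None

# probe_size("https://example.com/files/report.pdf")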
@@ -627,14 +928,53 @@ class DownloadManager:
             return {}

     async def extract_real_download_url(self, url):
         try:
-
-
-
-
-
-
-
         except Exception as e:
             logger.error(f"Error extracting real download URL: {e}")
             return url
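The new extract_real_download_url (further down in this diff) watches browser responses for attachment headers before trusting a URL. The diff does this with Playwright response listeners; the sketch below is a simplified requests-based illustration of the same header check, with a placeholder URL.

# Decide whether a URL resolves to a file rather than an HTML page.
import requests

def looks_like_file_response(url):
    with requests.get(url, stream=True, timeout=15, allow_redirects=True) as r:
        disposition = r.headers.get('Content-Disposition', '')
        content_type = r.headers.get('Content-Type', '')
        if 'attachment' in disposition or 'filename=' in disposition:
            return True
        # Anything that is not text/html (or other text/*) is treated as a download.
        return bool(content_type) and not content_type.startswith('text/')

# looks_like_file_response("https://example.com/download.php?id=42")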
@@ -702,13 +1042,17 @@ class DownloadManager:
                     if any(full_url.lower().endswith(ext) for ext in
                            ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                         links.add(full_url)
         except Exception as e:
             logger.warning(f"Request-based extraction failed: {e}")

         # Browser-based approach for more thorough extraction or if initial approach was inadequate
         try:
             # Check if we need to proceed with browser-based extraction
-            if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url:
                 logger.info("Using browser for enhanced link extraction")

                 # Rotate proxy if needed
@@ -800,6 +1144,27 @@ class DownloadManager:
                             ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
                             links.add(href)

                 # Check for ASP.NET specific elements that might contain exam links
                 grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
                 for grid in grid_elements:
@@ -928,6 +1293,11 @@ class DownloadManager:
                 "/resource/", "/material/", "/notes/", "/subjectmaterial/"
             ]):
                 filtered_links.append(link)

         logger.info(f"Found {len(filtered_links)} potential exam document links")
         return filtered_links
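On the new side, this filter also keeps links that pass the module-level helpers is_download_link and normalize_download_url added by this commit (their definitions appear in the new-side listing further down). A short usage sketch, assuming the helpers are importable from app.py or copied into a scratch script; the example URLs are illustrative only.

# Usage sketch for the new helpers (import path assumed).
from app import is_download_link, normalize_download_url

candidate_urls = [
    "https://example.edu/files/past_paper_2021.pdf",
    "https://example.edu/scripts/download.php?file=notes.docx",
    "https://example.edu/page.aspx?Action=downloadfile&fname=exam%202020.pdf",
    "https://example.edu/about.html",
]
for raw_url in candidate_urls:
    if is_download_link(raw_url):
        print("download candidate:", normalize_download_url(raw_url))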
@@ -955,31 +1325,119 @@ class DownloadManager:
                 }
             }

             // Check for links in data attributes
-            const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link]');
             for (const el of elements) {
-                for (const attr of ['data-url', 'data-href', 'data-src', 'data-link']) {
                     const val = el.getAttribute(attr);
-                    if (val
-
                     }
                 }
             }

             // Look for URLs in inline event handlers
-            const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup]');
             for (const el of clickableElements) {
-                for (const attr of ['onclick', 'onmousedown', 'onmouseup']) {
                     const val = el.getAttribute(attr);
                     if (val) {
                         const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
                         for (let match of urlMatches) {
                             links.add(match.replace(/["']/g, ''));
                         }
                     }
                 }
             }

             return Array.from(links);
         }
     """)
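The page-side JavaScript above harvests URLs hidden in data-* attributes and inline event handlers. A rough server-side approximation of that idea with BeautifulSoup and a regex, for static HTML only (the real code runs inside the browser via page.evaluate and also sees dynamically generated markup):

# Scan data-* and on* attributes of every tag for embedded absolute URLs.
import re
from bs4 import BeautifulSoup

URL_RE = re.compile(r'https?://[^\s"\'<>]+')

def mine_hidden_urls(html):
    soup = BeautifulSoup(html, 'html.parser')
    found = set()
    for el in soup.find_all(True):
        for attr in ('data-url', 'data-href', 'data-src', 'data-link',
                     'onclick', 'onmousedown', 'onmouseup'):
            value = el.get(attr)
            if value:
                found.update(URL_RE.findall(value))
    return found

# mine_hidden_urls('<a data-url="https://example.com/f.pdf" onclick="go()">x</a>')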
@@ -1046,14 +1504,116 @@ class DownloadManager:
                     for link in shadow_links:
                         hidden_links.add(link)

         return hidden_links

     async def extract_downloadable_files(self, url, custom_ext_list):
         found_files = []
         try:
             # Rotate proxy if needed
             await self.rotate_proxy_if_needed()

             # Special handling for educational exam sites
             if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
                                                       ["exam", "test", "pastpaper", "eduexp"]):
@@ -1095,7 +1655,8 @@ class DownloadManager:
|
|
1095 |
'url': real_url,
|
1096 |
'filename': filename,
|
1097 |
'size': size_str,
|
1098 |
-
'metadata': meta
|
|
|
1099 |
})
|
1100 |
|
1101 |
# If we found exam files with the specialized method, return them
|
@@ -1156,7 +1717,8 @@ class DownloadManager:
|
|
1156 |
'url': real_url,
|
1157 |
'filename': filename,
|
1158 |
'size': await self.get_file_size(real_url),
|
1159 |
-
'metadata': {}
|
|
|
1160 |
})
|
1161 |
return found_files
|
1162 |
|
@@ -1177,7 +1739,7 @@ class DownloadManager:
|
|
1177 |
for a in soup.find_all('a', href=True):
|
1178 |
href = a['href'].strip()
|
1179 |
|
1180 |
-
if '.php' in href.lower() or 'download' in href.lower():
|
1181 |
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
|
1182 |
real_url = await self.extract_real_download_url(full_url)
|
1183 |
if real_url and real_url != full_url:
|
@@ -1185,7 +1747,8 @@ class DownloadManager:
|
|
1185 |
'url': real_url,
|
1186 |
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
|
1187 |
'size': await self.get_file_size(real_url),
|
1188 |
-
'metadata': {}
|
|
|
1189 |
})
|
1190 |
continue
|
1191 |
|
@@ -1199,7 +1762,8 @@ class DownloadManager:
|
|
1199 |
'url': file_url,
|
1200 |
'filename': os.path.basename(file_url.split('?')[0]),
|
1201 |
'size': size_str,
|
1202 |
-
'metadata': meta
|
|
|
1203 |
})
|
1204 |
|
1205 |
# Handle Google Drive links
|
@@ -1229,7 +1793,8 @@ class DownloadManager:
|
|
1229 |
'view_only': is_view_only,
|
1230 |
'file_type': file_type,
|
1231 |
'file_id': file_id
|
1232 |
-
}
|
|
|
1233 |
})
|
1234 |
|
1235 |
# Also check for files in other elements (iframe, embed, object, etc.)
|
@@ -1246,7 +1811,8 @@ class DownloadManager:
|
|
1246 |
'url': file_url,
|
1247 |
'filename': os.path.basename(file_url.split('?')[0]),
|
1248 |
'size': size_str,
|
1249 |
-
'metadata': meta
|
|
|
1250 |
})
|
1251 |
|
1252 |
# Check for file links in onclick attributes
|
@@ -1264,7 +1830,8 @@ class DownloadManager:
|
|
1264 |
'url': url_match,
|
1265 |
'filename': os.path.basename(url_match.split('?')[0]),
|
1266 |
'size': size_str,
|
1267 |
-
'metadata': meta
|
|
|
1268 |
})
|
1269 |
|
1270 |
# Also check for data-src and data-url attributes (common in lazy-loaded sites)
|
@@ -1279,7 +1846,8 @@ class DownloadManager:
|
|
1279 |
'url': file_url,
|
1280 |
'filename': os.path.basename(file_url.split('?')[0]),
|
1281 |
'size': await self.get_file_size(file_url),
|
1282 |
-
'metadata': {}
|
|
|
1283 |
})
|
1284 |
except:
|
1285 |
pass
|
@@ -1313,7 +1881,8 @@ class DownloadManager:
|
|
1313 |
'url': json_url,
|
1314 |
'filename': os.path.basename(json_url.split('?')[0]),
|
1315 |
'size': await self.get_file_size(json_url),
|
1316 |
-
'metadata': {}
|
|
|
1317 |
})
|
1318 |
except:
|
1319 |
pass
|
@@ -1364,7 +1933,8 @@ class DownloadManager:
|
|
1364 |
'url': href,
|
1365 |
'filename': os.path.basename(href.split('?')[0]),
|
1366 |
'size': await self.get_file_size(href),
|
1367 |
-
'metadata': {}
|
|
|
1368 |
})
|
1369 |
|
1370 |
# Check for hidden links that might be in JavaScript, iframes, or dynamic content
|
@@ -1375,7 +1945,8 @@ class DownloadManager:
|
|
1375 |
'url': link,
|
1376 |
'filename': os.path.basename(link.split('?')[0]),
|
1377 |
'size': await self.get_file_size(link),
|
1378 |
-
'metadata': {}
|
|
|
1379 |
})
|
1380 |
|
1381 |
# Deduplicate files by URL
|
@@ -1393,7 +1964,7 @@ class DownloadManager:
|
|
1393 |
return []
|
1394 |
|
1395 |
async def download_file(self, file_info, save_dir, referer):
|
1396 |
-
file_url = file_info['url']
|
1397 |
fname = file_info['filename']
|
1398 |
path = os.path.join(save_dir, fname)
|
1399 |
base, ext = os.path.splitext(fname)
|
@@ -1403,6 +1974,11 @@ class DownloadManager:
|
|
1403 |
counter += 1
|
1404 |
os.makedirs(save_dir, exist_ok=True)
|
1405 |
|
|
|
|
|
|
|
|
|
|
|
1406 |
try:
|
1407 |
# Special handling for Google Drive files
|
1408 |
if "drive.google.com" in file_url or "docs.google.com" in file_url:
|
@@ -1414,6 +1990,7 @@ class DownloadManager:
|
|
1414 |
logger.info(f"Attempting to download view-only file: {file_url}")
|
1415 |
result_path = await self.force_download_viewonly(file_info, path)
|
1416 |
if result_path:
|
|
|
1417 |
return result_path
|
1418 |
|
1419 |
# If that failed, try the regular download approach
|
@@ -1422,13 +1999,60 @@ class DownloadManager:
                 # Try regular download methods
                 success = await self.download_from_google_drive(file_url, path)
                 if success:
                     return path

                 # If all methods failed for Google Drive, try one last approach
                 logger.warning("All standard methods failed, attempting force download")
                 result_path = await self.force_download_viewonly(file_info, path)
                 return result_path if result_path else None

             # Rotate proxy if needed
             await self.rotate_proxy_if_needed()

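For the complex "Action=downloadfile"/"fname=" URLs, the new code (later in this diff) navigates with Playwright, waits for the "download" event and persists it with save_as. A minimal standalone sketch of that pattern, with placeholder URL and output path; real pages may need a button click instead of plain navigation.

# Playwright download-event sketch: navigate, catch the download, save it.
import asyncio
from playwright.async_api import async_playwright

async def fetch_with_browser(url, out_path):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page(accept_downloads=True)
        async with page.expect_download() as download_info:
            try:
                await page.goto(url, wait_until="networkidle")
            except Exception:
                # Navigation is often aborted once the download starts; that's fine.
                pass
        download = await download_info.value
        await download.save_as(out_path)
        await browser.close()

# asyncio.run(fetch_with_browser("https://example.com/download.php?id=1", "file.pdf"))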
@@ -1456,6 +2080,7 @@ class DownloadManager:
|
|
1456 |
|
1457 |
# Verify file was downloaded correctly
|
1458 |
if os.path.exists(path) and os.path.getsize(path) > 0:
|
|
|
1459 |
return path
|
1460 |
except Exception as e:
|
1461 |
logger.warning(f"Direct download failed: {e}, trying browser approach")
|
@@ -1475,7 +2100,9 @@ class DownloadManager:
|
|
1475 |
content = await response.body()
|
1476 |
with open(path, 'wb') as f:
|
1477 |
f.write(content)
|
1478 |
-
|
|
|
|
|
1479 |
else:
|
1480 |
logger.error(f"Download failed with status {response.status}: {file_url}")
|
1481 |
|
@@ -1502,6 +2129,7 @@ class DownloadManager:
|
|
1502 |
await download.save_as(path)
|
1503 |
|
1504 |
if os.path.exists(path) and os.path.getsize(path) > 0:
|
|
|
1505 |
return path
|
1506 |
except Exception as e:
|
1507 |
logger.error(f"Browser download manager approach failed: {e}")
|
@@ -2515,6 +3143,21 @@ class DownloadManager:
         try:
             logger.info(f"Fetching sublinks from: {url}")

             # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
             if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
                                                       ["exam", "test", "pastpaper", "eduexp"]):
@@ -2532,8 +3175,12 @@ class DownloadManager:
             await self.rotate_proxy_if_needed()

             # Standard sublink extraction for all sites
-
-
             # Get base URL for resolving relative links
             parsed_base = urlparse(url)
             base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
@@ -2732,8 +3379,46 @@ class DownloadManager:
                     if href and not href.startswith('javascript:'):
                         links.add(href)

             logger.info(f"Found {len(links)} sublinks")
-

         except Exception as e:
             logger.error(f"Error getting sublinks from {url}: {e}")
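The sublink pass above boils down to collecting every href, resolving relative paths against the page URL, and skipping javascript: pseudo-links. A condensed sketch of that step using requests and BeautifulSoup instead of the Playwright page, for brevity:

# Collect absolute sublinks from a page.
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

def get_sublinks_sketch(url):
    links = set()
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        if href and not href.startswith('javascript:'):
            links.add(urljoin(url, href))  # resolve relative links against the page URL
    return links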
@@ -2834,6 +3519,9 @@ class DownloadManager:
|
|
2834 |
file_count_text = st.empty()
|
2835 |
|
2836 |
try:
|
|
|
|
|
|
|
2837 |
progress_text.text("Analyzing main page...")
|
2838 |
# Special handling for ASP.NET pages
|
2839 |
is_aspnet = False
|
@@ -2848,6 +3536,25 @@ class DownloadManager:
         except Exception:
             pass

         # Extract files from main page
         main_files = await self.extract_downloadable_files(url, custom_ext_list)
         initial_count = len(main_files)
@@ -2873,9 +3580,50 @@ class DownloadManager:
                 progress_bar.progress(progress)

                 try:
-                    #
                     sub_timeout = timeout * 2 if is_aspnet else timeout

                     # Extract files from sublink
                     sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                     all_files.extend(sub_files)
@@ -2994,21 +3742,22 @@ def main():
             if custom_ext_list != valid_ext_list:
                 st.warning("Invalid extensions ignored. Use format like '.csv'.")

-
-
-
-
-
-
-
-
-
-
-

             with st.spinner("Searching for files..."):
-                files = run_deep_search(
-                    sublink_timeout, use_proxy, proxy, use_stealth)

             if files:
                 st.session_state.discovered_files = files
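The search button bridges Streamlit's synchronous callback into the async crawler through the run_deep_search wrapper. A minimal sketch of that bridging pattern, under assumptions: the deep_search coroutine and the simplified argument list below are hypothetical stand-ins for whatever DownloadManager method and signature app.py actually uses.

# Hypothetical sync wrapper around an async crawl, for use from a Streamlit callback.
import asyncio

async def _deep_search(url, custom_ext_list, max_sublinks, timeout):
    # deep_search is an assumed method name, not necessarily the one in app.py.
    async with DownloadManager() as dm:
        return await dm.deep_search(url, custom_ext_list, max_sublinks, timeout)

def run_deep_search_sketch(url, custom_ext_list, max_sublinks=100, timeout=30):
    # Streamlit handlers are synchronous, so enter the event loop here.
    return asyncio.run(_deep_search(url, custom_ext_list, max_sublinks, timeout))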
@@ -3031,7 +3780,7 @@ def main():
         if st.button("Search Files", key="rag_search_btn") and search_query:
             # Initialize RAG search engine
             if not st.session_state.rag_indexed:
-                rag_search =

                 with st.spinner("Indexing files for search..."):
                     # First download files to extract text
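The indexing flow behind this tab is: feed each downloaded file's bytes into the search engine, build the index, then query it. A sketch of that flow using the EnhancedRAGSearch class this commit adds (add_file, build_index and search are its real methods); the import path and the placeholder file contents are assumptions.

# End-to-end RAG indexing sketch (import path and sample data assumed).
from app import EnhancedRAGSearch

search_engine = EnhancedRAGSearch()
downloaded = [
    (b"Physics exam paper 2021: mechanics questions and solutions.",
     {'filename': 'physics_2021.txt', 'url': 'https://example.edu/physics_2021.txt', 'size': '1 KB'}),
    (b"Physics exam paper 2022: thermodynamics questions and solutions.",
     {'filename': 'physics_2022.txt', 'url': 'https://example.edu/physics_2022.txt', 'size': '1 KB'}),
]
for data, info in downloaded:
    search_engine.add_file(data, info)

if search_engine.build_index():
    for result in search_engine.search("physics exam", top_k=5):
        print(result['rank'], round(result['score'], 2), result['file_info']['filename'])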
@@ -3044,7 +3793,7 @@ def main():
|
|
3044 |
for i, file_info in enumerate(files):
|
3045 |
# Only process common text-based file formats
|
3046 |
ext = os.path.splitext(file_info['filename'])[1].lower()
|
3047 |
-
if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json']:
|
3048 |
path = await dm.download_file(file_info, temp_dir, url)
|
3049 |
if path:
|
3050 |
with open(path, 'rb') as f:
|
@@ -3077,14 +3826,28 @@ def main():
                 for result in search_results:
                     file_info = result['file_info']
                     score = result['score']
                     with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
                         st.write(f"Size: {file_info['size']}")
                         if 'metadata' in file_info and file_info['metadata']:
                             st.write("Metadata:")
                             for k, v in file_info['metadata'].items():
                                 if k != 'file_id':  # Skip technical details
                                     st.write(f"- {k}: {v}")

                         # Add direct download button
                         if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
                             with st.spinner(f"Downloading {file_info['filename']}..."):
@@ -3267,94 +4030,192 @@ def main():
             # Create expanders for each result
             for i, url in enumerate(urls, 1):
                 with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
-
-                    st.session_state.deep_search_url = url
-                    st.session_state.do_deep_search = True
                 else:
                     st.warning("No search results found.")

         asyncio.run(run_search())

-
-
-
-
-
-
-

-    #
-
-

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            for i, file in enumerate(files):
-                col1, col2, col3 = st.columns([3, 1, 1])
-                with col1:
-                    filename = file['filename']
-                    size = file['size']
-                    meta = file.get('metadata', {})
-                    file_info = f"{filename} ({size})"
-                    if meta and 'Pages' in meta:
-                        file_info += f" - {meta.get('Pages', '')} pages"
-                    st.markdown(f"**{i+1}. {file_info}**")
-
-                with col2:
-                    # Add direct download button for each file
-                    if st.button(f"Download", key=f"direct_dl_{i}"):
-                        with st.spinner(f"Downloading {filename}..."):
-                            async def download_single_file():
-                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
-                                    path = await dm.download_file(file, download_dir, url)
-                                    return path

-
-
-
-
-
-
-
-
-
-
-
-

-                with
-                    #
-
-
-
         else:
-
-
-
-            st.

     # Add a special section for direct Google Drive file download
     st.markdown("---")
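The removed block above rendered one row per discovered file, with a button that kicks off the async download. A pared-down sketch of that pattern, assuming app.py's DownloadManager is importable; the files list, download_dir and url arguments are placeholders.

# Per-file download rows in Streamlit, driving the async DownloadManager.
import asyncio
import streamlit as st
from app import DownloadManager  # import path assumed

def render_file_rows(files, download_dir, url, use_proxy=False, proxy=None, use_stealth=False):
    for i, file in enumerate(files):
        col1, col2 = st.columns([3, 1])
        with col1:
            st.markdown(f"**{i+1}. {file['filename']} ({file['size']})**")
        with col2:
            if st.button("Download", key=f"direct_dl_{i}"):
                async def download_one():
                    async with DownloadManager(use_proxy=use_proxy, proxy=proxy,
                                               use_stealth=use_stealth) as dm:
                        return await dm.download_file(file, download_dir, url)
                path = asyncio.run(download_one())
                if path:
                    st.success(f"Saved to {path}")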
@@ -3400,7 +4261,11 @@ def main():

     # Add footer with attribution
     st.markdown('---')
-    st.markdown('Created by [Euler314](https://github.com/

 if __name__ == "__main__":
     main()
|
|
|
7 |
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
8 |
import asyncio
|
9 |
import logging
|
10 |
+
from urllib.parse import urlparse, urljoin, unquote, parse_qs, quote
|
11 |
import re
|
12 |
from pathlib import Path
|
13 |
from io import BytesIO
|
|
|
32 |
import google.auth.transport.requests
|
33 |
import googleapiclient.http
|
34 |
|
35 |
+
# Enhanced RAG search imports
|
36 |
import nltk
|
37 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
38 |
from sklearn.metrics.pairwise import cosine_similarity
|
39 |
import numpy as np
|
40 |
import docx2txt
|
41 |
+
try:
|
42 |
+
from langdetect import detect as detect_language
|
43 |
+
except ImportError:
|
44 |
+
# If langdetect is not available, we'll use a simple fallback
|
45 |
+
def detect_language(text):
|
46 |
+
return "en"
|
47 |
+
|
48 |
+
# Try to download NLTK data if not already present
|
49 |
+
try:
|
50 |
+
nltk.data.find('tokenizers/punkt')
|
51 |
+
except LookupError:
|
52 |
+
try:
|
53 |
+
nltk.download('punkt', quiet=True)
|
54 |
+
except:
|
55 |
+
pass
|
56 |
|
57 |
# -------------------- Logging Setup --------------------
|
58 |
logging.basicConfig(
|
|
|
110 |
"proxies": [] # Will be populated from the UI if needed
|
111 |
}
|
112 |
|
113 |
+
# -------------------- Enhanced RAG Search Class --------------------
|
114 |
+
class EnhancedRAGSearch:
|
115 |
def __init__(self):
|
116 |
self.file_texts = []
|
117 |
+
self.chunks = [] # Document chunks for more targeted search
|
118 |
+
self.chunk_metadata = [] # Metadata for each chunk
|
119 |
self.file_metadata = []
|
120 |
+
self.vectorizer = TfidfVectorizer(
|
121 |
+
stop_words='english',
|
122 |
+
ngram_range=(1, 2), # Use bigrams for better context
|
123 |
+
max_features=10000, # Use more features for better representation
|
124 |
+
min_df=2 # Minimum document frequency
|
125 |
+
)
|
126 |
self.vectors = None
|
127 |
+
self.chunk_vectors = None
|
128 |
+
self.languages = []
|
129 |
|
130 |
def add_file(self, file_data, file_info):
|
131 |
+
"""Add a file to the search index with improved processing"""
|
132 |
+
file_ext = os.path.splitext(file_info['filename'])[1].lower()
|
133 |
text = self.extract_text(file_data, file_ext)
|
134 |
+
|
135 |
if text:
|
136 |
+
# Store the whole document text
|
137 |
self.file_texts.append(text)
|
138 |
self.file_metadata.append(file_info)
|
139 |
+
|
140 |
+
# Try to detect language
|
141 |
+
try:
|
142 |
+
lang = detect_language(text[:1000]) # Use just the first 1000 chars for speed
|
143 |
+
self.languages.append(lang)
|
144 |
+
except:
|
145 |
+
self.languages.append('en') # Default to English
|
146 |
+
|
147 |
+
# Create chunks for more granular search
|
148 |
+
chunks = self.create_chunks(text)
|
149 |
+
for chunk in chunks:
|
150 |
+
self.chunks.append(chunk)
|
151 |
+
self.chunk_metadata.append({
|
152 |
+
'file_info': file_info,
|
153 |
+
'chunk_size': len(chunk),
|
154 |
+
'file_index': len(self.file_texts) - 1
|
155 |
+
})
|
156 |
+
|
157 |
return True
|
158 |
return False
|
159 |
|
160 |
+
def create_chunks(self, text, chunk_size=1000, overlap=200):
|
161 |
+
"""Split text into overlapping chunks for better search precision"""
|
162 |
+
# Try to use NLTK for sentence-aware chunking
|
163 |
+
try:
|
164 |
+
sentences = nltk.sent_tokenize(text)
|
165 |
+
chunks = []
|
166 |
+
current_chunk = ""
|
167 |
+
|
168 |
+
for sentence in sentences:
|
169 |
+
if len(current_chunk) + len(sentence) <= chunk_size:
|
170 |
+
current_chunk += sentence + " "
|
171 |
+
else:
|
172 |
+
# Add current chunk if it has content
|
173 |
+
if current_chunk:
|
174 |
+
chunks.append(current_chunk.strip())
|
175 |
+
|
176 |
+
# Start new chunk with overlap from previous chunk
|
177 |
+
if len(current_chunk) > overlap:
|
178 |
+
# Find the last space within the overlap region
|
179 |
+
overlap_text = current_chunk[-overlap:]
|
180 |
+
last_space = overlap_text.rfind(' ')
|
181 |
+
if last_space != -1:
|
182 |
+
current_chunk = current_chunk[-(overlap-last_space):] + sentence + " "
|
183 |
+
else:
|
184 |
+
current_chunk = sentence + " "
|
185 |
+
else:
|
186 |
+
current_chunk = sentence + " "
|
187 |
+
|
188 |
+
# Add the last chunk if it has content
|
189 |
+
if current_chunk:
|
190 |
+
chunks.append(current_chunk.strip())
|
191 |
+
|
192 |
+
return chunks
|
193 |
+
except:
|
194 |
+
# Fallback to simpler chunking approach
|
195 |
+
chunks = []
|
196 |
+
for i in range(0, len(text), chunk_size - overlap):
|
197 |
+
chunk = text[i:i + chunk_size]
|
198 |
+
if chunk:
|
199 |
+
chunks.append(chunk)
|
200 |
+
return chunks
|
201 |
+
|
202 |
def extract_text(self, file_data, file_ext):
|
203 |
+
"""Extract text from different file types with enhanced support"""
|
204 |
try:
|
205 |
if file_ext.lower() == '.pdf':
|
206 |
reader = PyPDF2.PdfReader(BytesIO(file_data))
|
207 |
text = ""
|
208 |
for page in reader.pages:
|
209 |
+
extracted = page.extract_text()
|
210 |
+
if extracted:
|
211 |
+
text += extracted + "\n"
|
212 |
+
# If text extraction fails, try to OCR (would need extra libraries)
|
213 |
return text
|
214 |
elif file_ext.lower() in ['.docx', '.doc']:
|
215 |
return docx2txt.process(BytesIO(file_data))
|
216 |
+
elif file_ext.lower() in ['.txt', '.csv', '.json', '.html', '.htm']:
|
217 |
+
# Handle both UTF-8 and other common encodings
|
218 |
+
try:
|
219 |
+
return file_data.decode('utf-8', errors='ignore')
|
220 |
+
except:
|
221 |
+
encodings = ['latin-1', 'iso-8859-1', 'windows-1252']
|
222 |
+
for enc in encodings:
|
223 |
+
try:
|
224 |
+
return file_data.decode(enc, errors='ignore')
|
225 |
+
except:
|
226 |
+
pass
|
227 |
+
# Last resort fallback
|
228 |
return file_data.decode('utf-8', errors='ignore')
|
229 |
+
elif file_ext.lower() in ['.pptx', '.ppt', '.xlsx', '.xls']:
|
230 |
+
# For these types, we would need additional libraries
|
231 |
+
# For now, return a placeholder message
|
232 |
+
return f"[Content of {file_ext} file - install additional libraries for full text extraction]"
|
233 |
else:
|
234 |
return ""
|
235 |
except Exception as e:
|
|
|
237 |
return ""
|
238 |
|
239 |
def build_index(self):
|
240 |
+
"""Build both document and chunk search indices"""
|
241 |
if not self.file_texts:
|
242 |
return False
|
243 |
+
|
244 |
try:
|
245 |
+
# Build document-level index
|
246 |
self.vectors = self.vectorizer.fit_transform(self.file_texts)
|
247 |
+
|
248 |
+
# Build chunk-level index if we have chunks
|
249 |
+
if self.chunks:
|
250 |
+
self.chunk_vectors = self.vectorizer.transform(self.chunks)
|
251 |
+
|
252 |
return True
|
253 |
except Exception as e:
|
254 |
logger.error(f"Error building search index: {e}")
|
255 |
return False
|
256 |
|
257 |
+
def expand_query(self, query):
|
258 |
+
"""Add related terms to query for better recall"""
|
259 |
+
# This is a simple implementation - could be enhanced with a proper synonym API
|
260 |
+
expanded_terms = []
|
261 |
+
|
262 |
+
# Add some common expansions for document search
|
263 |
+
if "exam" in query.lower():
|
264 |
+
expanded_terms.extend(["test", "assessment", "quiz", "paper"])
|
265 |
+
elif "document" in query.lower():
|
266 |
+
expanded_terms.extend(["file", "paper", "report"])
|
267 |
+
elif "manual" in query.lower():
|
268 |
+
expanded_terms.extend(["guide", "instruction", "documentation"])
|
269 |
+
|
270 |
+
# Return original query plus expanded terms
|
271 |
+
if expanded_terms:
|
272 |
+
return f"{query} {' '.join(expanded_terms)}"
|
273 |
+
return query
|
274 |
+
|
275 |
+
def search(self, query, top_k=5, search_chunks=True):
|
276 |
+
"""Enhanced search with both document and chunk-level search"""
|
277 |
if self.vectors is None:
|
278 |
return []
|
279 |
|
280 |
try:
|
281 |
+
# Expand the query for better recall
|
282 |
+
expanded_query = self.expand_query(query)
|
283 |
+
|
284 |
+
# Transform the query
|
285 |
+
query_vector = self.vectorizer.transform([expanded_query])
|
286 |
|
287 |
results = []
|
288 |
+
|
289 |
+
# First search at document level for higher-level matches
|
290 |
+
if self.vectors is not None:
|
291 |
+
doc_similarities = cosine_similarity(query_vector, self.vectors).flatten()
|
292 |
+
top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
|
293 |
+
|
294 |
+
for i, idx in enumerate(top_doc_indices):
|
295 |
+
if doc_similarities[idx] > 0.1: # Threshold to exclude irrelevant results
|
296 |
+
results.append({
|
297 |
+
'file_info': self.file_metadata[idx],
|
298 |
+
'score': float(doc_similarities[idx]),
|
299 |
+
'rank': i+1,
|
300 |
+
'match_type': 'document',
|
301 |
+
'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
|
302 |
+
})
|
303 |
+
|
304 |
+
# Then search at chunk level for more specific matches if enabled
|
305 |
+
if search_chunks and self.chunk_vectors is not None:
|
306 |
+
chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten()
|
307 |
+
top_chunk_indices = chunk_similarities.argsort()[-top_k*2:][::-1] # Get more chunk results
|
308 |
+
|
309 |
+
# Use a set to avoid duplicate file results
|
310 |
+
seen_files = set(r['file_info']['url'] for r in results)
|
311 |
+
|
312 |
+
for i, idx in enumerate(top_chunk_indices):
|
313 |
+
if chunk_similarities[idx] > 0.15: # Higher threshold for chunks
|
314 |
+
file_index = self.chunk_metadata[idx]['file_index']
|
315 |
+
file_info = self.file_metadata[file_index]
|
316 |
+
|
317 |
+
# Only add if we haven't already included this file
|
318 |
+
if file_info['url'] not in seen_files:
|
319 |
+
seen_files.add(file_info['url'])
|
320 |
+
results.append({
|
321 |
+
'file_info': file_info,
|
322 |
+
'score': float(chunk_similarities[idx]),
|
323 |
+
'rank': len(results) + 1,
|
324 |
+
'match_type': 'chunk',
|
325 |
+
'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
|
326 |
+
'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
|
327 |
+
})
|
328 |
+
|
329 |
+
# Stop after we've found enough results
|
330 |
+
if len(results) >= top_k*1.5:
|
331 |
+
break
|
332 |
+
|
333 |
+
# Sort combined results by score
|
334 |
+
results.sort(key=lambda x: x['score'], reverse=True)
|
335 |
+
|
336 |
+
# Re-rank and truncate
|
337 |
+
for i, result in enumerate(results[:top_k]):
|
338 |
+
result['rank'] = i+1
|
339 |
+
|
340 |
+
return results[:top_k]
|
341 |
except Exception as e:
|
342 |
logger.error(f"Error during search: {e}")
|
343 |
return []
|
|
|
397 |
html_lower = html_content.lower()
|
398 |
return any(pattern in html_lower for pattern in captcha_patterns)
|
399 |
|
400 |
+
def is_download_link(url):
|
401 |
+
"""Enhanced function to detect if a URL is likely a download link"""
|
402 |
+
# Check for obvious download indicators in URL
|
403 |
+
url_lower = url.lower()
|
404 |
+
|
405 |
+
# Check for common download-related terms in the URL
|
406 |
+
download_terms = [
|
407 |
+
'download', 'dl', 'get', 'file', 'attachment', 'export', 'view',
|
408 |
+
'retrieve', 'fetch', 'load', 'open', 'access', 'doc', 'document'
|
409 |
+
]
|
410 |
+
if any(term in url_lower for term in download_terms):
|
411 |
+
return True
|
412 |
+
|
413 |
+
# Check for common download script patterns
|
414 |
+
script_patterns = [
|
415 |
+
'download.php', 'getfile.php', 'fetch.php', 'view.php', 'dl.php',
|
416 |
+
'download.aspx', 'getfile.aspx', 'file.aspx',
|
417 |
+
'downloadhandler', 'filehandler', 'filedownload',
|
418 |
+
'download.jsp', 'download.cgi', 'download.do',
|
419 |
+
'download-file', 'get-file',
|
420 |
+
'downloadfile', 'getfile', 'viewfile',
|
421 |
+
'Action=downloadfile', 'action=download', 'action=view',
|
422 |
+
'download?', 'file?', 'get?', 'view?'
|
423 |
+
]
|
424 |
+
if any(pattern in url_lower for pattern in script_patterns):
|
425 |
+
return True
|
426 |
+
|
427 |
+
# Check for common file extensions in the URL path or parameters
|
428 |
+
path = urlparse(url).path
|
429 |
+
common_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
|
430 |
+
'.zip', '.rar', '.txt', '.csv', '.json', '.xml', '.jpg',
|
431 |
+
'.png', '.gif', '.mp3', '.mp4', '.avi', '.mov']
|
432 |
+
|
433 |
+
if any(ext in path.lower() for ext in common_extensions):
|
434 |
+
return True
|
435 |
+
|
436 |
+
# Check for file ID or file parameters in URL
|
437 |
+
params = parse_qs(urlparse(url).query)
|
438 |
+
param_keys = params.keys()
|
439 |
+
file_param_indicators = ['file', 'id', 'key', 'filename', 'name', 'fileid', 'attachment', 'attid']
|
440 |
+
if any(key.lower() in file_param_indicators for key in param_keys):
|
441 |
+
return True
|
442 |
+
|
443 |
+
# Check for complex encoding patterns like in the example URL
|
444 |
+
if 'Action=downloadfile' in url or 'fname=' in url:
|
445 |
+
return True
|
446 |
+
|
447 |
+
return False
|
448 |
+
|
449 |
+
def normalize_download_url(url):
|
450 |
+
"""Normalize download URLs to handle various formats and encodings"""
|
451 |
+
try:
|
452 |
+
# Handle common URL shorteners and redirections
|
453 |
+
parsed = urlparse(url)
|
454 |
+
|
455 |
+
# Handle phpMyAdmin-style encoded URLs
|
456 |
+
if 'Action=downloadfile' in url and 'file=' in url:
|
457 |
+
# Extract the encoded file parameter
|
458 |
+
params = parse_qs(parsed.query)
|
459 |
+
if 'file' in params:
|
460 |
+
# This is just a placeholder - in a real implementation,
|
461 |
+
# you would need to handle the specific encoding used
|
462 |
+
encoded_file = params['file'][0]
|
463 |
+
# Keep the URL as is for now, since we'll handle it during download
|
464 |
+
return url
|
465 |
+
|
466 |
+
# Handle URLs with fname parameter (like in the example)
|
467 |
+
if 'fname=' in url:
|
468 |
+
# Keep as is - we'll handle this specially during download
|
469 |
+
return url
|
470 |
+
|
471 |
+
# For other URLs, make sure they are properly quoted
|
472 |
+
path = parsed.path
|
473 |
+
# Only quote the path portion if needed
|
474 |
+
if '%' not in path and ' ' in path:
|
475 |
+
path = quote(path)
|
476 |
+
|
477 |
+
# Reconstruct the URL
|
478 |
+
normalized = parsed._replace(path=path).geturl()
|
479 |
+
return normalized
|
480 |
+
except Exception as e:
|
481 |
+
logger.error(f"Error normalizing URL {url}: {e}")
|
482 |
+
return url
|
483 |
+
|
484 |
# -------------------- Google Drive Functions --------------------
|
485 |
def get_google_auth_url():
|
486 |
client_config = GOOGLE_OAUTH_CONFIG["web"]
|
|
|
573 |
self.request_count = 0
|
574 |
self.captcha_detected = False
|
575 |
self.download_timeout = 300 # 5 minutes timeout for downloads
|
576 |
+
# Track visited URLs to avoid revisiting the same URL multiple times
|
577 |
+
self.visited_urls = set()
|
578 |
+
# Track successfully downloaded files to avoid redownloading
|
579 |
+
self.downloaded_files = set()
|
580 |
|
581 |
async def __aenter__(self):
|
582 |
self.playwright = await async_playwright().start()
|
|
|
857 |
try:
|
858 |
await self.rotate_proxy_if_needed()
|
859 |
|
860 |
+
# For complex download URLs, we need to be careful with HEAD requests
|
861 |
+
if '?' in url or 'Action=downloadfile' in url or 'fname=' in url:
|
862 |
+
# For these URLs, we'll try a more reliable approach using range headers
|
863 |
+
headers = {
|
864 |
+
'User-Agent': get_random_user_agent(),
|
865 |
+
'Range': 'bytes=0-0' # Just request the first byte to check headers
|
866 |
+
}
|
867 |
+
|
868 |
+
try:
|
869 |
+
with requests.get(url, headers=headers, stream=True, timeout=10) as r:
|
870 |
+
if 'Content-Range' in r.headers:
|
871 |
+
content_range = r.headers['Content-Range']
|
872 |
+
match = re.search(r'bytes 0-0/(\d+)', content_range)
|
873 |
+
if match:
|
874 |
+
size = int(match.group(1))
|
875 |
+
return sizeof_fmt(size)
|
876 |
+
|
877 |
+
if 'Content-Length' in r.headers:
|
878 |
+
size = int(r.headers['Content-Length'])
|
879 |
+
# If size is 1, it's likely just our single requested byte
|
880 |
+
if size > 1:
|
881 |
+
return sizeof_fmt(size)
|
882 |
+
except Exception as e:
|
883 |
+
logger.warning(f"Error getting file size with Range request: {e}")
|
884 |
+
|
885 |
+
# Fallback to browser approach
|
886 |
+
try:
|
887 |
+
async with self.context.new_page() as page:
|
888 |
+
response = await page.request.head(url, timeout=15000)
|
889 |
+
length = response.headers.get('Content-Length', None)
|
890 |
+
if length:
|
891 |
+
return sizeof_fmt(int(length))
|
892 |
+
except Exception as e:
|
893 |
+
logger.warning(f"Error getting file size with browser: {e}")
|
894 |
+
|
895 |
+
return "Unknown Size"
|
896 |
+
else:
|
897 |
+
# Standard approach for normal URLs
|
898 |
+
async with self.context.new_page() as page:
|
899 |
+
response = await page.request.head(url, timeout=15000)
|
900 |
+
length = response.headers.get('Content-Length', None)
|
901 |
+
if length:
|
902 |
+
return sizeof_fmt(int(length))
|
903 |
+
else:
|
904 |
+
return "Unknown Size"
|
905 |
except Exception as e:
|
906 |
logger.warning(f"Error getting file size: {e}")
|
907 |
return "Unknown Size"
|
|
|
928 |
return {}
|
929 |
|
930 |
async def extract_real_download_url(self, url):
|
931 |
+
"""Enhanced method to extract real download URL, handling complex URLs"""
|
932 |
try:
|
933 |
+
# Check if this is a complex download URL that needs special handling
|
934 |
+
if 'Action=downloadfile' in url or 'fname=' in url:
|
935 |
+
logger.info(f"Complex download URL detected: {url}")
|
936 |
+
|
937 |
+
# For these special cases, we'll use the browser to navigate and intercept redirects
|
938 |
+
await self.rotate_proxy_if_needed()
|
939 |
+
|
940 |
+
async with self.context.new_page() as page:
|
941 |
+
# Set up request interception to capture redirects
|
942 |
+
await page.route('**', lambda route: route.continue_())
|
943 |
+
|
944 |
+
# Listen for all responses
|
945 |
+
responses = []
|
946 |
+
page.on('response', lambda response: responses.append(response))
|
947 |
+
|
948 |
+
try:
|
949 |
+
# Go to the URL
|
950 |
+
await page.goto(url, wait_until='networkidle', timeout=30000)
|
951 |
+
|
952 |
+
# Check all responses for potential downloads
|
953 |
+
for response in responses:
|
954 |
+
# Look for content-disposition headers indicating a download
|
955 |
+
content_disposition = response.headers.get('Content-Disposition', '')
|
956 |
+
if 'attachment' in content_disposition or 'filename=' in content_disposition:
|
957 |
+
return response.url
|
958 |
+
|
959 |
+
# Look for content-type headers indicating a file
|
960 |
+
content_type = response.headers.get('Content-Type', '')
|
961 |
+
if content_type and content_type != 'text/html' and not content_type.startswith('text/'):
|
962 |
+
return response.url
|
963 |
+
|
964 |
+
# If no clear download was detected, return the final URL
|
965 |
+
return page.url
|
966 |
+
except Exception as e:
|
967 |
+
logger.warning(f"Error extracting real download URL: {e}")
|
968 |
+
return url
|
969 |
+
else:
|
970 |
+
# Standard approach for normal URLs
|
971 |
+
await self.rotate_proxy_if_needed()
|
972 |
+
|
973 |
+
async with self.context.new_page() as page:
|
974 |
+
response = await page.goto(url, wait_until='networkidle', timeout=30000)
|
975 |
+
if response and response.headers.get('location'):
|
976 |
+
return response.headers['location']
|
977 |
+
return page.url
|
978 |
except Exception as e:
|
979 |
logger.error(f"Error extracting real download URL: {e}")
|
980 |
return url
|
|
|
1042 |
if any(full_url.lower().endswith(ext) for ext in
|
1043 |
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
1044 |
links.add(full_url)
|
1045 |
+
|
1046 |
+
# Check for download script parameters
|
1047 |
+
if "Action=downloadfile" in url or "fname=" in url:
|
1048 |
+
links.add(url) # Add the URL itself as it's a download link
|
1049 |
except Exception as e:
|
1050 |
logger.warning(f"Request-based extraction failed: {e}")
|
1051 |
|
1052 |
# Browser-based approach for more thorough extraction or if initial approach was inadequate
|
1053 |
try:
|
1054 |
# Check if we need to proceed with browser-based extraction
|
1055 |
+
if len(links) < 5 or "phsms.cloud.ncnu.edu.tw" in url or "Action=downloadfile" in url:
|
1056 |
logger.info("Using browser for enhanced link extraction")
|
1057 |
|
1058 |
# Rotate proxy if needed
|
|
|
1144 |
['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx', '.zip']):
|
1145 |
links.add(href)
|
1146 |
|
1147 |
+
# Check for download links in the page
|
1148 |
+
download_links = await self.page.evaluate("""
|
1149 |
+
() => {
|
1150 |
+
// Find all links that might be download links
|
1151 |
+
const links = Array.from(document.querySelectorAll('a[href]'));
|
1152 |
+
return links
|
1153 |
+
.filter(a => {
|
1154 |
+
const href = a.href.toLowerCase();
|
1155 |
+
return href.includes('download') ||
|
1156 |
+
href.includes('getfile') ||
|
1157 |
+
href.includes('view.php') ||
|
1158 |
+
href.includes('action=downloadfile') ||
|
1159 |
+
href.includes('fname=');
|
1160 |
+
})
|
1161 |
+
.map(a => a.href);
|
1162 |
+
}
|
1163 |
+
""")
|
1164 |
+
|
1165 |
+
for dl_link in download_links:
|
1166 |
+
links.add(dl_link)
|
1167 |
+
|
1168 |
# Check for ASP.NET specific elements that might contain exam links
|
1169 |
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable, .table-responsive')
|
1170 |
for grid in grid_elements:
|
|
|
1293 |
"/resource/", "/material/", "/notes/", "/subjectmaterial/"
|
1294 |
]):
|
1295 |
filtered_links.append(link)
|
1296 |
+
continue
|
1297 |
+
|
1298 |
+
# Check for download links (these may not have obvious extensions)
|
1299 |
+
if is_download_link(link):
|
1300 |
+
filtered_links.append(link)
|
1301 |
|
1302 |
logger.info(f"Found {len(filtered_links)} potential exam document links")
|
1303 |
return filtered_links
|
|
|
1325 |
}
|
1326 |
}
|
1327 |
|
1328 |
+
// Look for download-related variables in scripts
|
1329 |
+
for (const script of scripts) {
|
1330 |
+
const content = script.textContent || '';
|
1331 |
+
// Look for common patterns for file URLs in JavaScript
|
1332 |
+
if (content.includes('downloadURL') || content.includes('fileURL') ||
|
1333 |
+
content.includes('pdfURL') || content.includes('documentURL')) {
|
1334 |
+
|
1335 |
+
// Extract potential URLs
|
1336 |
+
const potentialUrls = content.match(/["']([^"']+\.(pdf|doc|docx|xls|xlsx|zip|ppt|pptx))["']/gi) || [];
|
1337 |
+
for (let match of potentialUrls) {
|
1338 |
+
const url = match.replace(/["']/g, '');
|
1339 |
+
// Try to resolve relative URLs
|
1340 |
+
if (url.startsWith('/') || !url.includes('://')) {
|
1341 |
+
if (url.startsWith('/')) {
|
1342 |
+
links.add(window.location.origin + url);
|
1343 |
+
} else {
|
1344 |
+
// Handle relative paths more carefully
|
1345 |
+
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
|
1346 |
+
links.add(base + url);
|
1347 |
+
}
|
1348 |
+
} else if (url.startsWith('http')) {
|
1349 |
+
links.add(url);
|
1350 |
+
}
|
1351 |
+
}
|
1352 |
+
}
|
1353 |
+
}
|
1354 |
+
|
1355 |
// Check for links in data attributes
|
1356 |
+
const elements = document.querySelectorAll('*[data-url], *[data-href], *[data-src], *[data-link], *[data-file], *[data-download]');
|
1357 |
for (const el of elements) {
|
1358 |
+
for (const attr of ['data-url', 'data-href', 'data-src', 'data-link', 'data-file', 'data-download']) {
|
1359 |
const val = el.getAttribute(attr);
|
1360 |
+
if (val) {
|
1361 |
+
// Try to resolve relative URLs
|
1362 |
+
if (val.startsWith('/')) {
|
1363 |
+
links.add(window.location.origin + val);
|
1364 |
+
} else if (val.startsWith('http')) {
|
1365 |
+
links.add(val);
|
1366 |
+
} else if (!val.startsWith('javascript:') && !val.startsWith('#')) {
|
1367 |
+
// Handle relative paths
|
1368 |
+
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
|
1369 |
+
links.add(base + val);
|
1370 |
+
}
|
1371 |
}
|
1372 |
}
|
1373 |
}
|
1374 |
|
1375 |
// Look for URLs in inline event handlers
|
1376 |
+
const clickableElements = document.querySelectorAll('*[onclick], *[onmousedown], *[onmouseup], *[href]');
|
1377 |
for (const el of clickableElements) {
|
1378 |
+
for (const attr of ['onclick', 'onmousedown', 'onmouseup', 'href']) {
|
1379 |
const val = el.getAttribute(attr);
|
1380 |
if (val) {
|
1381 |
+
// Check for JavaScript URLs with window.location
|
1382 |
+
if (val.includes('window.location') || val.includes('document.location')) {
|
1383 |
+
const urlMatch = val.match(/location(?:.*)=\s*["']([^"']+)["']/);
|
1384 |
+
if (urlMatch && urlMatch[1]) {
|
1385 |
+
const url = urlMatch[1];
|
1386 |
+
if (url.startsWith('/')) {
|
1387 |
+
links.add(window.location.origin + url);
|
1388 |
+
} else if (url.startsWith('http')) {
|
1389 |
+
links.add(url);
|
1390 |
+
} else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
|
1391 |
+
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
|
1392 |
+
links.add(base + url);
|
1393 |
+
}
|
1394 |
+
}
|
1395 |
+
}
|
1396 |
+
|
1397 |
+
// Check for direct URLs in attributes
|
1398 |
const urlMatches = val.match(/["'](https?:\/\/[^"']+)["']/g) || [];
|
1399 |
for (let match of urlMatches) {
|
1400 |
links.add(match.replace(/["']/g, ''));
|
1401 |
}
|
1402 |
+
|
1403 |
+
// Check for download.php and similar patterns
|
1404 |
+
if (val.includes('download.php') || val.includes('getfile.php') ||
|
1405 |
+
val.includes('Action=downloadfile') || val.includes('viewfile.php')) {
|
1406 |
+
|
1407 |
+
// Handle both onclick handlers and direct hrefs
|
1408 |
+
let url = '';
|
1409 |
+
if (attr === 'href') {
|
1410 |
+
url = val;
|
1411 |
+
} else {
|
1412 |
+
// Extract URL from JavaScript
|
1413 |
+
const jsUrlMatch = val.match(/["']([^"']+(?:download|getfile|viewfile|downloadfile)[^"']*)["']/i);
|
1414 |
+
if (jsUrlMatch) {
|
1415 |
+
url = jsUrlMatch[1];
|
1416 |
+
}
|
1417 |
+
}
|
1418 |
+
|
1419 |
+
// Resolve URL if needed
|
1420 |
+
if (url) {
|
1421 |
+
if (url.startsWith('/')) {
|
1422 |
+
links.add(window.location.origin + url);
|
1423 |
+
} else if (url.startsWith('http')) {
|
1424 |
+
links.add(url);
|
1425 |
+
} else if (!url.startsWith('javascript:') && !url.startsWith('#')) {
|
1426 |
+
const base = window.location.href.substring(0, window.location.href.lastIndexOf('/') + 1);
|
1427 |
+
links.add(base + url);
|
1428 |
+
}
|
1429 |
+
}
|
1430 |
+
}
|
1431 |
}
|
1432 |
}
|
1433 |
}
|
1434 |
|
1435 |
+
// Find PHP/ASP file download links
|
1436 |
+
const fileLinks = document.querySelectorAll('a[href*="download.php"], a[href*="getfile.php"], a[href*="viewfile.php"], a[href*="file.aspx"], a[href*="download.aspx"], a[href*="Action=downloadfile"]');
|
1437 |
+
for (const link of fileLinks) {
|
1438 |
+
links.add(link.href);
|
1439 |
+
}
|
1440 |
+
|
1441 |
return Array.from(links);
|
1442 |
}
|
1443 |
""")
|
|
|
1504 |
for link in shadow_links:
|
1505 |
hidden_links.add(link)
|
1506 |
|
1507 |
+
# Look for download links in forms
|
1508 |
+
form_links = await page.evaluate("""
|
1509 |
+
() => {
|
1510 |
+
const links = new Set();
|
1511 |
+
|
1512 |
+
// Check for form actions that might be download endpoints
|
1513 |
+
const forms = document.querySelectorAll('form');
|
1514 |
+
for (const form of forms) {
|
1515 |
+
const action = form.action || '';
|
1516 |
+
if (action && (
|
1517 |
+
action.includes('download') ||
|
1518 |
+
action.includes('getfile') ||
|
1519 |
+
action.includes('viewfile') ||
|
1520 |
+
action.includes('Action=downloadfile')
|
1521 |
+
)) {
|
1522 |
+
// Collect input values that might be needed for the download
|
1523 |
+
const inputs = {};
|
1524 |
+
const formInputs = form.querySelectorAll('input[name]');
|
1525 |
+
for (const input of formInputs) {
|
1526 |
+
inputs[input.name] = input.value;
|
1527 |
+
}
|
1528 |
+
|
1529 |
+
// Store both the form action and any important inputs
|
1530 |
+
links.add(action);
|
1531 |
+
}
|
1532 |
+
}
|
1533 |
+
|
1534 |
+
return Array.from(links);
|
1535 |
+
}
|
1536 |
+
""")
|
1537 |
+
|
1538 |
+
for link in form_links:
|
1539 |
+
hidden_links.add(link)
|
1540 |
+
|
1541 |
return hidden_links
|
1542 |
|
1543 |
async def extract_downloadable_files(self, url, custom_ext_list):
|
1544 |
found_files = []
|
1545 |
try:
|
1546 |
+
# Normalize the URL to handle special cases
|
1547 |
+
normalized_url = normalize_download_url(url)
|
1548 |
+
|
1549 |
+
# Skip if we've already visited this URL
|
1550 |
+
if normalized_url in self.visited_urls:
|
1551 |
+
logger.info(f"Skipping already visited URL: {normalized_url}")
|
1552 |
+
return []
|
1553 |
+
|
1554 |
+
# Mark this URL as visited
|
1555 |
+
self.visited_urls.add(normalized_url)
|
1556 |
+
|
1557 |
# Rotate proxy if needed
|
1558 |
await self.rotate_proxy_if_needed()
|
1559 |
|
1560 |
+
# First check if this is a direct download link (Action=downloadfile or fname parameter)
|
1561 |
+
if is_download_link(normalized_url):
|
1562 |
+
logger.info(f"Processing potential direct download link: {normalized_url}")
|
1563 |
+
|
1564 |
+
# Try to extract the real download URL if needed
|
1565 |
+
real_url = await self.extract_real_download_url(normalized_url)
|
1566 |
+
|
1567 |
+
# Determine filename - for complex URLs this can be tricky
|
1568 |
+
filename = os.path.basename(urlparse(real_url).path)
|
1569 |
+
|
1570 |
+
# Handle URL-encoded filenames
|
1571 |
+
if '%' in filename:
|
1572 |
+
try:
|
1573 |
+
filename = unquote(filename)
|
1574 |
+
except Exception:
|
1575 |
+
pass
|
1576 |
+
|
1577 |
+
# For URLs with download parameters, try to extract filename from query
|
1578 |
+
if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
|
1579 |
+
# Look for file parameter
|
1580 |
+
params = parse_qs(urlparse(normalized_url).query)
|
1581 |
+
|
1582 |
+
# Check common filename parameters
|
1583 |
+
for param in ['file', 'filename', 'name', 'fname', 'f']:
|
1584 |
+
if param in params and params[param]:
|
1585 |
+
potential_filename = params[param][0]
|
1586 |
+
if potential_filename and '/' not in potential_filename and '\\' not in potential_filename:
|
1587 |
+
filename = os.path.basename(potential_filename)
|
1588 |
+
break
|
1589 |
+
|
1590 |
+
# If still no valid filename, use domain-based fallback
|
1591 |
+
if not filename or filename == '/' or filename.endswith('.php') or filename.endswith('.aspx'):
|
1592 |
+
domain = get_domain(real_url)
|
1593 |
+
# Try to determine file type from content-type or extension hints in URL
|
1594 |
+
ext = '.pdf' # Default
|
1595 |
+
for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.zip']:
|
1596 |
+
if common_ext in normalized_url.lower():
|
1597 |
+
ext = common_ext
|
1598 |
+
break
|
1599 |
+
filename = f"file_from_{domain}{ext}"
|
1600 |
+
|
1601 |
+
# Get file size
|
1602 |
+
size_str = await self.get_file_size(real_url)
|
1603 |
+
|
1604 |
+
# Add to found files
|
1605 |
+
found_files.append({
|
1606 |
+
'url': real_url,
|
1607 |
+
'filename': filename,
|
1608 |
+
'size': size_str,
|
1609 |
+
'metadata': {},
|
1610 |
+
'download_url': normalized_url # Keep original URL for downloading
|
1611 |
+
})
|
1612 |
+
|
1613 |
+
# For direct download links, we can return early
|
1614 |
+
if len(found_files) > 0 and (normalized_url.startswith(url) or real_url.startswith(url)):
|
1615 |
+
return found_files
|
1616 |
+
|
1617 |
# Special handling for educational exam sites
|
1618 |
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
|
1619 |
["exam", "test", "pastpaper", "eduexp"]):
|
|
|
1655 |
'url': real_url,
|
1656 |
'filename': filename,
|
1657 |
'size': size_str,
|
1658 |
+
'metadata': meta,
|
1659 |
+
'download_url': link # Store original link for downloading
|
1660 |
})
|
1661 |
|
1662 |
# If we found exam files with the specialized method, return them
|
|
|
1717 |
'url': real_url,
|
1718 |
'filename': filename,
|
1719 |
'size': await self.get_file_size(real_url),
|
1720 |
+
'metadata': {},
|
1721 |
+
'download_url': final_url # Keep original URL for downloading
|
1722 |
})
|
1723 |
return found_files
|
1724 |
|
|
|
1739 |
for a in soup.find_all('a', href=True):
|
1740 |
href = a['href'].strip()
|
1741 |
|
1742 |
+
if '.php' in href.lower() or 'download' in href.lower() or 'action=' in href.lower():
|
1743 |
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
|
1744 |
real_url = await self.extract_real_download_url(full_url)
|
1745 |
if real_url and real_url != full_url:
|
|
|
1747 |
'url': real_url,
|
1748 |
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
|
1749 |
'size': await self.get_file_size(real_url),
|
1750 |
+
'metadata': {},
|
1751 |
+
'download_url': full_url # Original URL for download
|
1752 |
})
|
1753 |
continue
|
1754 |
|
|
|
1762 |
'url': file_url,
|
1763 |
'filename': os.path.basename(file_url.split('?')[0]),
|
1764 |
'size': size_str,
|
1765 |
+
'metadata': meta,
|
1766 |
+
'download_url': file_url # Same as URL for direct links
|
1767 |
})
|
1768 |
|
1769 |
# Handle Google Drive links
|
|
|
1793 |
'view_only': is_view_only,
|
1794 |
'file_type': file_type,
|
1795 |
'file_id': file_id
|
1796 |
+
},
|
1797 |
+
'download_url': href # Same as URL for Google Drive
|
1798 |
})
|
1799 |
|
1800 |
# Also check for files in other elements (iframe, embed, object, etc.)
|
|
|
...
                    'url': file_url,
                    'filename': os.path.basename(file_url.split('?')[0]),
                    'size': size_str,
+                    'metadata': meta,
+                    'download_url': file_url
                })

        # Check for file links in onclick attributes
...
                        'url': url_match,
                        'filename': os.path.basename(url_match.split('?')[0]),
                        'size': size_str,
+                        'metadata': meta,
+                        'download_url': url_match
                    })

        # Also check for data-src and data-url attributes (common in lazy-loaded sites)
...
                        'url': file_url,
                        'filename': os.path.basename(file_url.split('?')[0]),
                        'size': await self.get_file_size(file_url),
+                        'metadata': {},
+                        'download_url': file_url
                    })
                except:
                    pass
...
                        'url': json_url,
                        'filename': os.path.basename(json_url.split('?')[0]),
                        'size': await self.get_file_size(json_url),
+                        'metadata': {},
+                        'download_url': json_url
                    })
                except:
                    pass
...
                    'url': href,
                    'filename': os.path.basename(href.split('?')[0]),
                    'size': await self.get_file_size(href),
+                    'metadata': {},
+                    'download_url': href
                })

        # Check for hidden links that might be in JavaScript, iframes, or dynamic content
...
                    'url': link,
                    'filename': os.path.basename(link.split('?')[0]),
                    'size': await self.get_file_size(link),
+                    'metadata': {},
+                    'download_url': link
                })

        # Deduplicate files by URL
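The deduplication step announced by the comment above is collapsed in this view. One plausible implementation, keyed on the download URL so the same file reached through different links is kept only once (a sketch, not necessarily the exact logic in app.py):

```python
def deduplicate_files(found_files):
    """Keep the first entry seen for each download URL."""
    seen = set()
    unique_files = []
    for info in found_files:
        key = info.get('download_url') or info['url']
        if key not in seen:
            seen.add(key)
            unique_files.append(info)
    return unique_files
```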
...
            return []

    async def download_file(self, file_info, save_dir, referer):
+        file_url = file_info.get('download_url', file_info['url'])  # Use download_url if available
        fname = file_info['filename']
        path = os.path.join(save_dir, fname)
        base, ext = os.path.splitext(fname)
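Between `base, ext = os.path.splitext(fname)` and the `counter += 1` that follows, the collision-handling loop is collapsed in this view. The usual pattern, and roughly what the surrounding lines imply, is to append a counter until the target path is free (a sketch under that assumption):

```python
import os

def unique_path(save_dir: str, fname: str) -> str:
    """Return save_dir/fname, appending _1, _2, ... if the name is already taken."""
    base, ext = os.path.splitext(fname)
    path = os.path.join(save_dir, fname)
    counter = 1
    while os.path.exists(path):
        path = os.path.join(save_dir, f"{base}_{counter}{ext}")
        counter += 1
    return path
```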
...
                counter += 1
        os.makedirs(save_dir, exist_ok=True)

+        # Check if we've already downloaded this file
+        if file_url in self.downloaded_files:
+            logger.info(f"File already downloaded: {file_url}")
+            return None
+
        try:
            # Special handling for Google Drive files
            if "drive.google.com" in file_url or "docs.google.com" in file_url:
...
                    logger.info(f"Attempting to download view-only file: {file_url}")
                    result_path = await self.force_download_viewonly(file_info, path)
                    if result_path:
+                        self.downloaded_files.add(file_url)
                        return result_path

                    # If that failed, try the regular download approach
...
                # Try regular download methods
                success = await self.download_from_google_drive(file_url, path)
                if success:
+                    self.downloaded_files.add(file_url)
                    return path

                # If all methods failed for Google Drive, try one last approach
                logger.warning("All standard methods failed, attempting force download")
                result_path = await self.force_download_viewonly(file_info, path)
+                if result_path:
+                    self.downloaded_files.add(file_url)
                return result_path if result_path else None

+            # Special handling for complex download URLs
+            if 'Action=downloadfile' in file_url or 'fname=' in file_url:
+                logger.info(f"Using browser download approach for complex URL: {file_url}")
+
+                # For these URLs, we'll need to navigate to the page and handle the download
+                await self.rotate_proxy_if_needed()
+
+                page = await self.context.new_page()
+                # Start listening for the download before navigating
+                download_promise = asyncio.ensure_future(page.wait_for_event("download"))
+                try:
+                    # Navigate to the URL (navigation may abort once a download starts)
+                    try:
+                        await page.goto(file_url, timeout=60000)
+                    except Exception:
+                        pass
+
+                    # Wait for the download to start
+                    try:
+                        download = await download_promise
+                        await download.save_as(path)
+
+                        if os.path.exists(path) and os.path.getsize(path) > 0:
+                            self.downloaded_files.add(file_url)
+                            return path
+                    except Exception as e:
+                        logger.error(f"Browser download failed: {e}")
+
+                        # If download didn't start automatically, try to find and click download buttons
+                        download_buttons = await page.query_selector_all('input[type="submit"], button[type="submit"], a.btn, a[href*="download"]')
+                        for button in download_buttons:
+                            try:
+                                await button.click()
+                                try:
+                                    download = await page.wait_for_event("download", timeout=10000)
+                                    await download.save_as(path)
+                                    if os.path.exists(path) and os.path.getsize(path) > 0:
+                                        self.downloaded_files.add(file_url)
+                                        return path
+                                except Exception:
+                                    pass
+                            except Exception:
+                                continue
+                finally:
+                    if not download_promise.done():
+                        download_promise.cancel()
+                    await page.close()
+
+                # If browser approach failed, try direct request as last resort
+                logger.info("Browser approach failed, trying direct request")
+
            # Rotate proxy if needed
            await self.rotate_proxy_if_needed()
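Playwright's Python API also exposes `page.expect_download()` as an async context manager, which registers the listener before the triggering action and removes the need to juggle a separate `wait_for_event` future. A condensed, self-contained sketch of the same capture-by-navigation idea using that API (function name, paths and timeouts here are illustrative, not part of app.py):

```python
import asyncio
import os
from playwright.async_api import async_playwright

async def capture_download(url, save_path):
    """Navigate to a URL that triggers a download and save the resulting file."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page(accept_downloads=True)
        try:
            async with page.expect_download(timeout=60000) as download_info:
                # Navigation that turns into a download may raise; the listener
                # registered by expect_download still receives the event.
                try:
                    await page.goto(url, timeout=60000)
                except Exception:
                    pass
            download = await download_info.value
            os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
            await download.save_as(save_path)
            return save_path if os.path.getsize(save_path) > 0 else None
        finally:
            await browser.close()

# Example: asyncio.run(capture_download("https://example.com/get?fname=paper.pdf", "./downloads/paper.pdf"))
```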
...
            # Verify file was downloaded correctly
            if os.path.exists(path) and os.path.getsize(path) > 0:
+                self.downloaded_files.add(file_url)
                return path
        except Exception as e:
            logger.warning(f"Direct download failed: {e}, trying browser approach")
...
                    content = await response.body()
                    with open(path, 'wb') as f:
                        f.write(content)
+                    if os.path.exists(path) and os.path.getsize(path) > 0:
+                        self.downloaded_files.add(file_url)
+                        return path
                else:
                    logger.error(f"Download failed with status {response.status}: {file_url}")
...
                await download.save_as(path)

                if os.path.exists(path) and os.path.getsize(path) > 0:
+                    self.downloaded_files.add(file_url)
                    return path
        except Exception as e:
            logger.error(f"Browser download manager approach failed: {e}")
...
        try:
            logger.info(f"Fetching sublinks from: {url}")

+            # Check if this is a direct download link
+            if is_download_link(url):
+                logger.info(f"URL appears to be a direct download link: {url}")
+                links.add(url)
+                return list(links)[:limit]
+
+            # Skip if we've already visited this URL
+            normalized_url = normalize_download_url(url)
+            if normalized_url in self.visited_urls:
+                logger.info(f"Skipping already visited URL for sublink extraction: {normalized_url}")
+                return list(links)[:limit]
+
+            # Add to visited URLs
+            self.visited_urls.add(normalized_url)
+
            # Special handling for educational sites like phsms.cloud.ncnu.edu.tw
            if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
                                                       ["exam", "test", "pastpaper", "eduexp"]):
...
            await self.rotate_proxy_if_needed()

            # Standard sublink extraction for all sites
+            try:
+                await self.page.goto(url, timeout=30000, wait_until='networkidle')
+            except Exception as e:
+                logger.warning(f"Error navigating to URL for sublink extraction: {e}")
+                # Continue with what we have, we'll try to extract links anyway
+
            # Get base URL for resolving relative links
            parsed_base = urlparse(url)
            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
...
                    if href and not href.startswith('javascript:'):
                        links.add(href)

+            # Find all download links
+            download_links = await self.page.evaluate("""
+                () => {
+                    return Array.from(document.querySelectorAll('a[href]'))
+                        .filter(a => {
+                            const href = a.href.toLowerCase();
+                            return href.includes('download') ||
+                                   href.includes('file') ||
+                                   href.includes('get') ||
+                                   href.includes('view.php') ||
+                                   href.includes('action=') ||
+                                   href.includes('fname=');
+                        })
+                        .map(a => a.href);
+                }
+            """)
+
+            for download_link in download_links:
+                links.add(download_link)
+
+            # Also check for hidden links in JavaScript, iframes, or dynamic content
+            js_links = await self.discover_hidden_links(self.page)
+            for link in js_links:
+                links.add(link)
+
            logger.info(f"Found {len(links)} sublinks")
+
+            # Prioritize download links
+            prioritized_links = []
+            normal_links = []
+
+            for link in links:
+                if is_download_link(link):
+                    prioritized_links.append(link)
+                else:
+                    normal_links.append(link)
+
+            # Return prioritized links first, then normal links, up to the limit
+            result = prioritized_links + normal_links
+            return result[:limit]

        except Exception as e:
            logger.error(f"Error getting sublinks from {url}: {e}")
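`discover_hidden_links` is called above but defined outside this excerpt. A hedged sketch of the kind of scan it presumably performs, pulling URL-looking strings out of inline scripts, iframes, and lazy-load data attributes via `page.evaluate` (the selector set and regex are assumptions, not the actual implementation):

```python
async def discover_hidden_links(page):
    """Illustrative sketch: collect candidate URLs hidden in scripts, iframes and data-* attributes."""
    return await page.evaluate("""
        () => {
            const found = new Set();
            const urlPattern = /https?:\\/\\/[^\\s"'<>)]+/g;

            // URLs embedded in inline <script> bodies
            for (const script of document.querySelectorAll('script:not([src])')) {
                for (const match of script.textContent.matchAll(urlPattern)) {
                    found.add(match[0]);
                }
            }

            // iframe/embed sources and lazy-load data attributes
            for (const el of document.querySelectorAll('iframe[src], embed[src], [data-src], [data-url]')) {
                const value = el.getAttribute('src') || el.getAttribute('data-src') || el.getAttribute('data-url');
                if (value) found.add(value);
            }

            return Array.from(found);
        }
    """)
```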
...
            file_count_text = st.empty()

            try:
+                # Reset the visited URLs for a fresh deep search
+                self.visited_urls = set()
+
                progress_text.text("Analyzing main page...")
                # Special handling for ASP.NET pages
                is_aspnet = False
...
                except Exception:
                    pass

+                # Check if this URL is a direct download
+                if is_download_link(url):
+                    progress_text.text("URL appears to be a direct download. Analyzing...")
+
+                    # Try to extract file directly
+                    normalized_url = normalize_download_url(url)
+                    file_info = {
+                        'url': normalized_url,
+                        'download_url': normalized_url,
+                        'filename': os.path.basename(urlparse(normalized_url).path) or 'download',
+                        'size': 'Unknown Size',
+                        'metadata': {}
+                    }
+
+                    # Add to visited URLs
+                    self.visited_urls.add(normalized_url)
+                    progress_bar.progress(1.0)
+                    return [file_info]
+
                # Extract files from main page
                main_files = await self.extract_downloadable_files(url, custom_ext_list)
                initial_count = len(main_files)
...
                        progress_bar.progress(progress)

                        try:
+                            # Check if this is a direct download link
+                            if is_download_link(sublink):
+                                # For download links, just add the link directly
+                                normalized_url = normalize_download_url(sublink)
+
+                                # Skip if already visited
+                                if normalized_url in self.visited_urls:
+                                    continue
+
+                                # Mark as visited
+                                self.visited_urls.add(normalized_url)
+
+                                # Get file size if possible
+                                size_str = await self.get_file_size(normalized_url)
+
+                                # Get filename, with fallback to domain-based name
+                                filename = os.path.basename(urlparse(normalized_url).path)
+                                if not filename or filename == '/' or '?' in filename:
+                                    domain = get_domain(normalized_url)
+                                    ext = '.pdf'  # Default extension
+                                    for common_ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.zip']:
+                                        if common_ext in normalized_url.lower():
+                                            ext = common_ext
+                                            break
+                                    filename = f"file_from_{domain}{ext}"
+
+                                # Add file to results
+                                all_files.append({
+                                    'url': normalized_url,
+                                    'download_url': normalized_url,
+                                    'filename': filename,
+                                    'size': size_str,
+                                    'metadata': {}
+                                })
+                                file_count_text.text(f"Found {len(all_files)} total files")
+                                continue
+
+                            # For regular links, use a longer timeout for ASP.NET pages which can be slower
                            sub_timeout = timeout * 2 if is_aspnet else timeout

+                            # Skip already visited URLs
+                            if sublink in self.visited_urls:
+                                continue
+
                            # Extract files from sublink
                            sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
                            all_files.extend(sub_files)
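The filename fallback above calls `get_domain`, another helper not shown in this excerpt. Something along these lines would satisfy the way it is used, turning the host into a filesystem-friendly token (sketch only):

```python
from urllib.parse import urlparse

def get_domain(url: str) -> str:
    """Return a filesystem-friendly version of the URL's host name."""
    netloc = urlparse(url).netloc or "unknown"
    # Drop the port and replace dots so the value can be embedded in a filename
    return netloc.split(':')[0].replace('.', '_')
```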
...
        if custom_ext_list != valid_ext_list:
            st.warning("Invalid extensions ignored. Use format like '.csv'.")

+        # Reset RAG engine for new search
+        st.session_state.rag_indexed = False
+        st.session_state.rag_engine = None
+
+        # Define a function to run the deep search
+        async def run_deep_search():
+            async with DownloadManager(
+                use_proxy=use_proxy,
+                proxy=proxy,
+                use_stealth=use_stealth
+            ) as dm:
+                files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
+                return files

        with st.spinner("Searching for files..."):
+            files = asyncio.run(run_deep_search())

        if files:
            st.session_state.discovered_files = files
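Each button handler above spins up its own event loop through `asyncio.run`, which works as long as Streamlit's script thread has no loop already running. If the call sites keep multiplying, a small wrapper keeps the pattern in one place and falls back to a helper thread when a loop is already active (a sketch, not part of app.py):

```python
import asyncio
import threading

def run_async(coro):
    """Run a coroutine to completion from synchronous Streamlit code."""
    try:
        return asyncio.run(coro)
    except RuntimeError:
        # asyncio.run refuses to start inside a running loop; push the work to a helper thread instead
        result = {}

        def _worker():
            result['value'] = asyncio.run(coro)

        thread = threading.Thread(target=_worker)
        thread.start()
        thread.join()
        return result.get('value')
```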
...
        if st.button("Search Files", key="rag_search_btn") and search_query:
            # Initialize RAG search engine
            if not st.session_state.rag_indexed:
+                rag_search = EnhancedRAGSearch()

                with st.spinner("Indexing files for search..."):
                    # First download files to extract text
...
                            for i, file_info in enumerate(files):
                                # Only process common text-based file formats
                                ext = os.path.splitext(file_info['filename'])[1].lower()
+                                if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']:
                                    path = await dm.download_file(file_info, temp_dir, url)
                                    if path:
                                        with open(path, 'rb') as f:
...
                    for result in search_results:
                        file_info = result['file_info']
                        score = result['score']
+                        match_type = result.get('match_type', 'document')
+
                        with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
                            st.write(f"Size: {file_info['size']}")
+                            st.write(f"Match type: {match_type}")
+
+                            # Show language if available
+                            if 'language' in result:
+                                st.write(f"Language: {result['language']}")
+
+                            # Show metadata if available
                            if 'metadata' in file_info and file_info['metadata']:
                                st.write("Metadata:")
                                for k, v in file_info['metadata'].items():
                                    if k != 'file_id':  # Skip technical details
                                        st.write(f"- {k}: {v}")

+                            # Show content preview for chunk matches
+                            if 'chunk_preview' in result:
+                                st.write("Content preview:")
+                                st.text(result['chunk_preview'])
+
                            # Add direct download button
                            if st.button(f"Download this file", key=f"rag_dl_{result['rank']}"):
                                with st.spinner(f"Downloading {file_info['filename']}..."):
...
                    # Create expanders for each result
                    for i, url in enumerate(urls, 1):
                        with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
+                            st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}", on_click=set_deep_search_url, args=(url,))
                else:
                    st.warning("No search results found.")

        asyncio.run(run_search())
+    # Handle deep search - using on_click function to avoid state issues
+    if 'deep_search_url' in st.session_state and st.session_state.deep_search_url:
+        url = st.session_state.deep_search_url
+        st.info(f"Deep searching: {url}")
+
+        # Set up custom extensions
+        custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
+        valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
+
+        # Reset RAG engine for new search
+        st.session_state.rag_indexed = False
+        st.session_state.rag_engine = None
+
+        # Run the deep search
+        async def run_bing_deep_search():
+            async with DownloadManager(
+                use_proxy=use_proxy,
+                proxy=proxy,
+                use_stealth=use_stealth
+            ) as dm:
+                files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
+                return files
+
+        with st.spinner("Searching for files..."):
+            files = asyncio.run(run_bing_deep_search())
+
+        if files:
+            st.session_state.discovered_files = files
+            st.session_state.current_url = url
+            st.success(f"Found {len(files)} files!")

+            # Show files with direct download options
+            download_dir = "./downloads"
+            os.makedirs(download_dir, exist_ok=True)

+            # Individual file display with direct download buttons
+            for i, file in enumerate(files):
+                col1, col2, col3 = st.columns([3, 1, 1])
+                with col1:
+                    filename = file['filename']
+                    size = file['size']
+                    meta = file.get('metadata', {})
+                    file_info = f"{filename} ({size})"
+                    if meta and 'Pages' in meta:
+                        file_info += f" - {meta.get('Pages', '')} pages"
+                    st.markdown(f"**{i+1}. {file_info}**")
+
+                with col2:
+                    # Add direct download button for each file
+                    if st.button(f"Download", key=f"direct_dl_bing_{i}"):
+                        with st.spinner(f"Downloading {filename}..."):
+                            async def download_single_file():
+                                async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                    path = await dm.download_file(file, download_dir, url)
+                                    return path
+
+                            downloaded_path = asyncio.run(download_single_file())
+                            if downloaded_path:
+                                with open(downloaded_path, "rb") as f:
+                                    file_data = f.read()

+                                st.download_button(
+                                    label=f"Save {filename}",
+                                    data=file_data,
+                                    file_name=filename,
+                                    mime=mimetypes.guess_type(downloaded_path)[0] or "application/octet-stream",
+                                    key=f"save_bing_file_{i}"
+                                )
+
+                with col3:
+                    # Add to selection for batch download
+                    if i in st.session_state.selected_files:
+                        if st.button("Unselect", key=f"bing_unselect_{i}"):
+                            st.session_state.selected_files.remove(i)
+                    else:
+                        if st.button("Select", key=f"bing_select_{i}"):
+                            st.session_state.selected_files.append(i)
+
+            # Add RAG Search interface for Bing results
+            st.markdown("### Search Within Discovered Files")
+            search_query = st.text_input("Enter search terms", key="bing_rag_search_query")
+
+            if st.button("Search Files", key="bing_rag_search_btn") and search_query:
+                # Initialize RAG search engine
+                if not st.session_state.rag_indexed:
+                    rag_search = EnhancedRAGSearch()

+                    with st.spinner("Indexing files for search..."):
+                        # First download files to extract text
+                        temp_dir = "./temp_downloads"
+                        os.makedirs(temp_dir, exist_ok=True)
+
+                        async def download_for_indexing():
+                            downloaded = 0
+                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                for i, file_info in enumerate(files):
+                                    # Only process common text-based file formats
+                                    ext = os.path.splitext(file_info['filename'])[1].lower()
+                                    if ext in ['.pdf', '.doc', '.docx', '.txt', '.csv', '.json', '.html', '.htm']:
+                                        path = await dm.download_file(file_info, temp_dir, url)
+                                        if path:
+                                            with open(path, 'rb') as f:
+                                                file_data = f.read()
+
+                                            # Add to search index
+                                            if rag_search.add_file(file_data, file_info):
+                                                downloaded += 1
+
+                                            # Clean up
+                                            os.remove(path)
+                            return downloaded
+
+                        indexed_count = asyncio.run(download_for_indexing())
+                        if indexed_count > 0:
+                            rag_search.build_index()
+                            st.session_state.rag_engine = rag_search
+                            st.session_state.rag_indexed = True
+                            st.success(f"Indexed {indexed_count} files for search")
                        else:
+                            st.warning("Could not index any files. Try with more text-based documents.")
+
+                # Perform the search
+                if st.session_state.rag_indexed:
+                    search_results = st.session_state.rag_engine.search(search_query)
+
+                    if search_results:
+                        st.write(f"Found {len(search_results)} relevant files:")
+
+                        for result in search_results:
+                            file_info = result['file_info']
+                            score = result['score']
+                            match_type = result.get('match_type', 'document')
+
+                            with st.expander(f"{file_info['filename']} (Relevance: {score:.2f})"):
+                                st.write(f"Size: {file_info['size']}")
+                                st.write(f"Match type: {match_type}")
+
+                                # Show language if available
+                                if 'language' in result:
+                                    st.write(f"Language: {result['language']}")
+
+                                # Show metadata if available
+                                if 'metadata' in file_info and file_info['metadata']:
+                                    st.write("Metadata:")
+                                    for k, v in file_info['metadata'].items():
+                                        if k != 'file_id':  # Skip technical details
+                                            st.write(f"- {k}: {v}")
+
+                                # Show content preview for chunk matches
+                                if 'chunk_preview' in result:
+                                    st.write("Content preview:")
+                                    st.text(result['chunk_preview'])
+
+                                # Add direct download button
+                                if st.button(f"Download this file", key=f"bing_rag_dl_{result['rank']}"):
+                                    with st.spinner(f"Downloading {file_info['filename']}..."):
+                                        async def download_search_result():
+                                            async with DownloadManager(use_proxy=use_proxy, proxy=proxy, use_stealth=use_stealth) as dm:
+                                                path = await dm.download_file(file_info, download_dir, url)
+                                                return path
+
+                                        path = asyncio.run(download_search_result())
+                                        if path:
+                                            with open(path, "rb") as f:
+                                                file_data = f.read()
+
+                                            st.download_button(
+                                                label=f"Save {file_info['filename']}",
+                                                data=file_data,
+                                                file_name=file_info['filename'],
+                                                mime=mimetypes.guess_type(path)[0] or "application/octet-stream",
+                                                key=f"save_bing_rag_{result['rank']}"
+                                            )
+                    else:
+                        st.warning("No matching files found for your query.")
+        else:
+            st.warning("No files found.")
+
+        # Reset the deep search URL after processing
+        st.session_state.deep_search_url = None

    # Add a special section for direct Google Drive file download
    st.markdown("---")
...
    # Add footer with attribution
    st.markdown('---')
+    st.markdown('Created by [Euler314](https://github.com/euler314)')
+
+# Helper function for Bing search deep search URL setting
+def set_deep_search_url(url):
+    st.session_state.deep_search_url = url
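These branches read `st.session_state.rag_indexed`, `rag_engine`, `selected_files`, `discovered_files`, and `deep_search_url` before some of them are ever assigned, so an initialization block earlier in `main()` is assumed (it is not visible in this excerpt). A sketch of the kind of guard that keeps those reads from raising, with key names taken from the code above and defaults assumed:

```python
import streamlit as st

def init_session_state():
    """Give every session-state key used by the search UI a safe default."""
    defaults = {
        'discovered_files': [],
        'selected_files': [],
        'current_url': None,
        'deep_search_url': None,
        'rag_engine': None,
        'rag_indexed': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
```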
if __name__ == "__main__":
    main()