Update app.py
app.py
CHANGED
@@ -193,10 +193,371 @@ def nlp_extract_entities(text: str):
 # ---------- AI-enhanced Query Preprocessing -------------
 def ai_preprocess_query(query: str) -> str:
     return query
-
-
-
-
+class DownloadManager:
+    def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
+        self.use_proxy = use_proxy
+        self.proxy = proxy
+        self.query = query
+        self.num_results = num_results
+        self.playwright = None
+        self.browser = None
+        self.context = None
+        self.page = None
+
+    async def __aenter__(self):
+        self.playwright = await async_playwright().start()
+        opts = {"headless": True}
+        if self.use_proxy and self.proxy:
+            opts["proxy"] = {"server": self.proxy}
+        self.browser = await self.playwright.chromium.launch(**opts)
+        self.context = await self.browser.new_context(user_agent=get_random_user_agent())
+        self.page = await self.context.new_page()
+        await self.page.set_extra_http_headers({
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Referer': 'https://www.bing.com/'
+        })
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self.browser:
+            await self.browser.close()
+        if self.playwright:
+            await self.playwright.stop()
+
+    async def get_file_size(self, url):
+        try:
+            page = await self.context.new_page()
+            try:
+                response = await page.request.head(url, timeout=15000)
+                # Playwright lower-cases header names
+                length = response.headers.get('content-length')
+                if length:
+                    return sizeof_fmt(int(length))
+                return "Unknown Size"
+            finally:
+                await page.close()
+        except Exception:
+            return "Unknown Size"
+
+    async def get_pdf_metadata(self, url):
+        try:
+            page = await self.context.new_page()
+            try:
+                resp = await page.request.get(url, timeout=15000)
+                if not resp.ok:
+                    return {}
+                content = await resp.body()
+                reader = PdfReader(BytesIO(content))
+                return {
+                    'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
+                    'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
+                    'Pages': len(reader.pages),
+                }
+            finally:
+                await page.close()
+        except Exception:
+            return {}
+
+    async def extract_real_download_url(self, url):
+        try:
+            page = await self.context.new_page()
+            try:
+                response = await page.goto(url, wait_until='networkidle', timeout=30000)
+                if not response:
+                    return url
+
+                # Check if the response is a redirect
+                if response.headers.get('location'):
+                    return response.headers['location']
+
+                # Check if the response is already a file
+                content_type = response.headers.get('content-type', '')
+                if 'text/html' not in content_type.lower():
+                    return url
+
+                # Look for a meta refresh redirect
+                content = await page.content()
+                soup = BeautifulSoup(content, 'html.parser')
+                meta_refresh = soup.find('meta', {'http-equiv': 'refresh'})
+                if meta_refresh:
+                    refresh_value = meta_refresh.get('content', '')
+                    if 'url=' in refresh_value.lower():
+                        return refresh_value.split('url=')[-1].strip()
+
+                return page.url
+            finally:
+                await page.close()
+        except Exception as e:
+            logger.error(f"Error extracting real download URL: {e}")
+            return url
+
+    async def extract_downloadable_files(self, url, custom_ext_list):
+        found_files = []
+        try:
+            # First try to load the page
+            response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
+            if not response:
+                return []
+
+            final_url = self.page.url
+
+            # Handle redirects and download scripts
+            if '.php' in final_url or 'download' in final_url or 'get' in final_url:
+                real_url = await self.extract_real_download_url(final_url)
+                if real_url != final_url:
+                    content_type = (await self.page.request.head(real_url)).headers.get('content-type', '')
+                    if content_type and 'text/html' not in content_type.lower():
+                        found_files.append({
+                            'url': real_url,
+                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                            'size': await self.get_file_size(real_url),
+                            'metadata': {}
+                        })
+                        return found_files
+
+            await self.page.wait_for_load_state('networkidle', timeout=30000)
+            await human_like_interactions(self.page)
+
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Define extensions to look for
+            default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', '.avi', '.mkv',
+                            '.png', '.jpg', '.jpeg', '.gif', '.xlsx', '.xls', '.ppt', '.pptx', '.txt']
+            all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
+
+            # Parse base URL for relative links
+            parsed_base = urlparse(final_url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            # Find all links
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+
+                # Skip empty or javascript links
+                if not href or href.startswith('javascript:') or href == '#':
+                    continue
+
+                # Handle special cases (PHP scripts, download handlers)
+                if '.php' in href.lower() or 'download' in href.lower() or 'get' in href.lower():
+                    full_url = href if href.startswith('http') else urljoin(base_url, href)
+                    real_url = await self.extract_real_download_url(full_url)
+                    if real_url and real_url != full_url:
+                        size_str = await self.get_file_size(real_url)
+                        found_files.append({
+                            'url': real_url,
+                            'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
+                            'size': size_str,
+                            'metadata': {}
+                        })
+                    continue
+
+                # Handle direct file links
+                if any(href.lower().endswith(ext) for ext in all_exts):
+                    file_url = href if href.startswith('http') else urljoin(base_url, href)
+                    size_str = await self.get_file_size(file_url)
+                    meta = {}
+                    if file_url.lower().endswith('.pdf'):
+                        meta = await self.get_pdf_metadata(file_url)
+                    found_files.append({
+                        'url': file_url,
+                        'filename': os.path.basename(urlparse(file_url).path),
+                        'size': size_str,
+                        'metadata': meta
+                    })
+
+                # Handle Google Drive links
+                elif any(x in href for x in ['drive.google.com', 'docs.google.com']):
+                    file_id = None
+                    for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
+                        match = re.search(pattern, href)
+                        if match:
+                            file_id = match.group(1)
+                            break
+
+                    if file_id:
+                        direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+                        page = await self.context.new_page()
+                        try:
+                            response = await page.request.head(direct_url, timeout=15000)
+                            filename = file_id
+                            content_disposition = response.headers.get('content-disposition', '')
+                            if content_disposition:
+                                filename_match = re.findall('filename="(.+?)"', content_disposition)
+                                if filename_match:
+                                    filename = filename_match[0]
+
+                            found_files.append({
+                                'url': direct_url,
+                                'filename': filename,
+                                'size': await self.get_file_size(direct_url),
+                                'metadata': {}
+                            })
+                        except Exception as e:
+                            logger.error(f"Error processing Google Drive link: {e}")
+                        finally:
+                            await page.close()
+
+            # Make list unique based on URLs
+            seen_urls = set()
+            unique_files = []
+            for f in found_files:
+                if f['url'] not in seen_urls:
+                    seen_urls.add(f['url'])
+                    unique_files.append(f)
+
+            return unique_files
+
+        except Exception as e:
+            logger.error(f"Error extracting files from {url}: {e}")
+            return []
+
+    async def download_file(self, file_info, save_dir, referer):
+        file_url = file_info['url']
+        fname = file_info['filename']
+        path = os.path.join(save_dir, fname)
+
+        # Handle duplicate filenames
+        base, ext = os.path.splitext(fname)
+        counter = 1
+        while os.path.exists(path):
+            path = os.path.join(save_dir, f"{base}_{counter}{ext}")
+            counter += 1
+
+        os.makedirs(save_dir, exist_ok=True)
+
+        try:
+            # Special handling for Google Drive
+            if 'drive.google.com' in file_url:
+                import gdown
+                try:
+                    st.write(f"Downloading from Google Drive: {fname}")
+                    output = gdown.download(file_url, path, quiet=False)
+                    if output:
+                        return path
+                    return None
+                except Exception as e:
+                    logger.error(f"Google Drive download error: {e}")
+                    return None
+
+            # Handle normal downloads
+            page = await self.context.new_page()
+            try:
+                st.write(f"Downloading: {fname}")
+
+                headers = {
+                    'Accept': '*/*',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Referer': referer
+                }
+
+                response = await page.request.get(file_url, headers=headers, timeout=30000)
+
+                if response.status == 200:
+                    content = await response.body()
+                    with open(path, 'wb') as f:
+                        f.write(content)
+                    return path
+                else:
+                    logger.error(f"Download failed with status {response.status}: {file_url}")
+                    return None
+            finally:
+                await page.close()
+
+        except Exception as e:
+            logger.error(f"Error downloading {file_url}: {e}")
+            return None
+
+    async def search_bing(self):
+        if not self.query:
+            return []
+
+        search_query = self.query
+        if "filetype:pdf" not in search_query.lower():
+            search_query += " filetype:pdf"
+
+        search_url = f"https://www.bing.com/search?q={search_query}&count={self.num_results}"
+
+        try:
+            await self.page.goto(search_url, timeout=30000)
+            await self.page.wait_for_selector('li.b_algo', timeout=30000)
+            await human_like_scroll(self.page)
+
+            results = []
+            elements = await self.page.query_selector_all('li.b_algo')
+            for element in elements:
+                link = await element.query_selector('h2 a')
+                if link:
+                    url = await link.get_attribute('href')
+                    if url:
+                        results.append(url)
+
+            return results[:self.num_results]
+
+        except Exception as e:
+            logger.error(f"Bing search error: {e}")
+            return []
+
+    async def get_sublinks(self, url, limit=100):
+        try:
+            await self.page.goto(url, timeout=30000)
+            content = await self.page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            parsed_base = urlparse(url)
+            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+            links = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href'].strip()
+                if href.startswith('http'):
+                    links.add(href)
+                elif href.startswith('/'):
+                    links.add(f"{base_url}{href}")
+
+            return list(links)[:limit]
+
+        except Exception as e:
+            logger.error(f"Error getting sublinks: {e}")
+            return []
+
+    async def deep_search(self, url, custom_ext_list=None, sublink_limit=100):
+        if not custom_ext_list:
+            custom_ext_list = []
+
+        progress_text = st.empty()
+        progress_bar = st.progress(0)
+
+        try:
+            # Search main page
+            progress_text.text("Analyzing main page...")
+            main_files = await self.extract_downloadable_files(url, custom_ext_list)
+
+            # Get and search sublinks
+            progress_text.text("Getting sublinks...")
+            sublinks = await self.get_sublinks(url, sublink_limit)
+
+            if not sublinks:
+                progress_bar.progress(1.0)
+                return main_files
+
+            # Process sublinks (copy so main_files is not mutated)
+            all_files = list(main_files)
+            total_links = len(sublinks)
+
+            for i, sublink in enumerate(sublinks, 1):
+                progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
+                progress_bar.progress(i / total_links)
+
+                sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
+                all_files.extend(sub_files)
+
+            # Make results unique
+            seen_urls = set()
+            unique_files = []
+            for f in all_files:
+                if f['url'] not in seen_urls:
+                    seen_urls.add(f['url'])
+                    unique_files.append(f)
+
+            progress_text.text(f"Found {len(unique_files)} unique files")
+            progress_bar.progress(1.0)
+
+            return unique_files
+
+        except Exception as e:
+            logger.error(f"Deep search error: {e}")
+            return []
 
 # ---------- Main Streamlit UI Implementation -------------
 def main():
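For reference, a minimal usage sketch of the DownloadManager added above, assuming the helpers it references (get_random_user_agent, human_like_scroll, human_like_interactions, sizeof_fmt, logger) are defined earlier in app.py. The run_deep_search wrapper below is hypothetical and not part of this commit.

import asyncio

# Hypothetical driver (not part of this commit): deep-search one URL and
# download every file it finds into ./downloads.
async def run_deep_search(url: str, custom_exts=None):
    async with DownloadManager(use_proxy=False, num_results=5) as dm:
        files = await dm.deep_search(url, custom_ext_list=custom_exts or [])
        saved = []
        for info in files:
            path = await dm.download_file(info, save_dir="downloads", referer=url)
            if path:
                saved.append(path)
        return files, saved

# Streamlit callbacks are synchronous, so the coroutine would typically be run
# with asyncio.run, assuming no event loop is already active in the session:
# found, downloaded = asyncio.run(run_deep_search("https://example.com", [".pdf"]))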