import streamlit as st
st.set_page_config(page_title="Advanced File Downloader", layout="wide")
# Core imports
import os
import subprocess
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import asyncio
import logging
from urllib.parse import urlparse, urljoin, unquote, quote_plus
import re
from pathlib import Path
from io import BytesIO
from contextlib import asynccontextmanager
import random
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import zipfile
import tempfile
import mimetypes
import requests
import datetime
import spacy
import spacy.cli
from spacy.language import Language
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.http  # provides MediaFileUpload used in google_drive_upload
import google.auth.transport.requests
from async_timeout import timeout as async_timeout
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import schedule
import threading
import time
import hashlib
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from sklearn.cluster import KMeans
import numpy as np
import base64
import shutil
from PIL import Image # Make sure to pip install Pillow
# -------------------- Logging Setup --------------------
logging.basicConfig(
filename='advanced_download_log.txt',
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
GOOGLE_OAUTH_CONFIG = {
"web": {
"client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com",
"project_id": "huggingface-449214",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_secret": "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",
"redirect_uris": ["https://euler314-craw-web.hf.space/"]
}
}
# Playwright Setup
def install_playwright_dependencies():
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright")
subprocess.run(['apt-get', 'update', '-y'], check=True)
packages = [
'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0',
'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1',
'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0'
]
subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True)
subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True)
install_playwright_dependencies()
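# NOTE: the apt-get calls above assume a Debian-based container with root
# access (e.g. a Hugging Face Space). On other hosts,
# `python -m playwright install --with-deps chromium` is the more portable way
# to pull in the browser and its system dependencies.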
# Model Loading
@st.cache_resource
def load_models():
try:
# Load spaCy model
try:
nlp = spacy.load("en_core_web_sm")
except OSError:
st.info("Downloading spaCy model...")
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
# Load SentenceTransformer
try:
            # Use a sentence-embedding model; a causal chat LLM is not a valid SentenceTransformer
            semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
st.error(f"Error loading SentenceTransformer: {e}")
semantic_model = None
# Load Transformers pipeline
try:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
except Exception as e:
st.error(f"Error loading Transformers: {e}")
summarizer = None
return nlp, semantic_model, summarizer
except Exception as e:
st.error(f"Error loading models: {e}")
return None, None, None
nlp_model, semantic_model, summarizer = load_models()
# Utility Functions
def get_random_user_agent():
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0',
]
return random.choice(USER_AGENTS)
def sizeof_fmt(num, suffix='B'):
for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
if abs(num) < 1024.0:
return f"{num:3.1f}{unit}{suffix}"
num /= 1024.0
return f"{num:.1f}Y{suffix}"
def create_zip_file(file_paths, output_dir):
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip")
with zipfile.ZipFile(zip_path, 'w') as zipf:
for file_path in file_paths:
zipf.write(file_path, os.path.basename(file_path))
return zip_path
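# Usage sketch for the helpers above (hypothetical paths):
#
#   paths = ["./downloads/exam1.pdf", "./downloads/exam2.pdf"]
#   zip_path = create_zip_file(paths, "./downloads")
#   logger.info(f"Archive: {zip_path} ({sizeof_fmt(os.path.getsize(zip_path))})")
#
# sizeof_fmt(1536) -> '1.5KB'; sizeof_fmt(3_500_000) -> '3.3MB'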
# Google Drive Functions
def get_google_auth_url():
client_config = GOOGLE_OAUTH_CONFIG["web"]
flow = google_auth_oauthlib.flow.Flow.from_client_config(
{"web": client_config},
scopes=["https://www.googleapis.com/auth/drive.file"]
)
flow.redirect_uri = client_config["redirect_uris"][0]
authorization_url, _ = flow.authorization_url(
access_type="offline",
include_granted_scopes="true",
prompt="consent"
)
return authorization_url
def exchange_code_for_credentials(auth_code):
if not auth_code.strip():
return None, "No code provided."
try:
client_config = GOOGLE_OAUTH_CONFIG["web"]
flow = google_auth_oauthlib.flow.Flow.from_client_config(
{"web": client_config},
scopes=["https://www.googleapis.com/auth/drive.file"]
)
flow.redirect_uri = client_config["redirect_uris"][0]
flow.fetch_token(code=auth_code.strip())
creds = flow.credentials
if not creds or not creds.valid:
return None, "Could not validate credentials. Check code and try again."
return creds, "Google Sign-In successful!"
except Exception as e:
return None, f"Error during token exchange: {e}"
def google_drive_upload(file_path, credentials, folder_id=None):
try:
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials)
file_metadata = {'name': os.path.basename(file_path)}
if folder_id:
file_metadata['parents'] = [folder_id]
media = googleapiclient.http.MediaFileUpload(file_path, resumable=True)
created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
return created.get("id", "")
except Exception as e:
return f"Error uploading to Drive: {str(e)}"
def create_drive_folder(drive_service, name):
folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'}
folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
return folder.get('id')
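# End-to-end sketch of the Drive flow above (run inside the Streamlit UI;
# `auth_code` is whatever the user pastes back after visiting the auth URL):
#
#   st.markdown(f"[Sign in with Google]({get_google_auth_url()})")
#   creds, msg = exchange_code_for_credentials(auth_code)
#   if creds:
#       file_id = google_drive_upload("/tmp/report.pdf", creds)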
# DownloadManager Class
class DownloadManager:
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5):
self.use_proxy = use_proxy
self.proxy = proxy
self.query = query
self.num_results = num_results
self.playwright = None
self.browser = None
self.context = None
self.page = None
async def __aenter__(self):
self.playwright = await async_playwright().start()
opts = {
"headless": True,
"args": [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--no-zygote',
'--single-process'
]
}
if self.use_proxy and self.proxy:
opts["proxy"] = {"server": self.proxy}
self.browser = await self.playwright.chromium.launch(**opts)
self.context = await self.browser.new_context(user_agent=get_random_user_agent())
self.page = await self.context.new_page()
await self.page.set_extra_http_headers({
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Referer': 'https://www.bing.com/'
})
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self.browser:
await self.browser.close()
if self.playwright:
await self.playwright.stop()
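    @asynccontextmanager
    async def _new_page(self):
        """Async-context wrapper around context.new_page().

        Playwright's new_page() returns a plain coroutine, not an async
        context manager, so the `async with ... as page:` call sites below go
        through this wrapper to guarantee pages are always closed.
        """
        page = await self.context.new_page()
        try:
            yield page
        finally:
            await page.close()
    # Usage sketch for the class (hypothetical driver code):
    #
    #   async def run(query):
    #       async with DownloadManager(query=query, num_results=5) as dm:
    #           for url in await dm.search_bing():
    #               files = await dm.extract_downloadable_files(url, [])
    #
    #   asyncio.run(run("past exam papers filetype:pdf"))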
async def search_bing(self):
urls = []
try:
search_url = f"https://www.bing.com/search?q={self.query}"
await self.page.goto(search_url, timeout=30000)
await self.page.wait_for_load_state('networkidle')
links = await self.page.query_selector_all("li.b_algo h2 a")
for link in links[:self.num_results]:
href = await link.get_attribute('href')
if href:
urls.append(href)
return urls
except Exception as e:
logger.error(f"Error searching Bing: {e}")
return []
async def get_file_size(self, url):
try:
            async with self._new_page() as page:
                response = await page.request.head(url, timeout=15000)
                # Playwright lower-cases response header names
                length = response.headers.get('content-length')
                if length:
                    return sizeof_fmt(int(length))
                return "Unknown Size"
except Exception:
return "Unknown Size"
async def get_pdf_metadata(self, url):
try:
            async with self._new_page() as page:
resp = await page.request.get(url, timeout=15000)
if resp.ok:
content = await resp.body()
pdf = BytesIO(content)
reader = PdfReader(pdf)
return {
'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A',
'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A',
'Pages': len(reader.pages),
}
else:
return {}
except Exception:
return {}
async def extract_real_download_url(self, url):
try:
            async with self._new_page() as page:
response = await page.goto(url, wait_until='networkidle', timeout=30000)
if response and response.headers.get('location'):
return response.headers['location']
return page.url
except Exception as e:
logger.error(f"Error extracting real download URL: {e}")
return url
async def get_edu_exam_links(self, url):
"""Specialized method for educational exam websites that follows a common pattern."""
try:
logger.info(f"Fetching exam links from {url}")
links = set()
            # Use module-level requests + BeautifulSoup for a faster initial scan
headers = {"User-Agent": get_random_user_agent()}
response = requests.get(url, headers=headers, timeout=30)
if response.status_code != 200:
logger.warning(f"Failed to fetch page: {response.status_code}")
return []
# Parse with BeautifulSoup first for efficiency
soup = BeautifulSoup(response.text, "html.parser")
parsed_base = urlparse(url)
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
# Look for all links
for a in soup.find_all("a", href=True):
href = a["href"]
full_url = urljoin(url, href)
# Special patterns for exam sites
for pattern in ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
"/test/", "/download/", "/files/", "/assignments/"]:
if pattern in full_url.lower():
links.add(full_url)
break
# If we didn't find many links with direct approach, use Playwright for more thorough extraction
if len(links) < 5:
logger.info("Using browser for enhanced link extraction")
await self.page.goto(url, timeout=30000, wait_until='networkidle')
# Check for ASP.NET specific elements that might contain exam links
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
if grid_elements:
for grid in grid_elements:
grid_links = await grid.query_selector_all('a[href]')
for a in grid_links:
href = await a.get_attribute('href')
if href:
full_url = href if href.startswith('http') else urljoin(url, href)
links.add(full_url)
# Try clicking any controls that might reveal more exam links
show_buttons = await self.page.query_selector_all('input[type="button"], button')
for button in show_buttons:
button_text = await button.text_content() or ""
button_value = await button.get_attribute("value") or ""
if any(keyword in (button_text + button_value).lower() for keyword in
["show", "view", "display", "list", "exam", "paper", "test"]):
try:
await button.click()
await self.page.wait_for_timeout(1000)
await self.page.wait_for_load_state('networkidle', timeout=5000)
# Get any new links that appeared
new_links = await self.page.query_selector_all('a[href]')
for a in new_links:
href = await a.get_attribute('href')
if href:
full_url = href if href.startswith('http') else urljoin(url, href)
links.add(full_url)
except Exception as e:
logger.warning(f"Error clicking button: {e}")
# Filter links to likely contain exam documents
filtered_links = []
for link in links:
# Common file extensions for exam documents
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.zip']):
filtered_links.append(link)
continue
# Common paths for exam documents
if any(pattern in link.lower() for pattern in [
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/",
"/pastpapers/", "/questionpapers/", "/tests/"
]):
filtered_links.append(link)
logger.info(f"Found {len(filtered_links)} potential exam document links")
return filtered_links
except Exception as e:
logger.error(f"Error getting exam links: {e}")
return []
async def extract_downloadable_files(self, url, custom_ext_list):
found_files = []
try:
# Special handling for educational exam sites
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
["exam", "test", "pastpaper", "eduexp"]):
logger.info("Using specialized handler for educational exam site")
# Get direct links to exam files
exam_links = await self.get_edu_exam_links(url)
for link in exam_links:
# Try to resolve any redirection
real_url = await self.extract_real_download_url(link)
filename = os.path.basename(urlparse(real_url).path)
                    # Decode URL-encoded filenames (common on Chinese/international sites)
                    if '%' in filename:
                        filename = unquote(filename)
# Get file size
size_str = await self.get_file_size(real_url)
# Get metadata for PDFs
meta = {}
if real_url.lower().endswith('.pdf'):
try:
meta = await self.get_pdf_metadata(real_url)
except Exception:
pass
found_files.append({
'url': real_url,
'filename': filename,
'size': size_str,
'metadata': meta
})
# If we found exam files with the specialized method, return them
if found_files:
return found_files
# Standard extraction method if specialized method didn't find files
response = await self.page.goto(url, timeout=30000, wait_until='networkidle')
if not response:
return []
final_url = self.page.url
if '.php' in final_url or 'download' in final_url:
real_url = await self.extract_real_download_url(final_url)
if real_url != final_url:
found_files.append({
'url': real_url,
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
'size': await self.get_file_size(real_url),
'metadata': {}
})
return found_files
await self.page.wait_for_load_state('networkidle', timeout=30000)
content = await self.page.content()
soup = BeautifulSoup(content, 'html.parser')
default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4',
'.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx',
'.pptx', '.odt', '.txt']
all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()])
parsed_base = urlparse(final_url)
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
path_base = os.path.dirname(parsed_base.path)
# Process all anchor tags
for a in soup.find_all('a', href=True):
href = a['href'].strip()
if '.php' in href.lower() or 'download' in href.lower():
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
real_url = await self.extract_real_download_url(full_url)
if real_url and real_url != full_url:
found_files.append({
'url': real_url,
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file',
'size': await self.get_file_size(real_url),
'metadata': {}
})
continue
if any(href.lower().endswith(ext) for ext in all_exts):
file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
size_str = await self.get_file_size(file_url)
meta = {}
if file_url.lower().endswith('.pdf'):
meta = await self.get_pdf_metadata(file_url)
found_files.append({
'url': file_url,
'filename': os.path.basename(file_url.split('?')[0]),
'size': size_str,
'metadata': meta
})
# Handle Google Drive links
elif ("drive.google.com" in href) or ("docs.google.com" in href):
file_id = None
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
match = re.search(pattern, href)
if match:
file_id = match.group(1)
break
if file_id:
# Get file info to determine type and view-only status
file_type, is_view_only = await self.get_google_drive_file_info(file_id)
# Create a more informative filename based on info
filename = f"gdrive_{file_id}"
if file_type:
filename = f"{filename}.{file_type}"
size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}")
found_files.append({
'url': href, # Use original URL
'filename': filename,
'size': size_str,
'metadata': {
'view_only': is_view_only,
'file_type': file_type,
'file_id': file_id
}
})
# Also check for files in other elements (iframe, embed, object, etc.)
other_elements = soup.find_all(['iframe', 'embed', 'object', 'source'])
for elem in other_elements:
src = elem.get('src') or elem.get('data')
if src and any(src.lower().endswith(ext) for ext in all_exts):
file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
size_str = await self.get_file_size(file_url)
meta = {}
if file_url.lower().endswith('.pdf'):
meta = await self.get_pdf_metadata(file_url)
found_files.append({
'url': file_url,
'filename': os.path.basename(file_url.split('?')[0]),
'size': size_str,
'metadata': meta
})
# Check for file links in onclick attributes
onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]')
for elem in onclick_elements:
                onclick = await elem.get_attribute('onclick') or ''
                urls = re.findall(r'(https?://[^\'"]+)', onclick)
for url_match in urls:
if any(url_match.lower().endswith(ext) for ext in all_exts):
size_str = await self.get_file_size(url_match)
meta = {}
if url_match.lower().endswith('.pdf'):
meta = await self.get_pdf_metadata(url_match)
found_files.append({
'url': url_match,
'filename': os.path.basename(url_match.split('?')[0]),
'size': size_str,
'metadata': meta
})
seen_urls = set()
unique_files = []
for f in found_files:
if f['url'] not in seen_urls:
seen_urls.add(f['url'])
unique_files.append(f)
return unique_files
except Exception as e:
logger.error(f"Error extracting files from {url}: {e}")
return []
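    # Each entry returned by extract_downloadable_files has the shape:
    #   {'url': str, 'filename': str,
    #    'size': '1.2MB' | 'Unknown Size' | 'View-only',
    #    'metadata': dict}   # PDF info or Google Drive details when available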
async def download_file(self, file_info, save_dir, referer):
file_url = file_info['url']
fname = file_info['filename']
path = os.path.join(save_dir, fname)
base, ext = os.path.splitext(fname)
counter = 1
while os.path.exists(path):
path = os.path.join(save_dir, f"{base}_{counter}{ext}")
counter += 1
os.makedirs(save_dir, exist_ok=True)
try:
# Special handling for Google Drive files
if "drive.google.com" in file_url or "docs.google.com" in file_url:
# Check if it's marked as view-only in metadata
is_view_only = file_info.get('metadata', {}).get('view_only', False)
# For view-only files, try our most robust approach first
if is_view_only:
logger.info(f"Attempting to download view-only file: {file_url}")
result_path = await self.force_download_viewonly(file_info, path)
if result_path:
return result_path
# If that failed, try the regular download approach
logger.info("Primary method failed, trying fallback methods")
# Try regular download methods
success = await self.download_from_google_drive(file_url, path)
if success:
return path
# If all methods failed for Google Drive, try one last approach
logger.warning("All standard methods failed, attempting force download")
result_path = await self.force_download_viewonly(file_info, path)
return result_path if result_path else None
# Original code for non-Google Drive downloads
            async with self._new_page() as page:
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Referer': referer
}
response = await page.request.get(file_url, headers=headers, timeout=30000)
if response.status == 200:
content = await response.body()
with open(path, 'wb') as f:
f.write(content)
return path
else:
logger.error(f"Download failed with status {response.status}: {file_url}")
return None
except Exception as e:
logger.error(f"Error downloading {file_url}: {e}")
return None
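    # Usage sketch (dm is an entered DownloadManager; referer is the page the
    # link was found on):
    #
    #   local_path = await dm.download_file(files[0], "./downloads", url)
    #   if local_path is None:
    #       logger.warning("download failed")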
async def force_download_viewonly(self, file_info, save_path):
"""Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
try:
# Extract file ID
file_id = file_info.get('metadata', {}).get('file_id')
if not file_id:
url = file_info['url']
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']:
match = re.search(pattern, url)
if match:
file_id = match.group(1)
break
if not file_id:
logger.error("Could not extract file ID")
return None
file_type = file_info.get('metadata', {}).get('file_type', 'pdf')
base, ext = os.path.splitext(save_path)
if not ext:
save_path = f"{base}.{file_type}"
logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})")
# Create a dedicated browser instance with better resolution
browser = await self.playwright.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process',
'--disable-site-isolation-trials'
]
)
# Use higher resolution for better quality
context = await browser.new_context(
viewport={'width': 1600, 'height': 1200},
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
device_scale_factor=2.0
)
page = await context.new_page()
try:
# Go to the file view page
logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view")
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000)
await page.wait_for_load_state('networkidle')
await page.wait_for_timeout(5000) # Wait longer for everything to load
# Create temp directory
temp_dir = tempfile.mkdtemp()
# Special handling for PDFs
if file_type.lower() == 'pdf':
# Check if there's a pagination control
pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
# Try multiple methods to extract total pages
total_pages = await page.evaluate("""
() => {
// Method 1: Check page counter text
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
const text = el.textContent || '';
return /\\d+\\s*\\/\\s*\\d+/.test(text);
});
if (pageCounters.length > 0) {
const text = pageCounters[0].textContent || '';
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
if (match && match[2]) return parseInt(match[2]);
}
// Method 2: Check actual page elements
const pageElements = document.querySelectorAll('.drive-viewer-paginated-page');
if (pageElements.length > 0) return pageElements.length;
// Method 3: Look for page thumbnails
const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
if (thumbnails.length > 0) return thumbnails.length;
// Fallback: conservative guess based on UI
return 50; // Safe default when we can't determine
}
""")
logger.info(f"Detected {total_pages} pages in PDF")
if total_pages <= 1:
# Additional check - sometimes the page count detection fails
# Let's double-check by looking for next/previous buttons
next_button = await page.query_selector('button[aria-label="Next page"]')
if next_button:
disabled = await next_button.get_attribute('disabled')
if not disabled:
logger.info("Found next button that's not disabled, document has multiple pages")
total_pages = 100 # Set a high number, we'll stop when we can't go further
# If we still think it's a single page, use a more direct approach
if total_pages <= 1:
# Single page approach
logger.info("Using single-page capture approach")
# Take a screenshot of the current view (should be the full document or first page)
screenshot_path = os.path.join(temp_dir, "page.png")
# Try to screenshot just the document area if we can find it
document_area = await page.query_selector('.drive-viewer-paginated-page')
if document_area:
await document_area.screenshot(path=screenshot_path)
else:
# Otherwise take a full screenshot
await page.screenshot(path=screenshot_path)
# Convert to PDF
from PIL import Image
from reportlab.pdfgen import canvas as pdf_canvas
img = Image.open(screenshot_path)
width, height = img.size
c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
c.drawImage(screenshot_path, 0, 0, width, height)
c.save()
os.remove(screenshot_path)
os.rmdir(temp_dir)
if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
return save_path
return None
# Multi-page approach
logger.info(f"Using multi-page capture approach for {total_pages} pages")
# CRITICAL: We need to go to the first page first
# Check if we need to reset to first page
current_page_text = await page.evaluate("""
() => {
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
const text = el.textContent || '';
return /\\d+\\s*\\/\\s*\\d+/.test(text);
});
if (pageCounters.length > 0) {
return pageCounters[0].textContent || '';
}
return '';
}
""")
current_page = 1
if current_page_text:
match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
if match:
current_page = int(match.group(1))
# If we're not on page 1, go back to first page
if current_page > 1:
logger.info(f"Currently on page {current_page}, navigating back to page 1")
# Look for an input field where we can directly set the page number
page_input = await page.query_selector('input[aria-label="Page"]')
if page_input:
await page_input.fill("1")
await page_input.press("Enter")
await page.wait_for_timeout(1000)
else:
# Use prev button to go back to first page
prev_button = await page.query_selector('button[aria-label="Previous page"]')
if prev_button:
# Keep clicking until we can't anymore
for _ in range(current_page - 1):
try:
await prev_button.click()
await page.wait_for_timeout(500)
except Exception as e:
logger.warning(f"Error clicking prev button: {e}")
break
# Capture each page
screenshots = []
page_num = 1
max_tries = min(total_pages + 10, 200) # Set a reasonable limit
next_button = await page.query_selector('button[aria-label="Next page"]')
# Maximize the PDF view if possible
await page.evaluate("""
() => {
// Try to find and click any "full page" or "maximize" buttons
const fullViewButtons = Array.from(document.querySelectorAll('button'))
.filter(b => b.textContent?.includes('Full') ||
b.getAttribute('aria-label')?.includes('Full') ||
b.getAttribute('aria-label')?.includes('fit page'));
if (fullViewButtons.length > 0) {
fullViewButtons[0].click();
}
}
""")
await page.wait_for_timeout(1000) # Wait for view to adjust
while page_num <= max_tries:
# Wait for the page to be fully loaded
await page.wait_for_timeout(800)
# Take a screenshot of the current page
screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
# Try different methods to identify and capture just the page content
page_content = await page.query_selector('.drive-viewer-paginated-page')
if page_content:
# Found the specific page element
await page_content.screenshot(path=screenshot_path)
else:
# Fall back to screenshot of visible viewport
await page.screenshot(path=screenshot_path)
screenshots.append(screenshot_path)
logger.info(f"Captured page {page_num}")
# Check if we have a disabled next button (reached the end)
if next_button:
is_disabled = await next_button.get_attribute('disabled')
                            if is_disabled is not None:  # presence of the attribute means disabled
logger.info(f"Reached end of document after {page_num} pages")
break
# Click the next button
try:
await next_button.click()
await page.wait_for_timeout(800) # Wait for page transition
page_num += 1
except Exception as e:
logger.error(f"Error clicking next button: {e}")
# Try to get a fresh reference to the button
next_button = await page.query_selector('button[aria-label="Next page"]')
if not next_button:
logger.warning("Next button disappeared, assuming end of document")
break
else:
# Try to find the next button again
next_button = await page.query_selector('button[aria-label="Next page"]')
if not next_button:
logger.warning("Could not find next button, stopping navigation")
break
# Double-check if we've reached the expected total
if page_num >= total_pages:
logger.info(f"Reached expected total of {total_pages} pages")
break
# Combine screenshots into PDF
logger.info(f"Creating PDF from {len(screenshots)} captured pages")
from PIL import Image
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas as pdf_canvas
# Use the size of the first screenshot to set PDF dimensions
if screenshots:
try:
img = Image.open(screenshots[0])
width, height = img.size
c = pdf_canvas.Canvas(save_path, pagesize=(width, height))
for screenshot in screenshots:
try:
if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
img = Image.open(screenshot)
c.drawImage(screenshot, 0, 0, width, height)
c.showPage()
except Exception as e:
logger.error(f"Error adding page to PDF: {e}")
c.save()
# Clean up screenshots
for screenshot in screenshots:
if os.path.exists(screenshot):
os.remove(screenshot)
logger.info(f"Successfully created PDF with {len(screenshots)} pages")
except Exception as e:
logger.error(f"Error creating PDF: {e}")
else:
logger.error("No screenshots captured to create PDF")
else:
# Non-PDF file handling
screenshot_path = os.path.join(temp_dir, "file.png")
await page.screenshot(path=screenshot_path)
if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']:
# For document types, try to export directly
await self.export_google_doc(file_id, file_type, save_path)
else:
# For other types, save the screenshot with appropriate extension
shutil.copy(screenshot_path, save_path)
os.remove(screenshot_path)
# Clean up temp directory
try:
os.rmdir(temp_dir)
                except OSError:
pass
# Close browser
await browser.close()
# Verify file exists and has content
if os.path.exists(save_path) and os.path.getsize(save_path) > 1000:
logger.info(f"Successfully downloaded file to {save_path}")
return save_path
else:
logger.error(f"Generated file is too small or missing: {save_path}")
return None
except Exception as e:
logger.error(f"Error during force download: {e}")
if browser:
await browser.close()
return None
except Exception as e:
logger.error(f"Force download preparation failed: {e}")
return None
async def download_from_google_drive(self, url, save_path):
"""Enhanced method to download from Google Drive with multiple fallback approaches"""
# Extract the file ID from different URL formats
file_id = None
url_patterns = [
r'drive\.google\.com/file/d/([^/]+)',
r'drive\.google\.com/open\?id=([^&]+)',
r'docs\.google\.com/\w+/d/([^/]+)',
r'id=([^&]+)',
r'drive\.google\.com/uc\?id=([^&]+)',
]
for pattern in url_patterns:
match = re.search(pattern, url)
if match:
file_id = match.group(1)
break
if not file_id:
logger.error(f"Could not extract file ID from URL: {url}")
return False
# Determine file type first (important for handling different file types)
file_type, is_view_only = await self.get_google_drive_file_info(file_id)
logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}")
base, ext = os.path.splitext(save_path)
if not ext and file_type:
# Add the correct extension if missing
save_path = f"{base}.{file_type}"
# For view-only files, use specialized approaches
if is_view_only:
# Approach 1: For PDFs, use the JS method
if file_type == 'pdf':
success = await self.download_viewonly_pdf_with_js(file_id, save_path)
if success:
return True
# Approach 2: For Google Docs, Sheets, etc., use export API
if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']:
success = await self.export_google_doc(file_id, file_type, save_path)
if success:
return True
# Approach 3: Try the direct screenshot method for any view-only file
success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type)
if success:
return True
# Try standard approaches for non-view-only files
try:
# Try with gdown first
import gdown
output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True)
if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0:
with open(save_path, 'rb') as f:
content = f.read(100) # Read first 100 bytes
if b'<!DOCTYPE html>' not in content: # Check not HTML error page
logger.info(f"Successfully downloaded with gdown: {url}")
return True
except Exception as e:
logger.warning(f"gdown download failed: {e}")
# Try with requests and session cookies
try:
session = requests.Session()
session.headers.update({'User-Agent': get_random_user_agent()})
# Visit the page first to get cookies
session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30)
# Try download
url = f"https://drive.google.com/uc?id={file_id}&export=download"
response = session.get(url, stream=True, timeout=30)
# Check for confirmation token
confirmation_token = None
for k, v in response.cookies.items():
if k.startswith('download_warning'):
confirmation_token = v
break
# Use confirmation token if found
if confirmation_token:
url = f"{url}&confirm={confirmation_token}"
response = session.get(url, stream=True, timeout=60)
# Check if we're getting HTML instead of the file
content_type = response.headers.get('Content-Type', '')
if 'text/html' in content_type:
logger.warning("Received HTML instead of file - likely download restriction")
else:
with open(save_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=1024*1024):
if chunk:
f.write(chunk)
if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
with open(save_path, 'rb') as f:
content = f.read(100)
if b'<!DOCTYPE html>' not in content:
logger.info("Successfully downloaded with requests session")
return True
except Exception as e:
logger.warning(f"Requests session download failed: {e}")
logger.warning("Standard download methods failed")
return False
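    # Usage sketch:
    #   ok = await dm.download_from_google_drive(
    #       "https://drive.google.com/file/d/<FILE_ID>/view", "out.pdf")
    # Fallback order: view-only JS / export / screenshot methods when the file
    # is view-only, then gdown, then a cookie-aware requests session.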
async def download_viewonly_pdf_with_js(self, file_id, save_path):
"""Download view-only PDF using the enhanced blob image caching technique"""
try:
# Create a dedicated browser instance
browser = await self.playwright.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-web-security'
]
)
context = await browser.new_context(
viewport={'width': 1600, 'height': 1200},
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
accept_downloads=True # Critical for handling the download event
)
page = await context.new_page()
try:
# Step 1: Navigate to the file
logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view")
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000)
await page.wait_for_load_state('networkidle')
await page.wait_for_timeout(5000) # Initial wait for content to load
# Step 2: Estimate the number of pages
estimated_pages = await page.evaluate("""
() => {
// Look for page counter in the interface
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
const text = el.textContent || '';
return /\\d+\\s*\\/\\s*\\d+/.test(text);
});
if (pageCounters.length > 0) {
const text = pageCounters[0].textContent || '';
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/);
if (match && match[2]) return parseInt(match[2]);
}
// If we can't find a counter, check actual pages
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
if (pages.length > 0) return pages.length;
// Default to a reasonable number if we can't determine
return 50;
}
""")
logger.info(f"Estimated number of pages: {estimated_pages}")
# Step 3: Initial scroll to trigger loading
logger.info("Initial scroll to bottom to trigger lazy loading...")
await page.keyboard.press("End")
await page.wait_for_timeout(3000)
# Step 4: Wait for all pages to load by pressing PageDown and checking blob images
logger.info("Waiting for all pages to load...")
max_attempts = min(estimated_pages * 3, 300) # Adjust based on document size
attempt = 0
while attempt < max_attempts:
# Count blob images (which are the PDF pages)
blob_count = await page.evaluate("""
Array.from(document.getElementsByTagName('img'))
.filter(img => img.src.startsWith('blob:') && img.width > 100)
.length
""")
logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
# If we've loaded enough pages or reached estimated count
if blob_count >= estimated_pages:
logger.info("All pages appear to be loaded.")
break
# Press PageDown to scroll further and trigger more loading
await page.keyboard.press("PageDown")
await page.wait_for_timeout(2000) # Wait for content to load
attempt += 1
# Extra wait to ensure everything is fully loaded
await page.wait_for_timeout(5000)
                # Steps 5-6: inject jsPDF and capture the download it triggers.
                # expect_download() registers the listener before pdf.save() runs,
                # so the download event cannot be missed.
                logger.info("Generating PDF from loaded pages...")
                async with page.expect_download() as download_info:
                    result = await page.evaluate(r'''
(function() {
return new Promise((resolve, reject) => {
let script = document.createElement("script");
script.onload = function () {
try {
let pdf = new jsPDF();
let imgs = document.getElementsByTagName("img");
let added = 0;
// First collect and sort all valid blob images
let validImages = [];
for (let i = 0; i < imgs.length; i++) {
let img = imgs[i];
if (!/^blob:/.test(img.src)) continue;
if (img.width < 100 || img.height < 100) continue;
validImages.push(img);
}
// Sort by vertical position
validImages.sort((a, b) => {
const rectA = a.getBoundingClientRect();
const rectB = b.getBoundingClientRect();
return rectA.top - rectB.top;
});
console.log(`Found ${validImages.length} valid page images to add to PDF`);
// Process each image as a page
for (let i = 0; i < validImages.length; i++) {
let img = validImages[i];
let canvas = document.createElement("canvas");
let ctx = canvas.getContext("2d");
canvas.width = img.width;
canvas.height = img.height;
ctx.drawImage(img, 0, 0, img.width, img.height);
let imgData = canvas.toDataURL("image/jpeg", 1.0);
if (added > 0) {
pdf.addPage();
}
pdf.addImage(imgData, 'JPEG', 0, 0);
added++;
}
pdf.save("download.pdf");
resolve({success: true, pageCount: added});
} catch (error) {
reject({success: false, error: error.toString()});
}
};
script.onerror = function() {
reject({success: false, error: "Failed to load jsPDF library"});
};
// Use a reliable CDN
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
document.body.appendChild(script);
});
})();
''')
if not result.get('success'):
logger.error(f"Error in PDF generation: {result.get('error')}")
return False
logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")
                # Step 7: retrieve the download captured by expect_download() above
                download = await download_info.value
# Step 8: Save the downloaded file to the specified path
await download.save_as(save_path)
logger.info(f"Successfully saved PDF to {save_path}")
return os.path.exists(save_path) and os.path.getsize(save_path) > 1000
finally:
await browser.close()
except Exception as e:
logger.error(f"Error in viewonly PDF download process: {e}")
return False
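    # The approach above assumes the Drive viewer renders each PDF page as a
    # blob: <img>; jsPDF (fetched from cdnjs) redraws those images onto canvas
    # pages and calls save(), which fires a normal browser download that
    # Playwright's expect_download() captures.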
async def download_viewonly_with_screenshots(self, file_id, save_path, file_type):
"""Download any view-only file by taking screenshots"""
try:
            async with self._new_page() as page:
# Set high-resolution viewport
await page.set_viewport_size({"width": 1600, "height": 1200})
# Navigate to the file
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000)
# Make sure the file is loaded
await page.wait_for_load_state('networkidle')
await page.wait_for_timeout(3000) # Extra time for rendering
# Create directory for screenshots if multiple pages
base_dir = os.path.dirname(save_path)
base_name = os.path.splitext(os.path.basename(save_path))[0]
screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots")
os.makedirs(screenshots_dir, exist_ok=True)
# Check if it's a multi-page document
is_multi_page = await page.evaluate("""
() => {
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
return pages.length > 1;
}
""")
if is_multi_page and file_type == 'pdf':
# For multi-page PDFs, take screenshots of each page
page_count = await page.evaluate("""
async () => {
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
const container = document.querySelector('.drive-viewer-paginated-scrollable');
if (!container || pages.length === 0) return 0;
// Scroll through to make sure all pages are loaded
const scrollHeight = container.scrollHeight;
const viewportHeight = container.clientHeight;
const scrollStep = viewportHeight;
for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) {
container.scrollTo(0, scrollPos);
await delay(300);
}
// Scroll back to top
container.scrollTo(0, 0);
await delay(300);
return pages.length;
}
""")
logger.info(f"Found {page_count} pages in document")
# Take screenshots of each page
screenshots = []
for i in range(page_count):
# Scroll to page
await page.evaluate(f"""
async () => {{
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
const pages = document.querySelectorAll('.drive-viewer-paginated-page');
if (pages.length <= {i}) return false;
pages[{i}].scrollIntoView();
await delay(500);
return true;
}}
""")
# Take screenshot
screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png")
await page.screenshot(path=screenshot_path, clip={
'x': 0,
'y': 0,
'width': 1600,
'height': 1200
})
screenshots.append(screenshot_path)
# Combine screenshots into PDF
from PIL import Image
from reportlab.pdfgen import canvas
c = canvas.Canvas(save_path)
for screenshot in screenshots:
img = Image.open(screenshot)
width, height = img.size
# Add page to PDF
c.setPageSize((width, height))
c.drawImage(screenshot, 0, 0, width, height)
c.showPage()
c.save()
# Clean up screenshots
for screenshot in screenshots:
os.remove(screenshot)
os.rmdir(screenshots_dir)
return os.path.exists(save_path) and os.path.getsize(save_path) > 0
else:
# For single-page or non-PDF files, just take one screenshot
screenshot_path = os.path.join(screenshots_dir, "screenshot.png")
                    await page.screenshot(path=screenshot_path, full_page=True)
# Convert to requested format if needed
if file_type == 'pdf':
from PIL import Image
from reportlab.pdfgen import canvas
# Create PDF from screenshot
img = Image.open(screenshot_path)
width, height = img.size
c = canvas.Canvas(save_path, pagesize=(width, height))
c.drawImage(screenshot_path, 0, 0, width, height)
c.save()
else:
# Just copy the screenshot to the destination with proper extension
shutil.copy(screenshot_path, save_path)
# Clean up
os.remove(screenshot_path)
os.rmdir(screenshots_dir)
return os.path.exists(save_path) and os.path.getsize(save_path) > 0
except Exception as e:
logger.error(f"Error taking screenshots: {e}")
return False
async def export_google_doc(self, file_id, file_type, save_path):
"""Export Google Docs/Sheets/Slides to downloadable formats"""
try:
# Map file types to export formats
export_formats = {
'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # docx
'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # xlsx
'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # pptx
'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'pdf': 'application/pdf',
}
export_format = export_formats.get(file_type, 'application/pdf')
export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}"
if 'sheet' in file_type or 'xlsx' in file_type:
export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx"
elif 'ppt' in file_type or 'presentation' in file_type:
export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx"
elif file_type == 'pdf':
export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf"
            async with self._new_page() as page:
# Get cookies from the main view page first
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle')
# Now try the export
response = await page.goto(export_url, wait_until='networkidle')
if response.status == 200:
content = await response.body()
with open(save_path, 'wb') as f:
f.write(content)
return os.path.exists(save_path) and os.path.getsize(save_path) > 0
else:
logger.warning(f"Export failed with status {response.status}")
return False
except Exception as e:
logger.error(f"Error exporting Google Doc: {e}")
return False
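    # Export URLs built above, by product (the format parameter varies):
    #   https://docs.google.com/document/d/<ID>/export?format=docx
    #   https://docs.google.com/spreadsheets/d/<ID>/export?format=xlsx
    #   https://docs.google.com/presentation/d/<ID>/export/pptx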
async def get_google_drive_file_info(self, file_id):
"""Get file type and view-only status from Google Drive"""
file_type = None
is_view_only = False
try:
            async with self._new_page() as page:
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000)
# Check if view-only
view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"')
is_view_only = view_only_text is not None
# Check for Google Docs viewer
gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]')
gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]')
gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]')
if gdocs_viewer:
file_type = 'docx'
elif gsheets_viewer:
file_type = 'xlsx'
elif gslides_viewer:
file_type = 'pptx'
else:
# Check for PDF viewer
pdf_viewer = await page.query_selector('embed[type="application/pdf"]')
if pdf_viewer:
file_type = 'pdf'
else:
# Check for image viewer
img_viewer = await page.query_selector('img[src*="googleusercontent.com"]')
if img_viewer:
# Get image type from src
img_src = await img_viewer.get_attribute('src')
if 'jpg' in img_src or 'jpeg' in img_src:
file_type = 'jpg'
elif 'png' in img_src:
file_type = 'png'
else:
file_type = 'jpg' # Default to jpg
else:
# Generic file type fallback
file_type = 'pdf' # Default to PDF
# If still no type, check filename
if not file_type:
title_element = await page.query_selector('div[role="heading"]')
if title_element:
title = await title_element.text_content()
if title:
ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title)
if ext_match:
file_type = ext_match.group(1).lower()
except Exception as e:
logger.error(f"Error getting Google Drive file info: {e}")
file_type = 'pdf' # Default to PDF if we can't determine
return file_type, is_view_only
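    # Usage sketch:
    #   file_type, view_only = await dm.get_google_drive_file_info("<FILE_ID>")
    #   # -> e.g. ('pdf', True) for a view-only PDF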
async def get_sublinks(self, url, limit=10000):
"""Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
links = set()
try:
logger.info(f"Fetching sublinks from: {url}")
# Special handling for educational sites like phsms.cloud.ncnu.edu.tw
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in
["exam", "test", "pastpaper", "eduexp"]):
logger.info("Using specialized exam site sublink extraction")
edu_links = await self.get_edu_exam_links(url)
for link in edu_links:
links.add(link)
# If we found a good number of links with the specialized method, return them
if len(links) > 5:
logger.info(f"Found {len(links)} sublinks with specialized method")
return list(links)[:limit]
# Standard sublink extraction for all sites
await self.page.goto(url, timeout=30000, wait_until='networkidle')
# Get base URL for resolving relative links
parsed_base = urlparse(url)
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
path_base = os.path.dirname(parsed_base.path)
# Check if page has ASP.NET elements which might need special handling
is_aspnet = await self.page.evaluate('''
() => {
return document.querySelector('form#aspnetForm') !== null ||
document.querySelector('input[name="__VIEWSTATE"]') !== null;
}
''')
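            # ASP.NET WebForms pages render a single server-side form (id
            # "aspnetForm") with a hidden __VIEWSTATE input and navigate by
            # posting that form back, so static <a href> extraction alone
            # would miss most of their content.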
if is_aspnet:
logger.info("Detected ASP.NET page, using enhanced extraction method")
# Try to interact with ASP.NET controls that might reveal more links
# Look for dropdowns, buttons, and grid elements
dropdowns = await self.page.query_selector_all('select')
buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button')
# Try interacting with dropdowns first
for dropdown in dropdowns:
try:
# Get all options
options = await self.page.evaluate('''
(dropdown) => {
return Array.from(dropdown.options).map(o => o.value);
}
''', dropdown)
# Try selecting each option
for option in options:
if option:
await dropdown.select_option(value=option)
await self.page.wait_for_timeout(1000)
await self.page.wait_for_load_state('networkidle', timeout=5000)
# Extract any new links that appeared
await self.extract_all_link_types(links, base_url, path_base)
except Exception as e:
logger.warning(f"Error interacting with dropdown: {e}")
# Try clicking buttons (but avoid dangerous ones like "delete")
safe_buttons = []
for button in buttons:
button_text = await button.text_content() or ""
button_value = await button.get_attribute("value") or ""
button_id = await button.get_attribute("id") or ""
combined_text = (button_text + button_value + button_id).lower()
# Skip potentially destructive buttons
if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]):
continue
# Prioritize buttons that might show more content
if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]):
safe_buttons.append(button)
# Click the safe buttons
for button in safe_buttons[:5]: # Limit to first 5 to avoid too many clicks
try:
await button.click()
await self.page.wait_for_timeout(1000)
await self.page.wait_for_load_state('networkidle', timeout=5000)
# Extract any new links that appeared
await self.extract_all_link_types(links, base_url, path_base)
except Exception as e:
logger.warning(f"Error clicking button: {e}")
# Extract links from the initial page state
await self.extract_all_link_types(links, base_url, path_base)
# Look specifically for links inside grid/table views which are common in ASP.NET applications
grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a')
for cell in grid_cells:
try:
href = await cell.get_attribute('href')
if href:
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
links.add(full_url)
except Exception as e:
logger.warning(f"Error extracting grid link: {e}")
# Extract links from onclick attributes and javascript:__doPostBack calls
postback_links = await self.page.evaluate('''
() => {
const results = [];
// Find elements with onclick containing __doPostBack
const elements = document.querySelectorAll('*[onclick*="__doPostBack"]');
for (const el of elements) {
// Extract the postback target
const onclick = el.getAttribute('onclick') || '';
const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/);
if (match && match[1]) {
// Get the visible text to use as description
const text = el.innerText || el.textContent || 'Link';
results.push({
id: match[1],
text: text.trim()
});
}
}
return results;
}
''')
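            # __doPostBack(target, argument) is the client-side helper that
            # WebForms emits: it fills the hidden __EVENTTARGET and
            # __EVENTARGUMENT fields and submits the form. Invoking it in page
            # context below therefore reproduces a user clicking the control.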
# Try interacting with some of the postback links
for postback in postback_links[:10]: # Limit to first 10 to avoid too many interactions
try:
logger.info(f"Trying postback link: {postback['text']} ({postback['id']})")
await self.page.evaluate(f'''
() => {{
if (typeof __doPostBack === 'function') {{
__doPostBack('{postback["id"]}', '');
}}
}}
''')
await self.page.wait_for_timeout(1500)
await self.page.wait_for_load_state('networkidle', timeout=5000)
# Extract any new links that appeared
await self.extract_all_link_types(links, base_url, path_base)
except Exception as e:
logger.warning(f"Error with postback: {e}")
logger.info(f"Found {len(links)} sublinks")
return list(links)[:limit]
except Exception as e:
logger.error(f"Error getting sublinks from {url}: {e}")
return list(links)[:limit] # Return what we have so far
async def extract_all_link_types(self, links_set, base_url, path_base):
"""Extract all types of links from the current page"""
# Get all <a> tag links
a_links = await self.page.query_selector_all('a[href]')
for a in a_links:
try:
href = await a.get_attribute('href')
if href and not href.startswith('javascript:') and not href.startswith('#'):
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
links_set.add(full_url)
except Exception:
pass
# Get iframe sources
iframes = await self.page.query_selector_all('iframe[src]')
for iframe in iframes:
try:
src = await iframe.get_attribute('src')
if src and not src.startswith('javascript:') and not src.startswith('about:'):
full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base)
links_set.add(full_url)
except Exception:
pass
# Get links from onclick attributes that reference URLs
onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]')
for el in onclick_elements:
try:
onclick = await el.get_attribute('onclick')
urls = re.findall(r'(https?://[^\'"]+)', onclick)
for url in urls:
links_set.add(url)
except Exception:
pass
# Look for URLs in data-* attributes
data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]')
for el in data_elements:
for attr in ['data-url', 'data-href', 'data-src']:
try:
value = await el.get_attribute(attr)
if value and not value.startswith('javascript:'):
full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base)
links_set.add(full_url)
except Exception:
pass
# Look for special anchor links that might not have href attributes
special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a')
for anchor in special_anchors:
try:
href = await anchor.get_attribute('href')
if href and not href.startswith('javascript:') and not href.startswith('#'):
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base)
links_set.add(full_url)
except Exception:
pass
def resolve_relative_url(self, relative_url, base_url, path_base):
"""Properly resolve relative URLs considering multiple formats"""
if relative_url.startswith('/'):
# Absolute path relative to domain
return f"{base_url}{relative_url}"
elif relative_url.startswith('./'):
# Explicit relative path
return f"{base_url}{path_base}/{relative_url[2:]}"
elif relative_url.startswith('../'):
# Parent directory
parent_path = '/'.join(path_base.split('/')[:-1])
return f"{base_url}{parent_path}/{relative_url[3:]}"
else:
# Regular relative path
return f"{base_url}{path_base}/{relative_url}"
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60):
if not custom_ext_list:
custom_ext_list = []
progress_text = st.empty()
progress_bar = st.progress(0)
file_count_text = st.empty()
try:
progress_text.text("Analyzing main page...")
# Special handling for ASP.NET pages
is_aspnet = False
try:
await self.page.goto(url, timeout=30000, wait_until='networkidle')
is_aspnet = await self.page.evaluate('''
() => {
return document.querySelector('form#aspnetForm') !== null ||
document.querySelector('input[name="__VIEWSTATE"]') !== null;
}
''')
except Exception:
pass
# Extract files from main page
main_files = await self.extract_downloadable_files(url, custom_ext_list)
initial_count = len(main_files)
file_count_text.text(f"Found {initial_count} files on main page")
# Get sublinks with enhanced method
progress_text.text("Getting sublinks...")
sublinks = await self.get_sublinks(url, sublink_limit)
total_links = len(sublinks)
progress_text.text(f"Found {total_links} sublinks to process")
if not sublinks:
progress_bar.progress(1.0)
return main_files
# Process each sublink
            all_files = list(main_files)  # copy so extend() below doesn't mutate main_files
for i, sublink in enumerate(sublinks, 1):
progress = i / total_links
progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}")
progress_bar.progress(progress)
try:
# Use a longer timeout for ASP.NET pages which can be slower
sub_timeout = timeout * 2 if is_aspnet else timeout
# Extract files from sublink with appropriate timeout
async with async_timeout(sub_timeout):
sub_files = await self.extract_downloadable_files(sublink, custom_ext_list)
all_files.extend(sub_files)
file_count_text.text(f"Found {len(all_files)} total files")
except Exception as e:
logger.warning(f"Error processing sublink {sublink}: {e}")
# Deduplicate files
seen_urls = set()
unique_files = []
for f in all_files:
if f['url'] not in seen_urls:
seen_urls.add(f['url'])
unique_files.append(f)
final_count = len(unique_files)
progress_text.text(f"Deep search complete!")
file_count_text.text(f"Found {final_count} unique files")
progress_bar.progress(1.0)
return unique_files
except Exception as e:
logger.error(f"Deep search error: {e}")
progress_text.text(f"Error during deep search: {str(e)}")
return []
finally:
await asyncio.sleep(2)
if not st.session_state.get('keep_progress', False):
progress_text.empty()
progress_bar.empty()
# Utility Functions for New Features
def extract_keywords(text, n=5):
doc = nlp_model(text)
keywords = [token.text for token in doc if token.is_alpha and not token.is_stop][:n]
return keywords
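# extract_keywords() above just takes the first n content words, so words near
# the start of the text always win. A frequency-ranked variant (a sketch
# reusing the same spaCy nlp_model global; not wired into the UI):
def extract_keywords_by_frequency(text, n=5):
    from collections import Counter
    doc = nlp_model(text)
    counts = Counter(
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    )
    return [lemma for lemma, _ in counts.most_common(n)]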
def analyze_sentiment(text):
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
result = sentiment_analyzer(text[:512])[0]
return result['label'], result['score']
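# analyze_sentiment() above rebuilds the Hugging Face pipeline (and reloads the
# model) on every call. A cached variant (a sketch using functools.lru_cache,
# with the same model as above; not wired into the UI):
from functools import lru_cache

@lru_cache(maxsize=1)
def _get_sentiment_pipeline():
    return pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def analyze_sentiment_cached(text):
    result = _get_sentiment_pipeline()(text[:512])[0]
    return result['label'], result['score']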
def get_file_hash(file_path):
hasher = hashlib.md5()
with open(file_path, 'rb') as f:
hasher.update(f.read())
return hasher.hexdigest()
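# get_file_hash() above reads the whole file into memory, which is wasteful for
# large downloads. A constant-memory variant (a sketch; 1 MiB chunks):
def get_file_hash_chunked(file_path, chunk_size=1 << 20):
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()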
# Main Function
def main():
if 'initialized' not in st.session_state:
st.session_state.initialized = True
st.session_state.discovered_files = []
st.session_state.current_url = None
st.session_state.google_creds = None
st.session_state.selected_files = []
st.session_state.do_deep_search = False
st.session_state.deep_search_url = None
st.session_state.search_results = []
st.title("Advanced File Downloader")
with st.sidebar:
mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select")
with st.expander("Advanced Options", expanded=True):
custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt")
max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page")
sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink")
use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox")
proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input")
with st.expander("Google Drive Integration", expanded=False):
if st.button("Start Google Sign-In", key="google_signin_btn"):
auth_url = get_google_auth_url()
st.markdown(f"[Click here to authorize]({auth_url})")
auth_code = st.text_input("Enter authorization code", key="auth_code_input")
if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code:
creds, msg = exchange_code_for_credentials(auth_code)
st.session_state.google_creds = creds
st.write(msg)
if mode == "Manual URL":
st.header("Manual URL Mode")
url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input")
col1, col2 = st.columns([3, 1])
with col1:
if st.button("Deep Search", use_container_width=True, key="deep_search_btn"):
if url:
custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()]
valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)]
if custom_ext_list != valid_ext_list:
st.warning("Invalid extensions ignored. Use format like '.csv'.")
async def run_deep_search():
async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout)
return files
files = asyncio.run(run_deep_search())
if files:
st.session_state.discovered_files = files
st.session_state.current_url = url
st.success(f"Found {len(files)} files!")
else:
st.warning("No files found.")
if st.session_state.discovered_files:
files = st.session_state.discovered_files
st.success(f"Found {len(files)} files!")
col1, col2 = st.columns([1, 4])
with col1:
if st.button("Select All", key="select_all_btn"):
st.session_state.selected_files = list(range(len(files)))
if st.button("Clear Selection", key="clear_selection_btn"):
st.session_state.selected_files = []
selected_files = st.multiselect("Select files to download", options=list(range(len(files))), default=st.session_state.selected_files, format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})", key="file_multiselect")
st.session_state.selected_files = selected_files
if selected_files:
col1, col2, col3, col4 = st.columns(4)
with col1:
download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input")
with col2:
create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox")
with col3:
delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox")
with col4:
upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox")
if st.button("Download Selected", key="download_btn"):
if not os.path.exists(download_dir):
os.makedirs(download_dir)
async def download_files():
downloaded_paths = []
progress_bar = st.progress(0)
status_text = st.empty()
async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm:
for i, idx in enumerate(selected_files):
progress = (i + 1) / len(selected_files)
file_info = files[idx]
status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})")
progress_bar.progress(progress)
path = await dm.download_file(file_info, download_dir, url)
if path:
downloaded_paths.append(path)
status_text.empty()
progress_bar.empty()
return downloaded_paths
downloaded = asyncio.run(download_files())
if downloaded:
st.success(f"Successfully downloaded {len(downloaded)} files")
if create_zip:
zip_path = create_zip_file(downloaded, download_dir)
st.success(f"Created ZIP file: {zip_path}")
with open(zip_path, "rb") as f:
zip_data = f.read()
st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip")
if upload_to_drive and st.session_state.google_creds:
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds)
folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}")
drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id)
                                if isinstance(drive_id, str) and not drive_id.startswith("Error"):
st.success(f"Uploaded to Google Drive. File ID: {drive_id}")
else:
st.error(drive_id)
if delete_after:
for path in downloaded:
try:
os.remove(path)
except Exception as e:
st.warning(f"Could not delete {path}: {e}")
st.info("Deleted original files after ZIP creation")
else:
for path in downloaded:
with open(path, "rb") as f:
file_data = f.read()
st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path))
elif mode == "Bing Search":
st.header("Bing Search Mode")
query = st.text_input("Enter search query", key="search_query_input")
num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider")
if st.button("Search", key="search_btn"):
if query:
async def run_search():
async with DownloadManager(use_proxy=use_proxy, proxy=proxy, query=query, num_results=num_results) as dm:
with st.spinner("Searching..."):
urls = await dm.search_bing()
if urls:
st.session_state.search_results = urls
st.success(f"Found {len(urls)} results!")
for i, url in enumerate(urls, 1):
with st.expander(f"Result {i}: {url}", expanded=(i == 1)):
if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"):
st.session_state.deep_search_url = url
st.session_state.do_deep_search = True
else:
st.warning("No search results found.")
asyncio.run(run_search())
else: # PDF Summarizer mode
if summarizer is None:
st.error("PDF summarization is not available due to model loading errors.")
else:
st.header("PDF Summarizer")
pdf_url = st.text_input("Enter PDF URL", key="pdf_url_input")
if st.button("Summarize", key="summarize_btn"):
if pdf_url:
with st.spinner("Generating summary..."):
try:
                            response = requests.get(pdf_url, timeout=60)
                            response.raise_for_status()
                            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
                            temp_pdf.write(response.content)
                            temp_pdf.close()  # close before re-opening by name (needed on Windows)
reader = PdfReader(temp_pdf.name)
text = " ".join([page.extract_text() or "" for page in reader.pages])
os.remove(temp_pdf.name)
summary = summarizer(text[:3000], max_length=200, min_length=50, do_sample=False)
st.write("Summary:", summary[0]['summary_text'])
except Exception as e:
st.error(f"Error summarizing PDF: {e}")
if __name__ == "__main__":
main() |