|
import streamlit as st |
|
st.set_page_config(page_title="Advanced File Downloader", layout="wide") |
|
|
|
|
|
import os |
|
import subprocess |
|
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError |
|
import asyncio |
|
import logging |
|
from urllib.parse import urlparse |
|
import re |
|
from pathlib import Path |
|
from io import BytesIO |
|
import random |
|
from bs4 import BeautifulSoup |
|
from PyPDF2 import PdfReader |
|
import zipfile |
|
import tempfile |
|
import mimetypes |
|
import requests |
|
import datetime |
|
import spacy |
|
import spacy.cli |
|
from spacy.language import Language |
|
import google_auth_oauthlib.flow |
|
import googleapiclient.discovery
import googleapiclient.http
|
import google.auth.transport.requests |
|
from async_timeout import timeout as async_timeout |
|
import pandas as pd |
|
from sentence_transformers import SentenceTransformer |
|
from transformers import pipeline |
|
import schedule |
|
import threading |
|
import time |
|
import hashlib |
|
from reportlab.lib.pagesizes import letter |
|
from reportlab.pdfgen import canvas |
|
from sklearn.cluster import KMeans |
|
import numpy as np |
|
import base64 |
|
import shutil |
|
from PIL import Image |
|
|
|
|
|
|
logging.basicConfig( |
|
filename='advanced_download_log.txt', |
|
level=logging.INFO, |
|
format='%(asctime)s - %(levelname)s - %(message)s' |
|
) |
|
logger = logging.getLogger(__name__) |
|
|
|
GOOGLE_OAUTH_CONFIG = { |
|
"web": { |
|
"client_id": "90798824947-u25obg1q844qeikjoh4jdmi579kn9p1c.apps.googleusercontent.com", |
|
"project_id": "huggingface-449214", |
|
"auth_uri": "https://accounts.google.com/o/oauth2/auth", |
|
"token_uri": "https://oauth2.googleapis.com/token", |
|
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", |
|
"client_secret": "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", |
|
"redirect_uris": ["https://euler314-craw-web.hf.space/"] |
|
} |
|
} |
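# NOTE: the client_id/client_secret above are exposed to anyone who can read this file; for any
# real deployment, load the OAuth client configuration from environment variables or st.secrets
# rather than hardcoding it in source.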
|
|
|
|
|
def install_playwright_dependencies(): |
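    # Install the system libraries headless Chromium needs, then the Playwright browser itself.
    # Assumes a Debian/Ubuntu base image with apt-get available and sufficient privileges (as on
    # typical container hosts); with check=True, any failure raises CalledProcessError.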
|
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = os.path.expanduser("~/.cache/ms-playwright") |
|
subprocess.run(['apt-get', 'update', '-y'], check=True) |
|
packages = [ |
|
'libnss3', 'libnss3-tools', 'libnspr4', 'libatk1.0-0', |
|
'libatk-bridge2.0-0', 'libatspi2.0-0', 'libcups2', 'libxcomposite1', |
|
'libxdamage1', 'libdrm2', 'libgbm1', 'libpango-1.0-0' |
|
] |
|
subprocess.run(['apt-get', 'install', '-y', '--no-install-recommends'] + packages, check=True) |
|
subprocess.run(['python3', '-m', 'playwright', 'install', 'chromium'], check=True) |
|
|
|
install_playwright_dependencies() |
|
|
|
|
|
@st.cache_resource |
|
def load_models(): |
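    # Load the spaCy pipeline, sentence-embedding model and summarizer once per process;
    # @st.cache_resource keeps the loaded objects cached across Streamlit reruns.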
|
try: |
|
|
|
try: |
|
nlp = spacy.load("en_core_web_sm") |
|
except OSError: |
|
st.info("Downloading spaCy model...") |
|
spacy.cli.download("en_core_web_sm") |
|
nlp = spacy.load("en_core_web_sm") |
|
|
|
|
|
try: |
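            # 'Qwen/Qwen1.5-0.5B-Chat' is a causal chat-LM checkpoint; a dedicated embedding
            # model such as 'sentence-transformers/all-MiniLM-L6-v2' is usually a lighter and
            # better fit for SentenceTransformer, if changing the model is an option.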
|
semantic_model = SentenceTransformer('Qwen/Qwen1.5-0.5B-Chat') |
|
except Exception as e: |
|
st.error(f"Error loading SentenceTransformer: {e}") |
|
semantic_model = None |
|
|
|
|
|
try: |
|
summarizer = pipeline("summarization", model="facebook/bart-large-cnn") |
|
except Exception as e: |
|
st.error(f"Error loading Transformers: {e}") |
|
summarizer = None |
|
|
|
return nlp, semantic_model, summarizer |
|
except Exception as e: |
|
st.error(f"Error loading models: {e}") |
|
return None, None, None |
|
|
|
nlp_model, semantic_model, summarizer = load_models() |
|
|
|
|
|
def get_random_user_agent(): |
|
USER_AGENTS = [ |
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', |
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15', |
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', |
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0', |
|
] |
|
return random.choice(USER_AGENTS) |
|
|
|
def sizeof_fmt(num, suffix='B'): |
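    # Human-readable size string, e.g. sizeof_fmt(1536) -> "1.5KB".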
|
for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: |
|
if abs(num) < 1024.0: |
|
return f"{num:3.1f}{unit}{suffix}" |
|
num /= 1024.0 |
|
return f"{num:.1f}Y{suffix}" |
|
|
|
def create_zip_file(file_paths, output_dir): |
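    # Bundle the given files into a timestamped ZIP inside output_dir and return the ZIP path.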
|
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") |
|
zip_path = os.path.join(output_dir, f"downloads_{timestamp}.zip") |
|
with zipfile.ZipFile(zip_path, 'w') as zipf: |
|
for file_path in file_paths: |
|
zipf.write(file_path, os.path.basename(file_path)) |
|
return zip_path |
|
|
|
|
|
def get_google_auth_url(): |
|
client_config = GOOGLE_OAUTH_CONFIG["web"] |
|
flow = google_auth_oauthlib.flow.Flow.from_client_config( |
|
{"web": client_config}, |
|
scopes=["https://www.googleapis.com/auth/drive.file"] |
|
) |
|
flow.redirect_uri = client_config["redirect_uris"][0] |
|
authorization_url, _ = flow.authorization_url( |
|
access_type="offline", |
|
include_granted_scopes="true", |
|
prompt="consent" |
|
) |
|
return authorization_url |
|
|
|
def exchange_code_for_credentials(auth_code): |
|
if not auth_code.strip(): |
|
return None, "No code provided." |
|
try: |
|
client_config = GOOGLE_OAUTH_CONFIG["web"] |
|
flow = google_auth_oauthlib.flow.Flow.from_client_config( |
|
{"web": client_config}, |
|
scopes=["https://www.googleapis.com/auth/drive.file"] |
|
) |
|
flow.redirect_uri = client_config["redirect_uris"][0] |
|
flow.fetch_token(code=auth_code.strip()) |
|
creds = flow.credentials |
|
if not creds or not creds.valid: |
|
return None, "Could not validate credentials. Check code and try again." |
|
return creds, "Google Sign-In successful!" |
|
except Exception as e: |
|
return None, f"Error during token exchange: {e}" |
|
|
|
def google_drive_upload(file_path, credentials, folder_id=None): |
|
try: |
|
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=credentials) |
|
file_metadata = {'name': os.path.basename(file_path)} |
|
if folder_id: |
|
file_metadata['parents'] = [folder_id] |
|
media = googleapiclient.http.MediaFileUpload(file_path, resumable=True) |
|
created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute() |
|
return created.get("id", "") |
|
except Exception as e: |
|
return f"Error uploading to Drive: {str(e)}" |
|
|
|
def create_drive_folder(drive_service, name): |
|
folder_metadata = {'name': name, 'mimeType': 'application/vnd.google-apps.folder'} |
|
folder = drive_service.files().create(body=folder_metadata, fields='id').execute() |
|
return folder.get('id') |
|
|
|
|
|
class DownloadManager: |
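    # Async context manager that drives headless Chromium (via Playwright) to search Bing,
    # crawl pages for downloadable files, and fetch them.
    #
    # Minimal usage sketch outside Streamlit (illustrative URL and paths only):
    #
    #     async def example():
    #         async with DownloadManager() as dm:
    #             files = await dm.extract_downloadable_files("https://example.com", [".pdf"])
    #             for info in files:
    #                 await dm.download_file(info, "./downloads", referer="https://example.com")
    #
    #     asyncio.run(example())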
|
def __init__(self, use_proxy=False, proxy=None, query=None, num_results=5): |
|
self.use_proxy = use_proxy |
|
self.proxy = proxy |
|
self.query = query |
|
self.num_results = num_results |
|
self.playwright = None |
|
self.browser = None |
|
self.context = None |
|
self.page = None |
|
|
|
async def __aenter__(self): |
|
self.playwright = await async_playwright().start() |
|
opts = { |
|
"headless": True, |
|
"args": [ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-gpu', |
|
'--no-zygote', |
|
'--single-process' |
|
] |
|
} |
|
if self.use_proxy and self.proxy: |
|
opts["proxy"] = {"server": self.proxy} |
|
self.browser = await self.playwright.chromium.launch(**opts) |
|
self.context = await self.browser.new_context(user_agent=get_random_user_agent()) |
|
self.page = await self.context.new_page() |
|
await self.page.set_extra_http_headers({ |
|
'Accept-Language': 'en-US,en;q=0.9', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Referer': 'https://www.bing.com/' |
|
}) |
|
return self |
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb): |
|
if self.browser: |
|
await self.browser.close() |
|
if self.playwright: |
|
await self.playwright.stop() |
|
|
|
async def search_bing(self): |
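        # Return up to num_results organic result URLs from a Bing search for self.query.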
|
urls = [] |
|
try: |
|
search_url = f"https://www.bing.com/search?q={self.query}" |
|
await self.page.goto(search_url, timeout=30000) |
|
await self.page.wait_for_load_state('networkidle') |
|
links = await self.page.query_selector_all("li.b_algo h2 a") |
|
for link in links[:self.num_results]: |
|
href = await link.get_attribute('href') |
|
if href: |
|
urls.append(href) |
|
return urls |
|
except Exception as e: |
|
logger.error(f"Error searching Bing: {e}") |
|
return [] |
|
|
|
async def get_file_size(self, url): |
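        # HEAD-request the URL and return a human-readable Content-Length, or "Unknown Size".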
|
try: |
|
async with self.context.new_page() as page: |
|
response = await page.request.head(url, timeout=15000) |
|
                # Header names may be normalized to lower case by Playwright, so check both spellings.
                length = response.headers.get('content-length') or response.headers.get('Content-Length')
|
if length: |
|
return sizeof_fmt(int(length)) |
|
else: |
|
return "Unknown Size" |
|
except Exception: |
|
return "Unknown Size" |
|
|
|
async def get_pdf_metadata(self, url): |
|
try: |
|
async with self.context.new_page() as page: |
|
resp = await page.request.get(url, timeout=15000) |
|
if resp.ok: |
|
content = await resp.body() |
|
pdf = BytesIO(content) |
|
reader = PdfReader(pdf) |
|
return { |
|
'Title': reader.metadata.get('/Title', 'N/A') if reader.metadata else 'N/A', |
|
'Author': reader.metadata.get('/Author', 'N/A') if reader.metadata else 'N/A', |
|
'Pages': len(reader.pages), |
|
} |
|
else: |
|
return {} |
|
except Exception: |
|
return {} |
|
|
|
async def extract_real_download_url(self, url): |
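        # Open the link in a throwaway page and return the final (post-redirect) URL.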
|
try: |
|
async with self.context.new_page() as page: |
|
response = await page.goto(url, wait_until='networkidle', timeout=30000) |
|
if response and response.headers.get('location'): |
|
return response.headers['location'] |
|
return page.url |
|
except Exception as e: |
|
logger.error(f"Error extracting real download URL: {e}") |
|
return url |
|
|
|
async def get_edu_exam_links(self, url): |
|
"""Specialized method for educational exam websites that follows a common pattern.""" |
|
try: |
|
logger.info(f"Fetching exam links from {url}") |
|
links = set() |
|
|
|
|
|
import requests |
|
from bs4 import BeautifulSoup |
|
from urllib.parse import urljoin, urlparse |
|
|
|
headers = {"User-Agent": get_random_user_agent()} |
|
response = requests.get(url, headers=headers, timeout=30) |
|
|
|
if response.status_code != 200: |
|
logger.warning(f"Failed to fetch page: {response.status_code}") |
|
return [] |
|
|
|
|
|
soup = BeautifulSoup(response.text, "html.parser") |
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
|
|
|
|
for a in soup.find_all("a", href=True): |
|
href = a["href"] |
|
full_url = urljoin(url, href) |
|
|
|
|
|
for pattern in ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/", |
|
"/test/", "/download/", "/files/", "/assignments/"]: |
|
if pattern in full_url.lower(): |
|
links.add(full_url) |
|
break |
|
|
|
|
|
if len(links) < 5: |
|
logger.info("Using browser for enhanced link extraction") |
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
|
|
|
|
grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable') |
|
if grid_elements: |
|
for grid in grid_elements: |
|
grid_links = await grid.query_selector_all('a[href]') |
|
for a in grid_links: |
|
href = await a.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
links.add(full_url) |
|
|
|
|
|
show_buttons = await self.page.query_selector_all('input[type="button"], button') |
|
for button in show_buttons: |
|
button_text = await button.text_content() or "" |
|
button_value = await button.get_attribute("value") or "" |
|
if any(keyword in (button_text + button_value).lower() for keyword in |
|
["show", "view", "display", "list", "exam", "paper", "test"]): |
|
try: |
|
await button.click() |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
new_links = await self.page.query_selector_all('a[href]') |
|
for a in new_links: |
|
href = await a.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else urljoin(url, href) |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error clicking button: {e}") |
|
|
|
|
|
filtered_links = [] |
|
for link in links: |
|
|
|
if any(ext in link.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.zip']): |
|
filtered_links.append(link) |
|
continue |
|
|
|
|
|
if any(pattern in link.lower() for pattern in [ |
|
"/eduexp/docs/pastexam", "/exam/", "/pastexam/", "/papers/", |
|
"/pastpapers/", "/questionpapers/", "/tests/" |
|
]): |
|
filtered_links.append(link) |
|
|
|
logger.info(f"Found {len(filtered_links)} potential exam document links") |
|
return filtered_links |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting exam links: {e}") |
|
return [] |
|
|
|
async def extract_downloadable_files(self, url, custom_ext_list): |
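        # Collect candidate downloads from a page: direct file links, .php/download redirect
        # scripts, Google Drive links, embedded media, and onclick handlers. For known exam-hosting
        # sites it first tries get_edu_exam_links. Returns a de-duplicated list of dicts with
        # url, filename, size and metadata.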
|
found_files = [] |
|
try: |
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized handler for educational exam site") |
|
|
|
|
|
exam_links = await self.get_edu_exam_links(url) |
|
|
|
for link in exam_links: |
|
|
|
real_url = await self.extract_real_download_url(link) |
|
filename = os.path.basename(urlparse(real_url).path) |
|
|
|
|
|
if '%' in filename: |
|
try: |
|
from urllib.parse import unquote |
|
filename = unquote(filename) |
|
except Exception: |
|
pass |
|
|
|
|
|
size_str = await self.get_file_size(real_url) |
|
|
|
|
|
meta = {} |
|
if real_url.lower().endswith('.pdf'): |
|
try: |
|
meta = await self.get_pdf_metadata(real_url) |
|
except Exception: |
|
pass |
|
|
|
found_files.append({ |
|
'url': real_url, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': meta |
|
}) |
|
|
|
|
|
if found_files: |
|
return found_files |
|
|
|
|
|
response = await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
if not response: |
|
return [] |
|
|
|
final_url = self.page.url |
|
if '.php' in final_url or 'download' in final_url: |
|
real_url = await self.extract_real_download_url(final_url) |
|
if real_url != final_url: |
|
found_files.append({ |
|
'url': real_url, |
|
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {} |
|
}) |
|
return found_files |
|
|
|
await self.page.wait_for_load_state('networkidle', timeout=30000) |
|
content = await self.page.content() |
|
soup = BeautifulSoup(content, 'html.parser') |
|
|
|
default_exts = ['.pdf', '.docx', '.doc', '.zip', '.rar', '.mp3', '.mp4', |
|
'.avi', '.mkv', '.png', '.jpg', '.jpeg', '.gif', '.xlsx', |
|
'.pptx', '.odt', '.txt'] |
|
all_exts = set(default_exts + [ext.strip().lower() for ext in custom_ext_list if ext.strip()]) |
|
|
|
parsed_base = urlparse(final_url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
|
for a in soup.find_all('a', href=True): |
|
href = a['href'].strip() |
|
|
|
if '.php' in href.lower() or 'download' in href.lower(): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
real_url = await self.extract_real_download_url(full_url) |
|
if real_url and real_url != full_url: |
|
found_files.append({ |
|
'url': real_url, |
|
'filename': os.path.basename(urlparse(real_url).path) or 'downloaded_file', |
|
'size': await self.get_file_size(real_url), |
|
'metadata': {} |
|
}) |
|
continue |
|
|
|
if any(href.lower().endswith(ext) for ext in all_exts): |
|
file_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta |
|
}) |
|
|
|
|
|
elif ("drive.google.com" in href) or ("docs.google.com" in href): |
|
file_id = None |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, href) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
if file_id: |
|
|
|
file_type, is_view_only = await self.get_google_drive_file_info(file_id) |
|
|
|
|
|
filename = f"gdrive_{file_id}" |
|
if file_type: |
|
filename = f"{filename}.{file_type}" |
|
|
|
size_str = "View-only" if is_view_only else await self.get_file_size(f"https://drive.google.com/uc?export=download&id={file_id}") |
|
|
|
found_files.append({ |
|
'url': href, |
|
'filename': filename, |
|
'size': size_str, |
|
'metadata': { |
|
'view_only': is_view_only, |
|
'file_type': file_type, |
|
'file_id': file_id |
|
} |
|
}) |
|
|
|
|
|
other_elements = soup.find_all(['iframe', 'embed', 'object', 'source']) |
|
for elem in other_elements: |
|
src = elem.get('src') or elem.get('data') |
|
if src and any(src.lower().endswith(ext) for ext in all_exts): |
|
file_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) |
|
size_str = await self.get_file_size(file_url) |
|
meta = {} |
|
if file_url.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(file_url) |
|
found_files.append({ |
|
'url': file_url, |
|
'filename': os.path.basename(file_url.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta |
|
}) |
|
|
|
|
|
onclick_elements = await self.page.query_selector_all('*[onclick*="download"], *[onclick*="file"]') |
|
for elem in onclick_elements: |
|
                onclick = await elem.get_attribute('onclick') or ''

                urls = re.findall(r'(https?://[^\'"]+)', onclick)
|
for url_match in urls: |
|
if any(url_match.lower().endswith(ext) for ext in all_exts): |
|
size_str = await self.get_file_size(url_match) |
|
meta = {} |
|
if url_match.lower().endswith('.pdf'): |
|
meta = await self.get_pdf_metadata(url_match) |
|
found_files.append({ |
|
'url': url_match, |
|
'filename': os.path.basename(url_match.split('?')[0]), |
|
'size': size_str, |
|
'metadata': meta |
|
}) |
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in found_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
return unique_files |
|
except Exception as e: |
|
logger.error(f"Error extracting files from {url}: {e}") |
|
return [] |
|
|
|
async def download_file(self, file_info, save_dir, referer): |
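        # Download one discovered file into save_dir, de-duplicating filenames; Google Drive
        # URLs are routed through the specialized Drive handlers below.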
|
file_url = file_info['url'] |
|
fname = file_info['filename'] |
|
path = os.path.join(save_dir, fname) |
|
base, ext = os.path.splitext(fname) |
|
counter = 1 |
|
while os.path.exists(path): |
|
path = os.path.join(save_dir, f"{base}_{counter}{ext}") |
|
counter += 1 |
|
os.makedirs(save_dir, exist_ok=True) |
|
|
|
try: |
|
|
|
if "drive.google.com" in file_url or "docs.google.com" in file_url: |
|
|
|
is_view_only = file_info.get('metadata', {}).get('view_only', False) |
|
|
|
|
|
if is_view_only: |
|
logger.info(f"Attempting to download view-only file: {file_url}") |
|
result_path = await self.force_download_viewonly(file_info, path) |
|
if result_path: |
|
return result_path |
|
|
|
|
|
logger.info("Primary method failed, trying fallback methods") |
|
|
|
|
|
success = await self.download_from_google_drive(file_url, path) |
|
if success: |
|
return path |
|
|
|
|
|
logger.warning("All standard methods failed, attempting force download") |
|
result_path = await self.force_download_viewonly(file_info, path) |
|
return result_path if result_path else None |
|
|
|
|
|
async with self.context.new_page() as page: |
|
headers = { |
|
'Accept': '*/*', |
|
'Accept-Encoding': 'gzip, deflate, br', |
|
'Referer': referer |
|
} |
|
response = await page.request.get(file_url, headers=headers, timeout=30000) |
|
if response.status == 200: |
|
content = await response.body() |
|
with open(path, 'wb') as f: |
|
f.write(content) |
|
return path |
|
else: |
|
logger.error(f"Download failed with status {response.status}: {file_url}") |
|
return None |
|
except Exception as e: |
|
logger.error(f"Error downloading {file_url}: {e}") |
|
return None |
|
|
|
async def force_download_viewonly(self, file_info, save_path): |
|
"""Completely rewritten method to handle view-only files reliably, especially multi-page PDFs""" |
|
try: |
|
|
|
file_id = file_info.get('metadata', {}).get('file_id') |
|
if not file_id: |
|
url = file_info['url'] |
|
for pattern in [r'/file/d/([^/]+)', r'id=([^&]+)', r'open\?id=([^&]+)']: |
|
match = re.search(pattern, url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if not file_id: |
|
logger.error("Could not extract file ID") |
|
return None |
|
|
|
file_type = file_info.get('metadata', {}).get('file_type', 'pdf') |
|
base, ext = os.path.splitext(save_path) |
|
if not ext: |
|
save_path = f"{base}.{file_type}" |
|
|
|
logger.info(f"Starting reliable download of Google Drive file {file_id} (type: {file_type})") |
|
|
|
|
|
browser = await self.playwright.chromium.launch( |
|
headless=True, |
|
args=[ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security', |
|
'--disable-features=IsolateOrigins,site-per-process', |
|
'--disable-site-isolation-trials' |
|
] |
|
) |
|
|
|
|
|
context = await browser.new_context( |
|
viewport={'width': 1600, 'height': 1200}, |
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", |
|
device_scale_factor=2.0 |
|
) |
|
|
|
page = await context.new_page() |
|
|
|
try: |
|
|
|
logger.info(f"Opening file view page: https://drive.google.com/file/d/{file_id}/view") |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=90000) |
|
await page.wait_for_load_state('networkidle') |
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
temp_dir = tempfile.mkdtemp() |
|
|
|
|
|
if file_type.lower() == 'pdf': |
|
|
|
pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")') |
|
|
|
|
|
total_pages = await page.evaluate(""" |
|
() => { |
|
// Method 1: Check page counter text |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
const text = pageCounters[0].textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) return parseInt(match[2]); |
|
} |
|
|
|
// Method 2: Check actual page elements |
|
const pageElements = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pageElements.length > 0) return pageElements.length; |
|
|
|
// Method 3: Look for page thumbnails |
|
const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb'); |
|
if (thumbnails.length > 0) return thumbnails.length; |
|
|
|
// Fallback: conservative guess based on UI |
|
return 50; // Safe default when we can't determine |
|
} |
|
""") |
|
|
|
logger.info(f"Detected {total_pages} pages in PDF") |
|
|
|
if total_pages <= 1: |
|
|
|
|
|
next_button = await page.query_selector('button[aria-label="Next page"]') |
|
if next_button: |
|
disabled = await next_button.get_attribute('disabled') |
|
if not disabled: |
|
logger.info("Found next button that's not disabled, document has multiple pages") |
|
total_pages = 100 |
|
|
|
|
|
if total_pages <= 1: |
|
|
|
logger.info("Using single-page capture approach") |
|
|
|
|
|
screenshot_path = os.path.join(temp_dir, "page.png") |
|
|
|
|
|
document_area = await page.query_selector('.drive-viewer-paginated-page') |
|
if document_area: |
|
await document_area.screenshot(path=screenshot_path) |
|
else: |
|
|
|
await page.screenshot(path=screenshot_path) |
|
|
|
|
|
from PIL import Image |
|
from reportlab.pdfgen import canvas as pdf_canvas |
|
|
|
img = Image.open(screenshot_path) |
|
width, height = img.size |
|
c = pdf_canvas.Canvas(save_path, pagesize=(width, height)) |
|
c.drawImage(screenshot_path, 0, 0, width, height) |
|
c.save() |
|
|
|
os.remove(screenshot_path) |
|
os.rmdir(temp_dir) |
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 0: |
|
return save_path |
|
return None |
|
|
|
|
|
logger.info(f"Using multi-page capture approach for {total_pages} pages") |
|
|
|
|
|
|
|
current_page_text = await page.evaluate(""" |
|
() => { |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
return pageCounters[0].textContent || ''; |
|
} |
|
return ''; |
|
} |
|
""") |
|
|
|
current_page = 1 |
|
if current_page_text: |
|
match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text) |
|
if match: |
|
current_page = int(match.group(1)) |
|
|
|
|
|
if current_page > 1: |
|
logger.info(f"Currently on page {current_page}, navigating back to page 1") |
|
|
|
|
|
page_input = await page.query_selector('input[aria-label="Page"]') |
|
if page_input: |
|
await page_input.fill("1") |
|
await page_input.press("Enter") |
|
await page.wait_for_timeout(1000) |
|
else: |
|
|
|
prev_button = await page.query_selector('button[aria-label="Previous page"]') |
|
if prev_button: |
|
|
|
for _ in range(current_page - 1): |
|
try: |
|
await prev_button.click() |
|
await page.wait_for_timeout(500) |
|
except Exception as e: |
|
logger.warning(f"Error clicking prev button: {e}") |
|
break |
|
|
|
|
|
screenshots = [] |
|
page_num = 1 |
|
max_tries = min(total_pages + 10, 200) |
|
next_button = await page.query_selector('button[aria-label="Next page"]') |
|
|
|
|
|
await page.evaluate(""" |
|
() => { |
|
// Try to find and click any "full page" or "maximize" buttons |
|
const fullViewButtons = Array.from(document.querySelectorAll('button')) |
|
.filter(b => b.textContent?.includes('Full') || |
|
b.getAttribute('aria-label')?.includes('Full') || |
|
b.getAttribute('aria-label')?.includes('fit page')); |
|
if (fullViewButtons.length > 0) { |
|
fullViewButtons[0].click(); |
|
} |
|
} |
|
""") |
|
|
|
await page.wait_for_timeout(1000) |
|
|
|
while page_num <= max_tries: |
|
|
|
await page.wait_for_timeout(800) |
|
|
|
|
|
screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png") |
|
|
|
|
|
page_content = await page.query_selector('.drive-viewer-paginated-page') |
|
if page_content: |
|
|
|
await page_content.screenshot(path=screenshot_path) |
|
else: |
|
|
|
await page.screenshot(path=screenshot_path) |
|
|
|
screenshots.append(screenshot_path) |
|
logger.info(f"Captured page {page_num}") |
|
|
|
|
|
if next_button: |
|
is_disabled = await next_button.get_attribute('disabled') |
|
if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True: |
|
logger.info(f"Reached end of document after {page_num} pages") |
|
break |
|
|
|
|
|
try: |
|
await next_button.click() |
|
await page.wait_for_timeout(800) |
|
page_num += 1 |
|
except Exception as e: |
|
logger.error(f"Error clicking next button: {e}") |
|
|
|
next_button = await page.query_selector('button[aria-label="Next page"]') |
|
if not next_button: |
|
logger.warning("Next button disappeared, assuming end of document") |
|
break |
|
else: |
|
|
|
next_button = await page.query_selector('button[aria-label="Next page"]') |
|
if not next_button: |
|
logger.warning("Could not find next button, stopping navigation") |
|
break |
|
|
|
|
|
if page_num >= total_pages: |
|
logger.info(f"Reached expected total of {total_pages} pages") |
|
break |
|
|
|
|
|
logger.info(f"Creating PDF from {len(screenshots)} captured pages") |
|
|
|
from PIL import Image |
|
from reportlab.lib.pagesizes import letter |
|
from reportlab.pdfgen import canvas as pdf_canvas |
|
|
|
|
|
if screenshots: |
|
try: |
|
img = Image.open(screenshots[0]) |
|
width, height = img.size |
|
|
|
c = pdf_canvas.Canvas(save_path, pagesize=(width, height)) |
|
|
|
for screenshot in screenshots: |
|
try: |
|
if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100: |
|
img = Image.open(screenshot) |
|
c.drawImage(screenshot, 0, 0, width, height) |
|
c.showPage() |
|
except Exception as e: |
|
logger.error(f"Error adding page to PDF: {e}") |
|
|
|
c.save() |
|
|
|
|
|
for screenshot in screenshots: |
|
if os.path.exists(screenshot): |
|
os.remove(screenshot) |
|
|
|
logger.info(f"Successfully created PDF with {len(screenshots)} pages") |
|
except Exception as e: |
|
logger.error(f"Error creating PDF: {e}") |
|
else: |
|
logger.error("No screenshots captured to create PDF") |
|
else: |
|
|
|
screenshot_path = os.path.join(temp_dir, "file.png") |
|
await page.screenshot(path=screenshot_path) |
|
|
|
if file_type.lower() in ['doc', 'docx', 'xlsx', 'pptx']: |
|
|
|
await self.export_google_doc(file_id, file_type, save_path) |
|
else: |
|
|
|
shutil.copy(screenshot_path, save_path) |
|
|
|
os.remove(screenshot_path) |
|
|
|
|
|
try: |
|
os.rmdir(temp_dir) |
|
except: |
|
pass |
|
|
|
|
|
await browser.close() |
|
|
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 1000: |
|
logger.info(f"Successfully downloaded file to {save_path}") |
|
return save_path |
|
else: |
|
logger.error(f"Generated file is too small or missing: {save_path}") |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Error during force download: {e}") |
|
if browser: |
|
await browser.close() |
|
return None |
|
|
|
except Exception as e: |
|
logger.error(f"Force download preparation failed: {e}") |
|
return None |
|
|
|
async def download_from_google_drive(self, url, save_path): |
|
"""Enhanced method to download from Google Drive with multiple fallback approaches""" |
|
|
|
file_id = None |
|
url_patterns = [ |
|
r'drive\.google\.com/file/d/([^/]+)', |
|
r'drive\.google\.com/open\?id=([^&]+)', |
|
r'docs\.google\.com/\w+/d/([^/]+)', |
|
r'id=([^&]+)', |
|
r'drive\.google\.com/uc\?id=([^&]+)', |
|
] |
|
|
|
for pattern in url_patterns: |
|
match = re.search(pattern, url) |
|
if match: |
|
file_id = match.group(1) |
|
break |
|
|
|
if not file_id: |
|
logger.error(f"Could not extract file ID from URL: {url}") |
|
return False |
|
|
|
|
|
file_type, is_view_only = await self.get_google_drive_file_info(file_id) |
|
logger.info(f"Google Drive file type: {file_type}, View-only: {is_view_only}") |
|
|
|
base, ext = os.path.splitext(save_path) |
|
if not ext and file_type: |
|
|
|
save_path = f"{base}.{file_type}" |
|
|
|
|
|
if is_view_only: |
|
|
|
if file_type == 'pdf': |
|
success = await self.download_viewonly_pdf_with_js(file_id, save_path) |
|
if success: |
|
return True |
|
|
|
|
|
if file_type in ['doc', 'docx', 'sheet', 'ppt', 'xlsx', 'pptx']: |
|
success = await self.export_google_doc(file_id, file_type, save_path) |
|
if success: |
|
return True |
|
|
|
|
|
success = await self.download_viewonly_with_screenshots(file_id, save_path, file_type) |
|
if success: |
|
return True |
|
|
|
|
|
try: |
|
|
|
import gdown |
|
output = gdown.download(f"https://drive.google.com/uc?id={file_id}", save_path, quiet=False, fuzzy=True) |
|
if output and os.path.exists(save_path) and os.path.getsize(save_path) > 0: |
|
with open(save_path, 'rb') as f: |
|
content = f.read(100) |
|
if b'<!DOCTYPE html>' not in content: |
|
logger.info(f"Successfully downloaded with gdown: {url}") |
|
return True |
|
except Exception as e: |
|
logger.warning(f"gdown download failed: {e}") |
|
|
|
|
|
try: |
|
session = requests.Session() |
|
session.headers.update({'User-Agent': get_random_user_agent()}) |
|
|
|
|
|
session.get(f"https://drive.google.com/file/d/{file_id}/view", timeout=30) |
|
|
|
|
|
url = f"https://drive.google.com/uc?id={file_id}&export=download" |
|
response = session.get(url, stream=True, timeout=30) |
|
|
|
|
|
confirmation_token = None |
|
for k, v in response.cookies.items(): |
|
if k.startswith('download_warning'): |
|
confirmation_token = v |
|
break |
|
|
|
|
|
if confirmation_token: |
|
url = f"{url}&confirm={confirmation_token}" |
|
response = session.get(url, stream=True, timeout=60) |
|
|
|
|
|
content_type = response.headers.get('Content-Type', '') |
|
if 'text/html' in content_type: |
|
logger.warning("Received HTML instead of file - likely download restriction") |
|
else: |
|
with open(save_path, 'wb') as f: |
|
for chunk in response.iter_content(chunk_size=1024*1024): |
|
if chunk: |
|
f.write(chunk) |
|
|
|
if os.path.exists(save_path) and os.path.getsize(save_path) > 0: |
|
with open(save_path, 'rb') as f: |
|
content = f.read(100) |
|
if b'<!DOCTYPE html>' not in content: |
|
logger.info("Successfully downloaded with requests session") |
|
return True |
|
except Exception as e: |
|
logger.warning(f"Requests session download failed: {e}") |
|
|
|
logger.warning("Standard download methods failed") |
|
return False |
|
|
|
async def download_viewonly_pdf_with_js(self, file_id, save_path): |
|
"""Download view-only PDF using the enhanced blob image caching technique""" |
|
try: |
|
|
|
browser = await self.playwright.chromium.launch( |
|
headless=True, |
|
args=[ |
|
'--no-sandbox', |
|
'--disable-setuid-sandbox', |
|
'--disable-dev-shm-usage', |
|
'--disable-web-security' |
|
] |
|
) |
|
|
|
context = await browser.new_context( |
|
viewport={'width': 1600, 'height': 1200}, |
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", |
|
accept_downloads=True |
|
) |
|
|
|
page = await context.new_page() |
|
|
|
try: |
|
|
|
logger.info(f"Opening view-only PDF: https://drive.google.com/file/d/{file_id}/view") |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=60000) |
|
await page.wait_for_load_state('networkidle') |
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
estimated_pages = await page.evaluate(""" |
|
() => { |
|
// Look for page counter in the interface |
|
const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => { |
|
const text = el.textContent || ''; |
|
return /\\d+\\s*\\/\\s*\\d+/.test(text); |
|
}); |
|
|
|
if (pageCounters.length > 0) { |
|
const text = pageCounters[0].textContent || ''; |
|
const match = text.match(/(\\d+)\\s*\\/\\s*(\\d+)/); |
|
if (match && match[2]) return parseInt(match[2]); |
|
} |
|
|
|
// If we can't find a counter, check actual pages |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pages.length > 0) return pages.length; |
|
|
|
// Default to a reasonable number if we can't determine |
|
return 50; |
|
} |
|
""") |
|
|
|
logger.info(f"Estimated number of pages: {estimated_pages}") |
|
|
|
|
|
logger.info("Initial scroll to bottom to trigger lazy loading...") |
|
await page.keyboard.press("End") |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
logger.info("Waiting for all pages to load...") |
|
max_attempts = min(estimated_pages * 3, 300) |
|
attempt = 0 |
|
|
|
while attempt < max_attempts: |
|
|
|
blob_count = await page.evaluate(""" |
|
Array.from(document.getElementsByTagName('img')) |
|
.filter(img => img.src.startsWith('blob:') && img.width > 100) |
|
.length |
|
""") |
|
|
|
logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images") |
|
|
|
|
|
if blob_count >= estimated_pages: |
|
logger.info("All pages appear to be loaded.") |
|
break |
|
|
|
|
|
await page.keyboard.press("PageDown") |
|
await page.wait_for_timeout(2000) |
|
attempt += 1 |
|
|
|
|
|
await page.wait_for_timeout(5000) |
|
|
|
|
|
                # Register the download listener before the in-page script triggers pdf.save();
                # otherwise the event can fire before anything is waiting for it.
                download_promise = asyncio.ensure_future(page.wait_for_event("download"))
|
|
|
|
|
logger.info("Generating PDF from loaded pages...") |
|
result = await page.evaluate(r''' |
|
(function() { |
|
return new Promise((resolve, reject) => { |
|
let script = document.createElement("script"); |
|
script.onload = function () { |
|
try { |
|
let pdf = new jsPDF(); |
|
let imgs = document.getElementsByTagName("img"); |
|
let added = 0; |
|
|
|
// First collect and sort all valid blob images |
|
let validImages = []; |
|
for (let i = 0; i < imgs.length; i++) { |
|
let img = imgs[i]; |
|
if (!/^blob:/.test(img.src)) continue; |
|
if (img.width < 100 || img.height < 100) continue; |
|
validImages.push(img); |
|
} |
|
|
|
// Sort by vertical position |
|
validImages.sort((a, b) => { |
|
const rectA = a.getBoundingClientRect(); |
|
const rectB = b.getBoundingClientRect(); |
|
return rectA.top - rectB.top; |
|
}); |
|
|
|
console.log(`Found ${validImages.length} valid page images to add to PDF`); |
|
|
|
// Process each image as a page |
|
for (let i = 0; i < validImages.length; i++) { |
|
let img = validImages[i]; |
|
let canvas = document.createElement("canvas"); |
|
let ctx = canvas.getContext("2d"); |
|
canvas.width = img.width; |
|
canvas.height = img.height; |
|
ctx.drawImage(img, 0, 0, img.width, img.height); |
|
let imgData = canvas.toDataURL("image/jpeg", 1.0); |
|
|
|
if (added > 0) { |
|
pdf.addPage(); |
|
} |
|
|
|
pdf.addImage(imgData, 'JPEG', 0, 0); |
|
added++; |
|
} |
|
|
|
pdf.save("download.pdf"); |
|
resolve({success: true, pageCount: added}); |
|
} catch (error) { |
|
reject({success: false, error: error.toString()}); |
|
} |
|
}; |
|
|
|
script.onerror = function() { |
|
reject({success: false, error: "Failed to load jsPDF library"}); |
|
}; |
|
|
|
// Use a reliable CDN |
|
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js'; |
|
document.body.appendChild(script); |
|
}); |
|
})(); |
|
''') |
|
|
|
if not result.get('success'): |
|
logger.error(f"Error in PDF generation: {result.get('error')}") |
|
return False |
|
|
|
logger.info(f"PDF generation triggered with {result.get('pageCount')} pages") |
|
|
|
|
|
download = await download_promise |
|
|
|
|
|
await download.save_as(save_path) |
|
logger.info(f"Successfully saved PDF to {save_path}") |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 1000 |
|
|
|
finally: |
|
await browser.close() |
|
|
|
except Exception as e: |
|
logger.error(f"Error in viewonly PDF download process: {e}") |
|
return False |
|
|
|
async def download_viewonly_with_screenshots(self, file_id, save_path, file_type): |
|
"""Download any view-only file by taking screenshots""" |
|
try: |
|
async with self.context.new_page() as page: |
|
|
|
await page.set_viewport_size({"width": 1600, "height": 1200}) |
|
|
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle', timeout=60000) |
|
|
|
|
|
await page.wait_for_load_state('networkidle') |
|
await page.wait_for_timeout(3000) |
|
|
|
|
|
base_dir = os.path.dirname(save_path) |
|
base_name = os.path.splitext(os.path.basename(save_path))[0] |
|
screenshots_dir = os.path.join(base_dir, f"{base_name}_screenshots") |
|
os.makedirs(screenshots_dir, exist_ok=True) |
|
|
|
|
|
is_multi_page = await page.evaluate(""" |
|
() => { |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
return pages.length > 1; |
|
} |
|
""") |
|
|
|
if is_multi_page and file_type == 'pdf': |
|
|
|
page_count = await page.evaluate(""" |
|
async () => { |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
const container = document.querySelector('.drive-viewer-paginated-scrollable'); |
|
|
|
if (!container || pages.length === 0) return 0; |
|
|
|
// Scroll through to make sure all pages are loaded |
|
const scrollHeight = container.scrollHeight; |
|
const viewportHeight = container.clientHeight; |
|
const scrollStep = viewportHeight; |
|
|
|
for (let scrollPos = 0; scrollPos < scrollHeight; scrollPos += scrollStep) { |
|
container.scrollTo(0, scrollPos); |
|
await delay(300); |
|
} |
|
|
|
// Scroll back to top |
|
container.scrollTo(0, 0); |
|
await delay(300); |
|
|
|
return pages.length; |
|
} |
|
""") |
|
|
|
logger.info(f"Found {page_count} pages in document") |
|
|
|
|
|
screenshots = [] |
|
for i in range(page_count): |
|
|
|
await page.evaluate(f""" |
|
async () => {{ |
|
const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); |
|
const pages = document.querySelectorAll('.drive-viewer-paginated-page'); |
|
if (pages.length <= {i}) return false; |
|
|
|
pages[{i}].scrollIntoView(); |
|
await delay(500); |
|
return true; |
|
}} |
|
""") |
|
|
|
|
|
screenshot_path = os.path.join(screenshots_dir, f"page_{i+1}.png") |
|
await page.screenshot(path=screenshot_path, clip={ |
|
'x': 0, |
|
'y': 0, |
|
'width': 1600, |
|
'height': 1200 |
|
}) |
|
screenshots.append(screenshot_path) |
|
|
|
|
|
from PIL import Image |
|
from reportlab.pdfgen import canvas |
|
|
|
c = canvas.Canvas(save_path) |
|
for screenshot in screenshots: |
|
img = Image.open(screenshot) |
|
width, height = img.size |
|
|
|
|
|
c.setPageSize((width, height)) |
|
c.drawImage(screenshot, 0, 0, width, height) |
|
c.showPage() |
|
|
|
c.save() |
|
|
|
|
|
for screenshot in screenshots: |
|
os.remove(screenshot) |
|
os.rmdir(screenshots_dir) |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
else: |
|
|
|
screenshot_path = os.path.join(screenshots_dir, "screenshot.png") |
|
                    await page.screenshot(path=screenshot_path, full_page=True)
|
|
|
|
|
if file_type == 'pdf': |
|
from PIL import Image |
|
from reportlab.pdfgen import canvas |
|
|
|
|
|
img = Image.open(screenshot_path) |
|
width, height = img.size |
|
|
|
c = canvas.Canvas(save_path, pagesize=(width, height)) |
|
c.drawImage(screenshot_path, 0, 0, width, height) |
|
c.save() |
|
else: |
|
|
|
shutil.copy(screenshot_path, save_path) |
|
|
|
|
|
os.remove(screenshot_path) |
|
os.rmdir(screenshots_dir) |
|
|
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
|
|
except Exception as e: |
|
logger.error(f"Error taking screenshots: {e}") |
|
return False |
|
|
|
async def export_google_doc(self, file_id, file_type, save_path): |
|
"""Export Google Docs/Sheets/Slides to downloadable formats""" |
|
try: |
|
|
|
export_formats = { |
|
'doc': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |
|
'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |
|
'sheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
|
'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', |
|
'ppt': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', |
|
'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', |
|
'pdf': 'application/pdf', |
|
} |
|
|
|
export_format = export_formats.get(file_type, 'application/pdf') |
|
export_url = f"https://docs.google.com/document/d/{file_id}/export?format={file_type}" |
|
|
|
if 'sheet' in file_type or 'xlsx' in file_type: |
|
export_url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" |
|
elif 'ppt' in file_type or 'presentation' in file_type: |
|
export_url = f"https://docs.google.com/presentation/d/{file_id}/export/pptx" |
|
elif file_type == 'pdf': |
|
export_url = f"https://docs.google.com/document/d/{file_id}/export?format=pdf" |
|
|
|
async with self.context.new_page() as page: |
|
|
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", wait_until='networkidle') |
|
|
|
|
|
response = await page.goto(export_url, wait_until='networkidle') |
|
|
|
if response.status == 200: |
|
content = await response.body() |
|
with open(save_path, 'wb') as f: |
|
f.write(content) |
|
return os.path.exists(save_path) and os.path.getsize(save_path) > 0 |
|
else: |
|
logger.warning(f"Export failed with status {response.status}") |
|
return False |
|
|
|
except Exception as e: |
|
logger.error(f"Error exporting Google Doc: {e}") |
|
return False |
|
|
|
async def get_google_drive_file_info(self, file_id): |
|
"""Get file type and view-only status from Google Drive""" |
|
file_type = None |
|
is_view_only = False |
|
|
|
try: |
|
async with self.context.new_page() as page: |
|
await page.goto(f"https://drive.google.com/file/d/{file_id}/view", timeout=30000) |
|
|
|
|
|
view_only_text = await page.query_selector('text="the owner has not granted you permission to download this file"') |
|
is_view_only = view_only_text is not None |
|
|
|
|
|
gdocs_viewer = await page.query_selector('iframe[src*="docs.google.com/document"]') |
|
gsheets_viewer = await page.query_selector('iframe[src*="docs.google.com/spreadsheets"]') |
|
gslides_viewer = await page.query_selector('iframe[src*="docs.google.com/presentation"]') |
|
|
|
if gdocs_viewer: |
|
file_type = 'docx' |
|
elif gsheets_viewer: |
|
file_type = 'xlsx' |
|
elif gslides_viewer: |
|
file_type = 'pptx' |
|
else: |
|
|
|
pdf_viewer = await page.query_selector('embed[type="application/pdf"]') |
|
if pdf_viewer: |
|
file_type = 'pdf' |
|
else: |
|
|
|
img_viewer = await page.query_selector('img[src*="googleusercontent.com"]') |
|
if img_viewer: |
|
|
|
img_src = await img_viewer.get_attribute('src') |
|
if 'jpg' in img_src or 'jpeg' in img_src: |
|
file_type = 'jpg' |
|
elif 'png' in img_src: |
|
file_type = 'png' |
|
else: |
|
file_type = 'jpg' |
|
else: |
|
|
|
file_type = 'pdf' |
|
|
|
|
|
if not file_type: |
|
title_element = await page.query_selector('div[role="heading"]') |
|
if title_element: |
|
title = await title_element.text_content() |
|
if title: |
|
ext_match = re.search(r'\.([a-zA-Z0-9]+)$', title) |
|
if ext_match: |
|
file_type = ext_match.group(1).lower() |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting Google Drive file info: {e}") |
|
file_type = 'pdf' |
|
|
|
return file_type, is_view_only |
|
|
|
async def get_sublinks(self, url, limit=10000): |
|
"""Enhanced method to extract sublinks from a website, including dynamic content and interactive elements""" |
|
links = set() |
|
try: |
|
logger.info(f"Fetching sublinks from: {url}") |
|
|
|
|
|
if "phsms.cloud.ncnu.edu.tw" in url or any(keyword in url.lower() for keyword in |
|
["exam", "test", "pastpaper", "eduexp"]): |
|
logger.info("Using specialized exam site sublink extraction") |
|
edu_links = await self.get_edu_exam_links(url) |
|
for link in edu_links: |
|
links.add(link) |
|
|
|
|
|
if len(links) > 5: |
|
logger.info(f"Found {len(links)} sublinks with specialized method") |
|
return list(links)[:limit] |
|
|
|
|
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
|
|
|
|
parsed_base = urlparse(url) |
|
base_url = f"{parsed_base.scheme}://{parsed_base.netloc}" |
|
path_base = os.path.dirname(parsed_base.path) |
|
|
|
|
|
is_aspnet = await self.page.evaluate(''' |
|
() => { |
|
return document.querySelector('form#aspnetForm') !== null || |
|
document.querySelector('input[name="__VIEWSTATE"]') !== null; |
|
} |
|
''') |
|
|
|
if is_aspnet: |
|
logger.info("Detected ASP.NET page, using enhanced extraction method") |
|
|
|
|
|
|
|
dropdowns = await self.page.query_selector_all('select') |
|
buttons = await self.page.query_selector_all('input[type="button"], input[type="submit"], button') |
|
|
|
|
|
for dropdown in dropdowns: |
|
try: |
|
|
|
options = await self.page.evaluate(''' |
|
(dropdown) => { |
|
return Array.from(dropdown.options).map(o => o.value); |
|
} |
|
''', dropdown) |
|
|
|
|
|
for option in options: |
|
if option: |
|
await dropdown.select_option(value=option) |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error interacting with dropdown: {e}") |
|
|
|
|
|
safe_buttons = [] |
|
for button in buttons: |
|
button_text = await button.text_content() or "" |
|
button_value = await button.get_attribute("value") or "" |
|
button_id = await button.get_attribute("id") or "" |
|
combined_text = (button_text + button_value + button_id).lower() |
|
|
|
|
|
if any(keyword in combined_text for keyword in ["delete", "remove", "cancel", "close", "logout"]): |
|
continue |
|
|
|
|
|
if any(keyword in combined_text for keyword in ["view", "show", "search", "browse", "list", "go", "display"]): |
|
safe_buttons.append(button) |
|
|
|
|
|
for button in safe_buttons[:5]: |
|
try: |
|
await button.click() |
|
await self.page.wait_for_timeout(1000) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error clicking button: {e}") |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
|
|
|
|
grid_cells = await self.page.query_selector_all('td a, tr.rgRow a, tr.rgAltRow a, .grid a, .table a') |
|
for cell in grid_cells: |
|
try: |
|
href = await cell.get_attribute('href') |
|
if href: |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links.add(full_url) |
|
except Exception as e: |
|
logger.warning(f"Error extracting grid link: {e}") |
|
|
|
|
|
postback_links = await self.page.evaluate(''' |
|
() => { |
|
const results = []; |
|
// Find elements with onclick containing __doPostBack |
|
const elements = document.querySelectorAll('*[onclick*="__doPostBack"]'); |
|
for (const el of elements) { |
|
// Extract the postback target |
|
const onclick = el.getAttribute('onclick') || ''; |
|
const match = onclick.match(/__doPostBack\\('([^']+)'.*?\\)/); |
|
if (match && match[1]) { |
|
// Get the visible text to use as description |
|
const text = el.innerText || el.textContent || 'Link'; |
|
results.push({ |
|
id: match[1], |
|
text: text.trim() |
|
}); |
|
} |
|
} |
|
return results; |
|
} |
|
''') |
|
|
|
|
|
for postback in postback_links[:10]: |
|
try: |
|
logger.info(f"Trying postback link: {postback['text']} ({postback['id']})") |
|
await self.page.evaluate(f''' |
|
() => {{ |
|
if (typeof __doPostBack === 'function') {{ |
|
__doPostBack('{postback["id"]}', ''); |
|
}} |
|
}} |
|
''') |
|
await self.page.wait_for_timeout(1500) |
|
await self.page.wait_for_load_state('networkidle', timeout=5000) |
|
|
|
|
|
await self.extract_all_link_types(links, base_url, path_base) |
|
except Exception as e: |
|
logger.warning(f"Error with postback: {e}") |
|
|
|
logger.info(f"Found {len(links)} sublinks") |
|
return list(links)[:limit] |
|
|
|
except Exception as e: |
|
logger.error(f"Error getting sublinks from {url}: {e}") |
|
return list(links)[:limit] |
|
|
|
async def extract_all_link_types(self, links_set, base_url, path_base): |
|
"""Extract all types of links from the current page""" |
|
|
|
a_links = await self.page.query_selector_all('a[href]') |
|
for a in a_links: |
|
try: |
|
href = await a.get_attribute('href') |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
iframes = await self.page.query_selector_all('iframe[src]') |
|
for iframe in iframes: |
|
try: |
|
src = await iframe.get_attribute('src') |
|
if src and not src.startswith('javascript:') and not src.startswith('about:'): |
|
full_url = src if src.startswith('http') else self.resolve_relative_url(src, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
onclick_elements = await self.page.query_selector_all('*[onclick*="window.location"], *[onclick*="document.location"]') |
|
for el in onclick_elements: |
|
try: |
|
onclick = await el.get_attribute('onclick') |
|
urls = re.findall(r'(https?://[^\'"]+)', onclick) |
|
for url in urls: |
|
links_set.add(url) |
|
except Exception: |
|
pass |
|
|
|
|
|
data_elements = await self.page.query_selector_all('*[data-url], *[data-href], *[data-src]') |
|
for el in data_elements: |
|
for attr in ['data-url', 'data-href', 'data-src']: |
|
try: |
|
value = await el.get_attribute(attr) |
|
if value and not value.startswith('javascript:'): |
|
full_url = value if value.startswith('http') else self.resolve_relative_url(value, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
|
|
special_anchors = await self.page.query_selector_all('.rgMasterTable a, .grid a, #GridView1 a, #gvResults a') |
|
for anchor in special_anchors: |
|
try: |
|
href = await anchor.get_attribute('href') |
|
if href and not href.startswith('javascript:') and not href.startswith('#'): |
|
full_url = href if href.startswith('http') else self.resolve_relative_url(href, base_url, path_base) |
|
links_set.add(full_url) |
|
except Exception: |
|
pass |
|
|
|
def resolve_relative_url(self, relative_url, base_url, path_base): |
|
"""Properly resolve relative URLs considering multiple formats""" |
|
if relative_url.startswith('/'): |
|
|
|
return f"{base_url}{relative_url}" |
|
elif relative_url.startswith('./'): |
|
|
|
return f"{base_url}{path_base}/{relative_url[2:]}" |
|
elif relative_url.startswith('../'): |
|
|
|
parent_path = '/'.join(path_base.split('/')[:-1]) |
|
return f"{base_url}{parent_path}/{relative_url[3:]}" |
|
else: |
|
|
|
return f"{base_url}{path_base}/{relative_url}" |
|
|
|
async def deep_search(self, url, custom_ext_list=None, sublink_limit=10000, timeout=60): |
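        # Crawl the main page plus its sublinks for downloadable files, reporting progress
        # through Streamlit widgets; returns a de-duplicated list of file dicts.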
|
if not custom_ext_list: |
|
custom_ext_list = [] |
|
progress_text = st.empty() |
|
progress_bar = st.progress(0) |
|
file_count_text = st.empty() |
|
|
|
try: |
|
progress_text.text("Analyzing main page...") |
|
|
|
is_aspnet = False |
|
try: |
|
await self.page.goto(url, timeout=30000, wait_until='networkidle') |
|
is_aspnet = await self.page.evaluate(''' |
|
() => { |
|
return document.querySelector('form#aspnetForm') !== null || |
|
document.querySelector('input[name="__VIEWSTATE"]') !== null; |
|
} |
|
''') |
|
except Exception: |
|
pass |
|
|
|
|
|
main_files = await self.extract_downloadable_files(url, custom_ext_list) |
|
initial_count = len(main_files) |
|
file_count_text.text(f"Found {initial_count} files on main page") |
|
|
|
|
|
progress_text.text("Getting sublinks...") |
|
sublinks = await self.get_sublinks(url, sublink_limit) |
|
total_links = len(sublinks) |
|
progress_text.text(f"Found {total_links} sublinks to process") |
|
|
|
if not sublinks: |
|
progress_bar.progress(1.0) |
|
return main_files |
|
|
|
|
|
all_files = main_files |
|
for i, sublink in enumerate(sublinks, 1): |
|
progress = i / total_links |
|
progress_text.text(f"Processing sublink {i}/{total_links}: {sublink}") |
|
progress_bar.progress(progress) |
|
|
|
try: |
|
|
|
sub_timeout = timeout * 2 if is_aspnet else timeout |
|
|
|
|
|
async with async_timeout(sub_timeout): |
|
sub_files = await self.extract_downloadable_files(sublink, custom_ext_list) |
|
all_files.extend(sub_files) |
|
file_count_text.text(f"Found {len(all_files)} total files") |
|
except Exception as e: |
|
logger.warning(f"Error processing sublink {sublink}: {e}") |
|
|
|
|
|
seen_urls = set() |
|
unique_files = [] |
|
for f in all_files: |
|
if f['url'] not in seen_urls: |
|
seen_urls.add(f['url']) |
|
unique_files.append(f) |
|
|
|
final_count = len(unique_files) |
|
progress_text.text(f"Deep search complete!") |
|
file_count_text.text(f"Found {final_count} unique files") |
|
progress_bar.progress(1.0) |
|
return unique_files |
|
|
|
except Exception as e: |
|
logger.error(f"Deep search error: {e}") |
|
progress_text.text(f"Error during deep search: {str(e)}") |
|
return [] |
|
|
|
finally: |
|
await asyncio.sleep(2) |
|
if not st.session_state.get('keep_progress', False): |
|
progress_text.empty() |
|
progress_bar.empty() |
|
|
|
def extract_keywords(text, n=5): |
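    # Rough keyword extraction: the first n alphabetic, non-stopword tokens (requires the
    # spaCy model to have loaded successfully).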
|
doc = nlp_model(text) |
|
keywords = [token.text for token in doc if token.is_alpha and not token.is_stop][:n] |
|
return keywords |
|
|
|
def analyze_sentiment(text): |
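    # Note: the sentiment pipeline is rebuilt on every call; caching it (for example with
    # st.cache_resource, like load_models above) would avoid reloading the model each time.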
|
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english") |
|
result = sentiment_analyzer(text[:512])[0] |
|
return result['label'], result['score'] |
|
|
|
def get_file_hash(file_path): |
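    # MD5 of the whole file, useful for duplicate detection (reads the file fully into memory).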
|
hasher = hashlib.md5() |
|
with open(file_path, 'rb') as f: |
|
hasher.update(f.read()) |
|
return hasher.hexdigest() |
|
|
|
|
|
def main(): |
|
if 'initialized' not in st.session_state: |
|
st.session_state.initialized = True |
|
st.session_state.discovered_files = [] |
|
st.session_state.current_url = None |
|
st.session_state.google_creds = None |
|
st.session_state.selected_files = [] |
|
st.session_state.do_deep_search = False |
|
st.session_state.deep_search_url = None |
|
st.session_state.search_results = [] |
|
|
|
st.title("Advanced File Downloader") |
|
|
|
with st.sidebar: |
|
mode = st.radio("Select Mode", ["Manual URL", "Bing Search", "PDF Summarizer"], key="mode_select") |
|
with st.expander("Advanced Options", expanded=True): |
|
custom_extensions = st.text_input("Custom File Extensions", placeholder=".csv, .txt, .epub", key="custom_ext_input", help="Enter extensions like .csv, .txt") |
|
max_sublinks = st.number_input("Maximum Sublinks to Process", min_value=1, max_value=100000, value=10000, step=50, key="max_sublinks_input", help="Max sublinks to scan from main page") |
|
sublink_timeout = st.number_input("Search Timeout (seconds per sublink)", min_value=1, max_value=3000, value=30, step=5, key="timeout_input", help="Timeout for each sublink") |
|
use_proxy = st.checkbox("Use Proxy", key="proxy_checkbox") |
|
proxy = st.text_input("Proxy URL", placeholder="http://proxy:port", key="proxy_input") |
|
with st.expander("Google Drive Integration", expanded=False): |
|
if st.button("Start Google Sign-In", key="google_signin_btn"): |
|
auth_url = get_google_auth_url() |
|
st.markdown(f"[Click here to authorize]({auth_url})") |
|
auth_code = st.text_input("Enter authorization code", key="auth_code_input") |
|
if st.button("Complete Sign-In", key="complete_signin_btn") and auth_code: |
|
creds, msg = exchange_code_for_credentials(auth_code) |
|
st.session_state.google_creds = creds |
|
st.write(msg) |
|
|
|
if mode == "Manual URL": |
|
st.header("Manual URL Mode") |
|
url = st.text_input("Enter URL", placeholder="https://example.com", key="url_input") |
|
col1, col2 = st.columns([3, 1]) |
|
with col1: |
|
if st.button("Deep Search", use_container_width=True, key="deep_search_btn"): |
|
if url: |
|
custom_ext_list = [ext.strip().lower() for ext in custom_extensions.split(',') if ext.strip()] |
|
valid_ext_list = [ext for ext in custom_ext_list if re.match(r'^\.[a-zA-Z0-9]+$', ext)] |
|
if custom_ext_list != valid_ext_list: |
|
st.warning("Invalid extensions ignored. Use format like '.csv'.") |
|
async def run_deep_search(): |
|
async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm: |
|
files = await dm.deep_search(url, valid_ext_list, max_sublinks, sublink_timeout) |
|
return files |
|
files = asyncio.run(run_deep_search()) |
|
if files: |
|
st.session_state.discovered_files = files |
|
st.session_state.current_url = url |
|
st.success(f"Found {len(files)} files!") |
|
else: |
|
st.warning("No files found.") |
|
|
|
if st.session_state.discovered_files: |
|
files = st.session_state.discovered_files |
|
st.success(f"Found {len(files)} files!") |
|
col1, col2 = st.columns([1, 4]) |
|
with col1: |
|
if st.button("Select All", key="select_all_btn"): |
|
st.session_state.selected_files = list(range(len(files))) |
|
if st.button("Clear Selection", key="clear_selection_btn"): |
|
st.session_state.selected_files = [] |
|
selected_files = st.multiselect("Select files to download", options=list(range(len(files))), default=st.session_state.selected_files, format_func=lambda x: f"{files[x]['filename']} ({files[x]['size']})", key="file_multiselect") |
|
st.session_state.selected_files = selected_files |
|
if selected_files: |
|
col1, col2, col3, col4 = st.columns(4) |
|
with col1: |
|
download_dir = st.text_input("Download Directory", value="./downloads", key="download_dir_input") |
|
with col2: |
|
create_zip = st.checkbox("Create ZIP file", value=True, key="create_zip_checkbox") |
|
with col3: |
|
delete_after = st.checkbox("Delete after creating ZIP", key="delete_after_checkbox") |
|
with col4: |
|
upload_to_drive = st.checkbox("Upload to Google Drive", key="upload_drive_checkbox") |
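                # Download the selected files, optionally bundle them into a ZIP, and offer a Drive upload.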
|
if st.button("Download Selected", key="download_btn"): |
|
                    os.makedirs(download_dir, exist_ok=True)
|
async def download_files(): |
|
downloaded_paths = [] |
|
progress_bar = st.progress(0) |
|
status_text = st.empty() |
|
async with DownloadManager(use_proxy=use_proxy, proxy=proxy) as dm: |
|
for i, idx in enumerate(selected_files): |
|
progress = (i + 1) / len(selected_files) |
|
file_info = files[idx] |
|
status_text.text(f"Downloading {file_info['filename']}... ({i+1}/{len(selected_files)})") |
|
progress_bar.progress(progress) |
|
path = await dm.download_file(file_info, download_dir, url) |
|
if path: |
|
downloaded_paths.append(path) |
|
status_text.empty() |
|
progress_bar.empty() |
|
return downloaded_paths |
|
downloaded = asyncio.run(download_files()) |
|
if downloaded: |
|
st.success(f"Successfully downloaded {len(downloaded)} files") |
|
if create_zip: |
|
zip_path = create_zip_file(downloaded, download_dir) |
|
st.success(f"Created ZIP file: {zip_path}") |
|
with open(zip_path, "rb") as f: |
|
zip_data = f.read() |
|
st.download_button("Download ZIP", data=zip_data, file_name=os.path.basename(zip_path), mime="application/zip") |
|
if upload_to_drive and st.session_state.google_creds: |
|
drive_service = googleapiclient.discovery.build("drive", "v3", credentials=st.session_state.google_creds) |
|
folder_id = create_drive_folder(drive_service, f"Downloads_{urlparse(url).netloc}") |
|
drive_id = google_drive_upload(zip_path, st.session_state.google_creds, folder_id) |
|
if not isinstance(drive_id, str) or not drive_id.startswith("Error"): |
|
st.success(f"Uploaded to Google Drive. File ID: {drive_id}") |
|
else: |
|
st.error(drive_id) |
|
if delete_after: |
|
for path in downloaded: |
|
try: |
|
os.remove(path) |
|
except Exception as e: |
|
st.warning(f"Could not delete {path}: {e}") |
|
st.info("Deleted original files after ZIP creation") |
|
else: |
|
for path in downloaded: |
|
with open(path, "rb") as f: |
|
file_data = f.read() |
|
st.download_button(f"Download {os.path.basename(path)}", data=file_data, file_name=os.path.basename(path)) |
|
|
|
elif mode == "Bing Search": |
|
st.header("Bing Search Mode") |
|
query = st.text_input("Enter search query", key="search_query_input") |
|
num_results = st.slider("Number of results", 1, 50, 5, key="num_results_slider") |
|
if st.button("Search", key="search_btn"): |
|
if query: |
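                # The search runs inside a coroutine; results and their "Deep Search Result"
                # buttons are rendered as soon as it completes.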
|
async def run_search(): |
|
async with DownloadManager(use_proxy=use_proxy, proxy=proxy, query=query, num_results=num_results) as dm: |
|
with st.spinner("Searching..."): |
|
urls = await dm.search_bing() |
|
if urls: |
|
st.session_state.search_results = urls |
|
st.success(f"Found {len(urls)} results!") |
|
for i, url in enumerate(urls, 1): |
|
with st.expander(f"Result {i}: {url}", expanded=(i == 1)): |
|
if st.button(f"Deep Search Result {i}", key=f"deep_search_result_{i}"): |
|
st.session_state.deep_search_url = url |
|
st.session_state.do_deep_search = True |
|
else: |
|
st.warning("No search results found.") |
|
asyncio.run(run_search()) |
|
|
|
    else:  # mode == "PDF Summarizer"
|
if summarizer is None: |
|
st.error("PDF summarization is not available due to model loading errors.") |
|
else: |
|
st.header("PDF Summarizer") |
|
pdf_url = st.text_input("Enter PDF URL", key="pdf_url_input") |
|
if st.button("Summarize", key="summarize_btn"): |
|
if pdf_url: |
|
with st.spinner("Generating summary..."): |
|
try: |
|
                            response = requests.get(pdf_url, timeout=60)  # timeout added so a stalled download cannot hang the app
                            response.raise_for_status()
                            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
                            temp_pdf.write(response.content)
                            temp_pdf.close()
                            reader = PdfReader(temp_pdf.name)
                            text = " ".join([page.extract_text() or "" for page in reader.pages])
                            os.remove(temp_pdf.name)
                            # The summarizer's input window is limited, so only the first ~3000 characters are used.
                            summary = summarizer(text[:3000], max_length=200, min_length=50, do_sample=False)
|
st.write("Summary:", summary[0]['summary_text']) |
|
except Exception as e: |
|
st.error(f"Error summarizing PDF: {e}") |
|
|
|
if __name__ == "__main__": |
|
main() |