Update app.py
app.py
CHANGED
@@ -251,6 +251,7 @@ class DownloadManager:
             logger.error(f"Error extracting real download URL: {e}")
             return url

+    # IMPROVED: Enhanced exam links extraction method
     async def get_edu_exam_links(self, url):
         """Specialized method for educational exam websites that follows a common pattern."""
         try:
@@ -259,37 +260,72 @@ class DownloadManager:

             # Use requests for a faster initial scan
             headers = {"User-Agent": get_random_user_agent()}
-
-
-            if response.status_code != 200:
-                logger.warning(f"Failed to fetch page: {response.status_code}")
-                return []
-
-            # Parse with BeautifulSoup first for efficiency
-            soup = BeautifulSoup(response.text, "html.parser")
-            parsed_base = urlparse(url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-
-            # Look for all links
-            for a in soup.find_all("a", href=True):
-                href = a["href"]
-                full_url = urljoin(url, href)
+            try:
+                response = requests.get(url, headers=headers, timeout=30)

-
-
-
-
-
-
+                if response.status_code == 200:
+                    # Parse with BeautifulSoup first for efficiency
+                    soup = BeautifulSoup(response.text, "html.parser")
+                    parsed_base = urlparse(url)
+                    base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+                    # Look for all links
+                    for a in soup.find_all("a", href=True):
+                        href = a["href"]
+                        full_url = urljoin(url, href)
+
+                        # Special patterns for exam sites
+                        for pattern in ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
+                                        "/test/", "/download/", "/files/", "/assignments/",
+                                        "paper_", "question_", "exam_", "test_", "past_"]:
+                            if pattern in full_url.lower():
+                                links.add(full_url)
+                                break
+            except Exception as e:
+                logger.warning(f"Request-based extraction failed: {e}")

             # If we didn't find many links with direct approach, use Playwright for more thorough extraction
             if len(links) < 5:
                 logger.info("Using browser for enhanced link extraction")
-
-
-
-
-
+                try:
+                    await self.page.goto(url, timeout=30000, wait_until='networkidle')
+
+                    # Extract all links with Playwright
+                    page_links = await self.page.evaluate("""
+                        () => {
+                            const links = [];
+                            const anchors = document.querySelectorAll('a[href]');
+                            for (const a of anchors) {
+                                if (a.href) {
+                                    links.push({
+                                        href: a.href,
+                                        text: a.innerText || a.textContent || ''
+                                    });
+                                }
+                            }
+                            return links;
+                        }
+                    """)
+
+                    # Process extracted links
+                    for link_info in page_links:
+                        href = link_info.get('href', '')
+                        text = link_info.get('text', '').lower()
+
+                        if href:
+                            # Check for exam-related patterns in URL or link text
+                            url_patterns = ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
+                                            "/test/", "/download/", "/files/", "/assignments/",
+                                            "paper_", "question_", "exam_", "test_", "past_"]
+
+                            text_patterns = ["exam", "paper", "test", "question", "past", "download"]
+
+                            if any(pattern in href.lower() for pattern in url_patterns) or \
+                               any(pattern in text for pattern in text_patterns):
+                                links.add(href)
+
+                    # Check for ASP.NET specific elements that might contain exam links
+                    grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
                     for grid in grid_elements:
                         grid_links = await grid.query_selector_all('a[href]')
                         for a in grid_links:
@@ -297,28 +333,30 @@ class DownloadManager:
                             if href:
                                 full_url = href if href.startswith('http') else urljoin(url, href)
                                 links.add(full_url)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+                    # Try clicking any controls that might reveal more exam links
+                    buttons = await self.page.query_selector_all('input[type="button"], button')
+                    for button in buttons:
+                        button_text = await button.text_content() or ""
+                        button_value = await button.get_attribute("value") or ""
+                        if any(keyword in (button_text + button_value).lower() for keyword in
+                               ["show", "view", "display", "list", "exam", "paper", "test"]):
+                            try:
+                                await button.click()
+                                await self.page.wait_for_timeout(1000)
+                                await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                                # Get any new links that appeared
+                                new_links = await self.page.query_selector_all('a[href]')
+                                for a in new_links:
+                                    href = await a.get_attribute('href')
+                                    if href:
+                                        full_url = href if href.startswith('http') else urljoin(url, href)
+                                        links.add(full_url)
+                            except Exception as e:
+                                logger.warning(f"Error clicking button: {e}")
+                except Exception as e:
+                    logger.error(f"Browser-based extraction failed: {e}")

             # Filter links to likely contain exam documents
             filtered_links = []
@@ -579,6 +617,7 @@ class DownloadManager:
             logger.error(f"Error downloading {file_url}: {e}")
             return None

+    # IMPROVED: Enhanced view-only document download method
     async def force_download_viewonly(self, file_info, save_path):
         """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
         try:
@@ -620,7 +659,8 @@ class DownloadManager:
                 context = await browser.new_context(
                     viewport={'width': 1600, 'height': 1200},
                     user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                    device_scale_factor=2.0
+                    device_scale_factor=2.0,
+                    accept_downloads=True  # Critical for the download workflow
                 )

                 page = await context.new_page()
@@ -637,11 +677,10 @@ class DownloadManager:

                 # Special handling for PDFs
                 if file_type.lower() == 'pdf':
-                    #
-                    pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
+                    # Use the improved scrolling and detection approach

-                    #
-
+                    # Check if there's a pagination control to estimate pages
+                    estimated_pages = await page.evaluate("""
                         () => {
                             // Method 1: Check page counter text
                             const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
@@ -663,205 +702,119 @@ class DownloadManager:
                             const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
                             if (thumbnails.length > 0) return thumbnails.length;

-                            // Fallback: conservative guess
-                            return 50;
+                            // Fallback: conservative guess
+                            return 50;
                         }
                     """)

-                    logger.info(f"
+                    logger.info(f"Estimated {estimated_pages} pages in PDF")

-
-
-                    # Let's double-check by looking for next/previous buttons
-                    next_button = await page.query_selector('button[aria-label="Next page"]')
-                    if next_button:
-                        disabled = await next_button.get_attribute('disabled')
-                        if not disabled:
-                            logger.info("Found next button that's not disabled, document has multiple pages")
-                            total_pages = 100  # Set a high number, we'll stop when we can't go further
+                    # Scroll to ensure all pages are loaded
+                    logger.info("Scrolling to load all PDF pages...")

-                    #
-
-
-                        logger.info("Using single-page capture approach")
-
-                        # Take a screenshot of the current view (should be the full document or first page)
-                        screenshot_path = os.path.join(temp_dir, "page.png")
-
-                        # Try to screenshot just the document area if we can find it
-                        document_area = await page.query_selector('.drive-viewer-paginated-page')
-                        if document_area:
-                            await document_area.screenshot(path=screenshot_path)
-                        else:
-                            # Otherwise take a full screenshot
-                            await page.screenshot(path=screenshot_path)
-
-                        # Convert to PDF
-                        img = Image.open(screenshot_path)
-                        width, height = img.size
-                        c = canvas.Canvas(save_path, pagesize=(width, height))
-                        c.drawImage(screenshot_path, 0, 0, width, height)
-                        c.save()
-
-                        os.remove(screenshot_path)
-                        os.rmdir(temp_dir)
-
-                        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                            return save_path
-                        return None
+                    # Initial scroll to bottom to trigger lazy loading
+                    await page.keyboard.press("End")
+                    await page.wait_for_timeout(3000)

-                    #
-
+                    # Scroll page by page to ensure all pages are loaded
+                    max_attempts = min(estimated_pages * 3, 300)
+                    attempt = 0
+                    prev_blob_count = 0

-
-
-
-
-
-
-                                return /\\d+\\s*\\/\\s*\\d+/.test(text);
-                            });
+                    while attempt < max_attempts:
+                        blob_count = await page.evaluate("""
+                            Array.from(document.getElementsByTagName('img'))
+                                .filter(img => img.src.startsWith('blob:') && img.width > 100)
+                                .length
+                        """)

-
-
-
-
-
-
+                        logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
+
+                        if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10):
+                            logger.info("All pages appear to be loaded.")
+                            break
+
+                        await page.keyboard.press("PageDown")
+                        await page.wait_for_timeout(2000)
+                        prev_blob_count = blob_count
+                        attempt += 1

-
-
-                            match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
-                            if match:
-                                current_page = int(match.group(1))
+                    # Extra wait to ensure everything is loaded
+                    await page.wait_for_timeout(5000)

-                    #
-
-                        logger.info(f"Currently on page {current_page}, navigating back to page 1")
-
-                        # Look for an input field where we can directly set the page number
-                        page_input = await page.query_selector('input[aria-label="Page"]')
-                        if page_input:
-                            await page_input.fill("1")
-                            await page_input.press("Enter")
-                            await page.wait_for_timeout(1000)
-                        else:
-                            # Use prev button to go back to first page
-                            prev_button = await page.query_selector('button[aria-label="Previous page"]')
-                            if prev_button:
-                                # Keep clicking until we can't anymore
-                                for _ in range(current_page - 1):
-                                    try:
-                                        await prev_button.click()
-                                        await page.wait_for_timeout(500)
-                                    except Exception as e:
-                                        logger.warning(f"Error clicking prev button: {e}")
-                                        break
+                    # Set up download event listener for the PDF
+                    download_promise = page.wait_for_event("download")

-                    #
-
-
-
-
+                    # Use jsPDF to generate PDF from loaded pages
+                    logger.info("Generating PDF from loaded pages...")
+                    result = await page.evaluate(r'''
+                        (function() {
+                            return new Promise((resolve, reject) => {
+                                let script = document.createElement("script");
+                                script.onload = function () {
+                                    try {
+                                        let pdf = new jsPDF();
+                                        let imgs = Array.from(document.getElementsByTagName("img"))
+                                            .filter(img => img.src.startsWith('blob:') && img.width > 100)
+                                            .sort((a, b) => {
+                                                const rectA = a.getBoundingClientRect();
+                                                const rectB = b.getBoundingClientRect();
+                                                return rectA.top - rectB.top;
+                                            });
+
+                                        console.log(`Found ${imgs.length} valid page images to add to PDF`);
+
+                                        let added = 0;
+                                        for (let i = 0; i < imgs.length; i++) {
+                                            let img = imgs[i];
+                                            let canvas = document.createElement("canvas");
+                                            let ctx = canvas.getContext("2d");
+                                            canvas.width = img.width;
+                                            canvas.height = img.height;
+                                            ctx.drawImage(img, 0, 0, img.width, img.height);
+                                            let imgData = canvas.toDataURL("image/jpeg", 1.0);
+
+                                            if (added > 0) {
+                                                pdf.addPage();
+                                            }
+
+                                            pdf.addImage(imgData, 'JPEG', 0, 0);
+                                            added++;
+                                        }
+
+                                        pdf.save("download.pdf");
+                                        resolve({success: true, pageCount: added});
+                                    } catch (error) {
+                                        reject({success: false, error: error.toString()});
+                                    }
+                                };
+
+                                script.onerror = function() {
+                                    reject({success: false, error: "Failed to load jsPDF library"});
+                                };
+
+                                script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
+                                document.body.appendChild(script);
+                            });
+                        })();
+                    ''')

-
-
-
-                        // Try to find and click any "full page" or "maximize" buttons
-                        const fullViewButtons = Array.from(document.querySelectorAll('button'))
-                            .filter(b => b.textContent?.includes('Full') ||
-                                         b.getAttribute('aria-label')?.includes('Full') ||
-                                         b.getAttribute('aria-label')?.includes('fit page'));
-                        if (fullViewButtons.length > 0) {
-                            fullViewButtons[0].click();
-                        }
-                    }
-                    """)
+                    if not result.get('success', False):
+                        logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
+                        return None

-
+                    logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")

-
-
-
-
-                        # Take a screenshot of the current page
-                        screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
-
-                        # Try different methods to identify and capture just the page content
-                        page_content = await page.query_selector('.drive-viewer-paginated-page')
-                        if page_content:
-                            # Found the specific page element
-                            await page_content.screenshot(path=screenshot_path)
-                        else:
-                            # Fall back to screenshot of visible viewport
-                            await page.screenshot(path=screenshot_path)
-
-                        screenshots.append(screenshot_path)
-                        logger.info(f"Captured page {page_num}")
-
-                        # Check if we have a disabled next button (reached the end)
-                        if next_button:
-                            is_disabled = await next_button.get_attribute('disabled')
-                            if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True:
-                                logger.info(f"Reached end of document after {page_num} pages")
-                                break
-
-                            # Click the next button
-                            try:
-                                await next_button.click()
-                                await page.wait_for_timeout(800)  # Wait for page transition
-                                page_num += 1
-                            except Exception as e:
-                                logger.error(f"Error clicking next button: {e}")
-                                # Try to get a fresh reference to the button
-                                next_button = await page.query_selector('button[aria-label="Next page"]')
-                                if not next_button:
-                                    logger.warning("Next button disappeared, assuming end of document")
-                                    break
-                        else:
-                            # Try to find the next button again
-                            next_button = await page.query_selector('button[aria-label="Next page"]')
-                            if not next_button:
-                                logger.warning("Could not find next button, stopping navigation")
-                                break
-
-                        # Double-check if we've reached the expected total
-                        if page_num >= total_pages:
-                            logger.info(f"Reached expected total of {total_pages} pages")
-                            break
+                    # Wait for the download and save it
+                    download = await download_promise
+                    await download.save_as(save_path)

-                    #
-
+                    # Clean up temp directory
+                    try:
+                        os.rmdir(temp_dir)
+                    except:
+                        pass

-                    # Use the size of the first screenshot to set PDF dimensions
-                    if screenshots:
-                        try:
-                            img = Image.open(screenshots[0])
-                            width, height = img.size
-
-                            c = canvas.Canvas(save_path, pagesize=(width, height))
-
-                            for screenshot in screenshots:
-                                try:
-                                    if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
-                                        img = Image.open(screenshot)
-                                        c.drawImage(screenshot, 0, 0, width, height)
-                                        c.showPage()
-                                except Exception as e:
-                                    logger.error(f"Error adding page to PDF: {e}")
-
-                            c.save()
-
-                            # Clean up screenshots
-                            for screenshot in screenshots:
-                                if os.path.exists(screenshot):
-                                    os.remove(screenshot)
-
-                            logger.info(f"Successfully created PDF with {len(screenshots)} pages")
-                        except Exception as e:
-                            logger.error(f"Error creating PDF: {e}")
-                    else:
-                        logger.error("No screenshots captured to create PDF")
                 else:
                     # Non-PDF file handling
                     screenshot_path = os.path.join(temp_dir, "file.png")
@@ -876,12 +829,6 @@ class DownloadManager:

                     os.remove(screenshot_path)

-                # Clean up temp directory
-                try:
-                    os.rmdir(temp_dir)
-                except:
-                    pass
-
                 # Close browser
                 await browser.close()

@@ -1064,6 +1011,7 @@ class DownloadManager:
            logger.info("Waiting for all pages to load...")
            max_attempts = min(estimated_pages * 3, 300)  # Adjust based on document size
            attempt = 0
+           prev_blob_count = 0

            while attempt < max_attempts:
                # Count blob images (which are the PDF pages)
@@ -1076,13 +1024,14 @@ class DownloadManager:
                logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")

                # If we've loaded enough pages or reached estimated count
-               if blob_count >= estimated_pages:
+               if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10):
                    logger.info("All pages appear to be loaded.")
                    break

                # Press PageDown to scroll further and trigger more loading
                await page.keyboard.press("PageDown")
                await page.wait_for_timeout(2000)  # Wait for content to load
+               prev_blob_count = blob_count
                attempt += 1

            # Extra wait to ensure everything is fully loaded
@@ -1415,6 +1364,7 @@ class DownloadManager:

        return file_type, is_view_only

+    # IMPROVED: Enhanced sublink extraction method
    async def get_sublinks(self, url, limit=10000):
        """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
        links = set()
@@ -1979,9 +1929,49 @@ def main():
        else:
            st.warning("No files found.")

+    # Add a special section for direct Google Drive file download
+    st.markdown("---")
+    with st.expander("Download View-Only Google Drive Document", expanded=False):
+        st.write("Download protected/view-only Google Drive documents - just enter the file ID")
+        file_id = st.text_input("Google Drive File ID",
+                                placeholder="Example: 139CTPrz7jOuJRW6pL6eupH-7B4fnNRku",
+                                help="Enter the ID from the Google Drive URL (e.g., from 'drive.google.com/file/d/THIS_IS_THE_ID/view')")
+
+        if st.button("Download Document") and file_id:
+            download_dir = "./downloads"
+            os.makedirs(download_dir, exist_ok=True)
+            output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
+
+            with st.spinner("Downloading view-only document... (this may take a minute)"):
+                async def download_viewonly():
+                    async with DownloadManager() as dm:
+                        file_info = {
+                            'url': f"https://drive.google.com/file/d/{file_id}/view",
+                            'filename': f"gdrive_{file_id}.pdf",
+                            'metadata': {'file_id': file_id, 'file_type': 'pdf', 'view_only': True}
+                        }
+                        result_path = await dm.force_download_viewonly(file_info, output_path)
+                        return result_path
+
+                result = asyncio.run(download_viewonly())
+
+                if result:
+                    st.success("Document downloaded successfully!")
+                    with open(result, "rb") as f:
+                        file_bytes = f.read()
+
+                    st.download_button(
+                        label="Download PDF",
+                        data=file_bytes,
+                        file_name=os.path.basename(result),
+                        mime="application/pdf"
+                    )
+                else:
+                    st.error("Failed to download the document. Please check the file ID and try again.")
+
    # Add footer with attribution
    st.markdown('---')
-   st.markdown('Created by [Euler314](https://github.com/
+   st.markdown('Created by [Euler314](https://github.com/yu314-coder)')

if __name__ == "__main__":
    main()
|