euler314 committed · verified
Commit baece32 · 1 Parent(s): 6f39f32

Update app.py

Files changed (1)
  1. app.py +234 -244
app.py CHANGED
@@ -251,6 +251,7 @@ class DownloadManager:
            logger.error(f"Error extracting real download URL: {e}")
            return url

    async def get_edu_exam_links(self, url):
        """Specialized method for educational exam websites that follows a common pattern."""
        try:
@@ -259,37 +260,72 @@ class DownloadManager:

            # Use requests for a faster initial scan
            headers = {"User-Agent": get_random_user_agent()}
-            response = requests.get(url, headers=headers, timeout=30)
-
-            if response.status_code != 200:
-                logger.warning(f"Failed to fetch page: {response.status_code}")
-                return []
-
-            # Parse with BeautifulSoup first for efficiency
-            soup = BeautifulSoup(response.text, "html.parser")
-            parsed_base = urlparse(url)
-            base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
-
-            # Look for all links
-            for a in soup.find_all("a", href=True):
-                href = a["href"]
-                full_url = urljoin(url, href)

-                # Special patterns for exam sites
-                for pattern in ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
-                                "/test/", "/download/", "/files/", "/assignments/"]:
-                    if pattern in full_url.lower():
-                        links.add(full_url)
-                        break

            # If we didn't find many links with direct approach, use Playwright for more thorough extraction
            if len(links) < 5:
                logger.info("Using browser for enhanced link extraction")
-                await self.page.goto(url, timeout=30000, wait_until='networkidle')
-
-                # Check for ASP.NET specific elements that might contain exam links
-                grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
-                if grid_elements:
                    for grid in grid_elements:
                        grid_links = await grid.query_selector_all('a[href]')
                        for a in grid_links:
@@ -297,28 +333,30 @@ class DownloadManager:
                            if href:
                                full_url = href if href.startswith('http') else urljoin(url, href)
                                links.add(full_url)
-
-                # Try clicking any controls that might reveal more exam links
-                show_buttons = await self.page.query_selector_all('input[type="button"], button')
-                for button in show_buttons:
-                    button_text = await button.text_content() or ""
-                    button_value = await button.get_attribute("value") or ""
-                    if any(keyword in (button_text + button_value).lower() for keyword in
-                           ["show", "view", "display", "list", "exam", "paper", "test"]):
-                        try:
-                            await button.click()
-                            await self.page.wait_for_timeout(1000)
-                            await self.page.wait_for_load_state('networkidle', timeout=5000)
-
-                            # Get any new links that appeared
-                            new_links = await self.page.query_selector_all('a[href]')
-                            for a in new_links:
-                                href = await a.get_attribute('href')
-                                if href:
-                                    full_url = href if href.startswith('http') else urljoin(url, href)
-                                    links.add(full_url)
-                        except Exception as e:
-                            logger.warning(f"Error clicking button: {e}")

            # Filter links to likely contain exam documents
            filtered_links = []
@@ -579,6 +617,7 @@ class DownloadManager:
            logger.error(f"Error downloading {file_url}: {e}")
            return None

    async def force_download_viewonly(self, file_info, save_path):
        """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
        try:
@@ -620,7 +659,8 @@ class DownloadManager:
            context = await browser.new_context(
                viewport={'width': 1600, 'height': 1200},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                device_scale_factor=2.0
            )

            page = await context.new_page()
@@ -637,11 +677,10 @@ class DownloadManager:

            # Special handling for PDFs
            if file_type.lower() == 'pdf':
-                # Check if there's a pagination control
-                pagination_exists = await page.query_selector('div[role="toolbar"] div[role="presentation"] div[role="presentation"]:has-text("/")')
-
-                # Try multiple methods to extract total pages
-                total_pages = await page.evaluate("""
                    () => {
                        // Method 1: Check page counter text
                        const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
@@ -663,205 +702,119 @@ class DownloadManager:
                        const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
                        if (thumbnails.length > 0) return thumbnails.length;

-                        // Fallback: conservative guess based on UI
-                        return 50; // Safe default when we can't determine
                    }
                """)

-                logger.info(f"Detected {total_pages} pages in PDF")

-                if total_pages <= 1:
-                    # Additional check - sometimes the page count detection fails
-                    # Let's double-check by looking for next/previous buttons
-                    next_button = await page.query_selector('button[aria-label="Next page"]')
-                    if next_button:
-                        disabled = await next_button.get_attribute('disabled')
-                        if not disabled:
-                            logger.info("Found next button that's not disabled, document has multiple pages")
-                            total_pages = 100  # Set a high number, we'll stop when we can't go further

-                # If we still think it's a single page, use a more direct approach
-                if total_pages <= 1:
-                    # Single page approach
-                    logger.info("Using single-page capture approach")
-
-                    # Take a screenshot of the current view (should be the full document or first page)
-                    screenshot_path = os.path.join(temp_dir, "page.png")
-
-                    # Try to screenshot just the document area if we can find it
-                    document_area = await page.query_selector('.drive-viewer-paginated-page')
-                    if document_area:
-                        await document_area.screenshot(path=screenshot_path)
-                    else:
-                        # Otherwise take a full screenshot
-                        await page.screenshot(path=screenshot_path)
-
-                    # Convert to PDF
-                    img = Image.open(screenshot_path)
-                    width, height = img.size
-                    c = canvas.Canvas(save_path, pagesize=(width, height))
-                    c.drawImage(screenshot_path, 0, 0, width, height)
-                    c.save()
-
-                    os.remove(screenshot_path)
-                    os.rmdir(temp_dir)
-
-                    if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
-                        return save_path
-                    return None

-                # Multi-page approach
-                logger.info(f"Using multi-page capture approach for {total_pages} pages")

-                # CRITICAL: We need to go to the first page first
-                # Check if we need to reset to first page
-                current_page_text = await page.evaluate("""
-                    () => {
-                        const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {
-                            const text = el.textContent || '';
-                            return /\\d+\\s*\\/\\s*\\d+/.test(text);
-                        });

-                        if (pageCounters.length > 0) {
-                            return pageCounters[0].textContent || '';
-                        }
-                        return '';
-                    }
-                """)

-                current_page = 1
-                if current_page_text:
-                    match = re.search(r'(\d+)\s*\/\s*\d+', current_page_text)
-                    if match:
-                        current_page = int(match.group(1))

-                # If we're not on page 1, go back to first page
-                if current_page > 1:
-                    logger.info(f"Currently on page {current_page}, navigating back to page 1")
-
-                    # Look for an input field where we can directly set the page number
-                    page_input = await page.query_selector('input[aria-label="Page"]')
-                    if page_input:
-                        await page_input.fill("1")
-                        await page_input.press("Enter")
-                        await page.wait_for_timeout(1000)
-                    else:
-                        # Use prev button to go back to first page
-                        prev_button = await page.query_selector('button[aria-label="Previous page"]')
-                        if prev_button:
-                            # Keep clicking until we can't anymore
-                            for _ in range(current_page - 1):
-                                try:
-                                    await prev_button.click()
-                                    await page.wait_for_timeout(500)
-                                except Exception as e:
-                                    logger.warning(f"Error clicking prev button: {e}")
-                                    break

-                # Capture each page
-                screenshots = []
-                page_num = 1
-                max_tries = min(total_pages + 10, 200)  # Set a reasonable limit
-                next_button = await page.query_selector('button[aria-label="Next page"]')

-                # Maximize the PDF view if possible
-                await page.evaluate("""
-                    () => {
-                        // Try to find and click any "full page" or "maximize" buttons
-                        const fullViewButtons = Array.from(document.querySelectorAll('button'))
-                            .filter(b => b.textContent?.includes('Full') ||
-                                         b.getAttribute('aria-label')?.includes('Full') ||
-                                         b.getAttribute('aria-label')?.includes('fit page'));
-                        if (fullViewButtons.length > 0) {
-                            fullViewButtons[0].click();
-                        }
-                    }
-                """)

-                await page.wait_for_timeout(1000)  # Wait for view to adjust

-                while page_num <= max_tries:
-                    # Wait for the page to be fully loaded
-                    await page.wait_for_timeout(800)
-
-                    # Take a screenshot of the current page
-                    screenshot_path = os.path.join(temp_dir, f"page_{page_num}.png")
-
-                    # Try different methods to identify and capture just the page content
-                    page_content = await page.query_selector('.drive-viewer-paginated-page')
-                    if page_content:
-                        # Found the specific page element
-                        await page_content.screenshot(path=screenshot_path)
-                    else:
-                        # Fall back to screenshot of visible viewport
-                        await page.screenshot(path=screenshot_path)
-
-                    screenshots.append(screenshot_path)
-                    logger.info(f"Captured page {page_num}")
-
-                    # Check if we have a disabled next button (reached the end)
-                    if next_button:
-                        is_disabled = await next_button.get_attribute('disabled')
-                        if is_disabled == 'true' or is_disabled == 'disabled' or is_disabled is True:
-                            logger.info(f"Reached end of document after {page_num} pages")
-                            break
-
-                        # Click the next button
-                        try:
-                            await next_button.click()
-                            await page.wait_for_timeout(800)  # Wait for page transition
-                            page_num += 1
-                        except Exception as e:
-                            logger.error(f"Error clicking next button: {e}")
-                            # Try to get a fresh reference to the button
-                            next_button = await page.query_selector('button[aria-label="Next page"]')
-                            if not next_button:
-                                logger.warning("Next button disappeared, assuming end of document")
-                                break
-                    else:
-                        # Try to find the next button again
-                        next_button = await page.query_selector('button[aria-label="Next page"]')
-                        if not next_button:
-                            logger.warning("Could not find next button, stopping navigation")
-                            break
-
-                    # Double-check if we've reached the expected total
-                    if page_num >= total_pages:
-                        logger.info(f"Reached expected total of {total_pages} pages")
-                        break

-                # Combine screenshots into PDF
-                logger.info(f"Creating PDF from {len(screenshots)} captured pages")

-                # Use the size of the first screenshot to set PDF dimensions
-                if screenshots:
-                    try:
-                        img = Image.open(screenshots[0])
-                        width, height = img.size
-
-                        c = canvas.Canvas(save_path, pagesize=(width, height))
-
-                        for screenshot in screenshots:
-                            try:
-                                if os.path.exists(screenshot) and os.path.getsize(screenshot) > 100:
-                                    img = Image.open(screenshot)
-                                    c.drawImage(screenshot, 0, 0, width, height)
-                                    c.showPage()
-                            except Exception as e:
-                                logger.error(f"Error adding page to PDF: {e}")
-
-                        c.save()
-
-                        # Clean up screenshots
-                        for screenshot in screenshots:
-                            if os.path.exists(screenshot):
-                                os.remove(screenshot)
-
-                        logger.info(f"Successfully created PDF with {len(screenshots)} pages")
-                    except Exception as e:
-                        logger.error(f"Error creating PDF: {e}")
-                else:
-                    logger.error("No screenshots captured to create PDF")

            else:
                # Non-PDF file handling
                screenshot_path = os.path.join(temp_dir, "file.png")
@@ -876,12 +829,6 @@ class DownloadManager:

                os.remove(screenshot_path)

-            # Clean up temp directory
-            try:
-                os.rmdir(temp_dir)
-            except:
-                pass
-
            # Close browser
            await browser.close()
 
@@ -1064,6 +1011,7 @@ class DownloadManager:
            logger.info("Waiting for all pages to load...")
            max_attempts = min(estimated_pages * 3, 300)  # Adjust based on document size
            attempt = 0

            while attempt < max_attempts:
                # Count blob images (which are the PDF pages)
@@ -1076,13 +1024,14 @@ class DownloadManager:
                logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")

                # If we've loaded enough pages or reached estimated count
-                if blob_count >= estimated_pages:
                    logger.info("All pages appear to be loaded.")
                    break

                # Press PageDown to scroll further and trigger more loading
                await page.keyboard.press("PageDown")
                await page.wait_for_timeout(2000)  # Wait for content to load
                attempt += 1

            # Extra wait to ensure everything is fully loaded
@@ -1415,6 +1364,7 @@ class DownloadManager:

        return file_type, is_view_only

    async def get_sublinks(self, url, limit=10000):
        """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
        links = set()
@@ -1979,9 +1929,49 @@ def main():
            else:
                st.warning("No files found.")

    # Add footer with attribution
    st.markdown('---')
-    st.markdown('Created by [Euler314](https://github.com/euler314)')

if __name__ == "__main__":
    main()
 
            logger.error(f"Error extracting real download URL: {e}")
            return url

+    # IMPROVED: Enhanced exam links extraction method
    async def get_edu_exam_links(self, url):
        """Specialized method for educational exam websites that follows a common pattern."""
        try:
 

            # Use requests for a faster initial scan
            headers = {"User-Agent": get_random_user_agent()}
+            try:
+                response = requests.get(url, headers=headers, timeout=30)
+
+                if response.status_code == 200:
+                    # Parse with BeautifulSoup first for efficiency
+                    soup = BeautifulSoup(response.text, "html.parser")
+                    parsed_base = urlparse(url)
+                    base_url = f"{parsed_base.scheme}://{parsed_base.netloc}"
+
+                    # Look for all links
+                    for a in soup.find_all("a", href=True):
+                        href = a["href"]
+                        full_url = urljoin(url, href)
+
+                        # Special patterns for exam sites
+                        for pattern in ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
+                                        "/test/", "/download/", "/files/", "/assignments/",
+                                        "paper_", "question_", "exam_", "test_", "past_"]:
+                            if pattern in full_url.lower():
+                                links.add(full_url)
+                                break
+            except Exception as e:
+                logger.warning(f"Request-based extraction failed: {e}")

            # If we didn't find many links with direct approach, use Playwright for more thorough extraction
            if len(links) < 5:
                logger.info("Using browser for enhanced link extraction")
+                try:
+                    await self.page.goto(url, timeout=30000, wait_until='networkidle')
+
+                    # Extract all links with Playwright
+                    page_links = await self.page.evaluate("""
+                        () => {
+                            const links = [];
+                            const anchors = document.querySelectorAll('a[href]');
+                            for (const a of anchors) {
+                                if (a.href) {
+                                    links.push({
+                                        href: a.href,
+                                        text: a.innerText || a.textContent || ''
+                                    });
+                                }
+                            }
+                            return links;
+                        }
+                    """)
+
+                    # Process extracted links
+                    for link_info in page_links:
+                        href = link_info.get('href', '')
+                        text = link_info.get('text', '').lower()
+
+                        if href:
+                            # Check for exam-related patterns in URL or link text
+                            url_patterns = ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
+                                            "/test/", "/download/", "/files/", "/assignments/",
+                                            "paper_", "question_", "exam_", "test_", "past_"]
+
+                            text_patterns = ["exam", "paper", "test", "question", "past", "download"]
+
+                            if any(pattern in href.lower() for pattern in url_patterns) or \
+                               any(pattern in text for pattern in text_patterns):
+                                links.add(href)
+
+                    # Check for ASP.NET specific elements that might contain exam links
+                    grid_elements = await self.page.query_selector_all('table.grid, .GridView, #GridView1, .rgMasterTable')
                    for grid in grid_elements:
                        grid_links = await grid.query_selector_all('a[href]')
                        for a in grid_links:

                            if href:
                                full_url = href if href.startswith('http') else urljoin(url, href)
                                links.add(full_url)
+
+                    # Try clicking any controls that might reveal more exam links
+                    buttons = await self.page.query_selector_all('input[type="button"], button')
+                    for button in buttons:
+                        button_text = await button.text_content() or ""
+                        button_value = await button.get_attribute("value") or ""
+                        if any(keyword in (button_text + button_value).lower() for keyword in
+                               ["show", "view", "display", "list", "exam", "paper", "test"]):
+                            try:
+                                await button.click()
+                                await self.page.wait_for_timeout(1000)
+                                await self.page.wait_for_load_state('networkidle', timeout=5000)
+
+                                # Get any new links that appeared
+                                new_links = await self.page.query_selector_all('a[href]')
+                                for a in new_links:
+                                    href = await a.get_attribute('href')
+                                    if href:
+                                        full_url = href if href.startswith('http') else urljoin(url, href)
+                                        links.add(full_url)
+                            except Exception as e:
+                                logger.warning(f"Error clicking button: {e}")
+                except Exception as e:
+                    logger.error(f"Browser-based extraction failed: {e}")

            # Filter links to likely contain exam documents
            filtered_links = []
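
The URL/text matching that both extraction paths now share can also be exercised on its own. Below is a minimal sketch using the same pattern lists as the new code; the helper name and module-level constants are illustrative and are not part of app.py:

# Hypothetical standalone sketch of the pattern check used above (not in app.py).
URL_PATTERNS = ["/eduexp/docs/", "/exam/", "/pastexam/", "/papers/",
                "/test/", "/download/", "/files/", "/assignments/",
                "paper_", "question_", "exam_", "test_", "past_"]
TEXT_PATTERNS = ["exam", "paper", "test", "question", "past", "download"]

def looks_like_exam_link(href: str, text: str = "") -> bool:
    """Return True when a URL or its link text matches the exam-related patterns."""
    href_l, text_l = href.lower(), text.lower()
    return any(p in href_l for p in URL_PATTERNS) or any(p in text_l for p in TEXT_PATTERNS)

print(looks_like_exam_link("https://example.edu/pastexam/2021.pdf"))    # True
print(looks_like_exam_link("https://example.edu/contact", "About us"))  # False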
 
            logger.error(f"Error downloading {file_url}: {e}")
            return None

+    # IMPROVED: Enhanced view-only document download method
    async def force_download_viewonly(self, file_info, save_path):
        """Completely rewritten method to handle view-only files reliably, especially multi-page PDFs"""
        try:
 
            context = await browser.new_context(
                viewport={'width': 1600, 'height': 1200},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                device_scale_factor=2.0,
+                accept_downloads=True  # Critical for the download workflow
            )

            page = await context.new_page()
 

            # Special handling for PDFs
            if file_type.lower() == 'pdf':
+                # Use the improved scrolling and detection approach

+                # Check if there's a pagination control to estimate pages
+                estimated_pages = await page.evaluate("""
                    () => {
                        // Method 1: Check page counter text
                        const pageCounters = Array.from(document.querySelectorAll('*')).filter(el => {

                        const thumbnails = document.querySelectorAll('.drive-viewer-paginated-thumb');
                        if (thumbnails.length > 0) return thumbnails.length;

+                        // Fallback: conservative guess
+                        return 50;
                    }
                """)

+                logger.info(f"Estimated {estimated_pages} pages in PDF")

+                # Scroll to ensure all pages are loaded
+                logger.info("Scrolling to load all PDF pages...")

+                # Initial scroll to bottom to trigger lazy loading
+                await page.keyboard.press("End")
+                await page.wait_for_timeout(3000)

+                # Scroll page by page to ensure all pages are loaded
+                max_attempts = min(estimated_pages * 3, 300)
+                attempt = 0
+                prev_blob_count = 0

+                while attempt < max_attempts:
+                    blob_count = await page.evaluate("""
+                        Array.from(document.getElementsByTagName('img'))
+                            .filter(img => img.src.startsWith('blob:') && img.width > 100)
+                            .length
+                    """)

+                    logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")
+
+                    if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10):
+                        logger.info("All pages appear to be loaded.")
+                        break
+
+                    await page.keyboard.press("PageDown")
+                    await page.wait_for_timeout(2000)
+                    prev_blob_count = blob_count
+                    attempt += 1

+                # Extra wait to ensure everything is loaded
+                await page.wait_for_timeout(5000)

+                # Set up download event listener for the PDF
+                download_promise = page.wait_for_event("download")

+                # Use jsPDF to generate PDF from loaded pages
+                logger.info("Generating PDF from loaded pages...")
+                result = await page.evaluate(r'''
+                    (function() {
+                        return new Promise((resolve, reject) => {
+                            let script = document.createElement("script");
+                            script.onload = function () {
+                                try {
+                                    let pdf = new jsPDF();
+                                    let imgs = Array.from(document.getElementsByTagName("img"))
+                                        .filter(img => img.src.startsWith('blob:') && img.width > 100)
+                                        .sort((a, b) => {
+                                            const rectA = a.getBoundingClientRect();
+                                            const rectB = b.getBoundingClientRect();
+                                            return rectA.top - rectB.top;
+                                        });
+
+                                    console.log(`Found ${imgs.length} valid page images to add to PDF`);
+
+                                    let added = 0;
+                                    for (let i = 0; i < imgs.length; i++) {
+                                        let img = imgs[i];
+                                        let canvas = document.createElement("canvas");
+                                        let ctx = canvas.getContext("2d");
+                                        canvas.width = img.width;
+                                        canvas.height = img.height;
+                                        ctx.drawImage(img, 0, 0, img.width, img.height);
+                                        let imgData = canvas.toDataURL("image/jpeg", 1.0);
+
+                                        if (added > 0) {
+                                            pdf.addPage();
+                                        }
+
+                                        pdf.addImage(imgData, 'JPEG', 0, 0);
+                                        added++;
+                                    }
+
+                                    pdf.save("download.pdf");
+                                    resolve({success: true, pageCount: added});
+                                } catch (error) {
+                                    reject({success: false, error: error.toString()});
+                                }
+                            };
+
+                            script.onerror = function() {
+                                reject({success: false, error: "Failed to load jsPDF library"});
+                            };
+
+                            script.src = 'https://cdnjs.cloudflare.com/ajax/libs/jspdf/1.5.3/jspdf.debug.js';
+                            document.body.appendChild(script);
+                        });
+                    })();
+                ''')

+                if not result.get('success', False):
+                    logger.error(f"Error in PDF generation: {result.get('error', 'Unknown error')}")
+                    return None

+                logger.info(f"PDF generation triggered with {result.get('pageCount')} pages")

+                # Wait for the download and save it
+                download = await download_promise
+                await download.save_as(save_path)

+                # Clean up temp directory
+                try:
+                    os.rmdir(temp_dir)
+                except:
+                    pass

            else:
                # Non-PDF file handling
                screenshot_path = os.path.join(temp_dir, "file.png")
 
                os.remove(screenshot_path)

            # Close browser
            await browser.close()
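
As a usage note, the Streamlit expander added in main() (further down in this diff) drives this rewritten method through asyncio. A condensed sketch of the same call pattern, assuming the DownloadManager async context manager from app.py and a valid Drive file ID (the ID and output path below are placeholders):

# Condensed usage sketch (assumes DownloadManager from app.py).
import asyncio
import os

async def fetch_viewonly(file_id: str, output_path: str):
    async with DownloadManager() as dm:
        file_info = {
            'url': f"https://drive.google.com/file/d/{file_id}/view",
            'filename': os.path.basename(output_path),
            'metadata': {'file_id': file_id, 'file_type': 'pdf', 'view_only': True}
        }
        # Returns the saved path on success, or None on failure.
        return await dm.force_download_viewonly(file_info, output_path)

# asyncio.run(fetch_viewonly("139CTPrz7jOuJRW6pL6eupH-7B4fnNRku", "./downloads/example.pdf"))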
 
 
            logger.info("Waiting for all pages to load...")
            max_attempts = min(estimated_pages * 3, 300)  # Adjust based on document size
            attempt = 0
+            prev_blob_count = 0

            while attempt < max_attempts:
                # Count blob images (which are the PDF pages)

                logger.info(f"Attempt {attempt+1}: Found {blob_count} blob images")

                # If we've loaded enough pages or reached estimated count
+                if blob_count >= estimated_pages or (blob_count > 0 and blob_count == prev_blob_count and attempt > 10):
                    logger.info("All pages appear to be loaded.")
                    break

                # Press PageDown to scroll further and trigger more loading
                await page.keyboard.press("PageDown")
                await page.wait_for_timeout(2000)  # Wait for content to load
+                prev_blob_count = blob_count
                attempt += 1

            # Extra wait to ensure everything is fully loaded
 

        return file_type, is_view_only

+    # IMPROVED: Enhanced sublink extraction method
    async def get_sublinks(self, url, limit=10000):
        """Enhanced method to extract sublinks from a website, including dynamic content and interactive elements"""
        links = set()
 
            else:
                st.warning("No files found.")

+    # Add a special section for direct Google Drive file download
+    st.markdown("---")
+    with st.expander("Download View-Only Google Drive Document", expanded=False):
+        st.write("Download protected/view-only Google Drive documents - just enter the file ID")
+        file_id = st.text_input("Google Drive File ID",
+                                placeholder="Example: 139CTPrz7jOuJRW6pL6eupH-7B4fnNRku",
+                                help="Enter the ID from the Google Drive URL (e.g., from 'drive.google.com/file/d/THIS_IS_THE_ID/view')")
+
+        if st.button("Download Document") and file_id:
+            download_dir = "./downloads"
+            os.makedirs(download_dir, exist_ok=True)
+            output_path = os.path.join(download_dir, f"gdrive_{file_id}.pdf")
+
+            with st.spinner("Downloading view-only document... (this may take a minute)"):
+                async def download_viewonly():
+                    async with DownloadManager() as dm:
+                        file_info = {
+                            'url': f"https://drive.google.com/file/d/{file_id}/view",
+                            'filename': f"gdrive_{file_id}.pdf",
+                            'metadata': {'file_id': file_id, 'file_type': 'pdf', 'view_only': True}
+                        }
+                        result_path = await dm.force_download_viewonly(file_info, output_path)
+                        return result_path
+
+                result = asyncio.run(download_viewonly())
+
+                if result:
+                    st.success("Document downloaded successfully!")
+                    with open(result, "rb") as f:
+                        file_bytes = f.read()
+
+                    st.download_button(
+                        label="Download PDF",
+                        data=file_bytes,
+                        file_name=os.path.basename(result),
+                        mime="application/pdf"
+                    )
+                else:
+                    st.error("Failed to download the document. Please check the file ID and try again.")
+
    # Add footer with attribution
    st.markdown('---')
+    st.markdown('Created by [Euler314](https://github.com/yu314-coder)')

if __name__ == "__main__":
    main()
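
The new expander asks for the bare file ID; if a user pastes a full Drive link instead, the ID can be pulled out first. A small sketch assuming the drive.google.com/file/d/<id>/view URL shape mentioned in the help text; the helper name is illustrative and not part of app.py:

import re

def extract_drive_file_id(url_or_id: str) -> str:
    """Return the file ID from a URL like drive.google.com/file/d/<id>/view, or the input unchanged."""
    match = re.search(r"/file/d/([A-Za-z0-9_-]+)", url_or_id)
    return match.group(1) if match else url_or_id

print(extract_drive_file_id("https://drive.google.com/file/d/139CTPrz7jOuJRW6pL6eupH-7B4fnNRku/view"))
# -> 139CTPrz7jOuJRW6pL6eupH-7B4fnNRku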